[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 2/2] target/i386: Implement PCLMULQDQ using AArch64 PMULL instructions
From: |
Ard Biesheuvel |
Subject: |
[PATCH 2/2] target/i386: Implement PCLMULQDQ using AArch64 PMULL instructions |
Date: |
Thu, 1 Jun 2023 14:33:32 +0200 |
Use the AArch64 PMULL{2}.P64 instructions to implement PCLMULQDQ instead
of emulating it in C code, if the host supports them. PCLMULQDQ is used in
the implementation of GCM, which is widely used in IPsec VPN and HTTPS.
Somewhat surprising results: on my ThunderX2, enabling this on top of
the AES acceleration I sent out earlier yields a substantial speedup.
(1420 bytes is a typical IPsec block size. In HTTPS, GCM operates on much
larger blocks, but the kernel mode benchmarks are not the best place
to measure its performance in that mode.)
tcrypt: testing speed of rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption
No acceleration
tcrypt: test 5 (160 bit key, 1420 byte blocks): 10046 operations in 1 seconds
(14265320 bytes)
AES acceleration
tcrypt: test 5 (160 bit key, 1420 byte blocks): 13970 operations in 1 seconds
(19837400 bytes)
AES + PMULL acceleration
tcrypt: test 5 (160 bit key, 1420 byte blocks): 24372 operations in 1 seconds
(34608240 bytes)
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
host/include/aarch64/host/cpuinfo.h | 1 +
target/i386/ops_sse.h | 24 ++++++++++++++++++++
util/cpuinfo-aarch64.c | 1 +
3 files changed, 26 insertions(+)
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index 05feeb4f4369fc19..da268dce1390cac0 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -10,6 +10,7 @@
#define CPUINFO_LSE (1u << 1)
#define CPUINFO_LSE2 (1u << 2)
#define CPUINFO_AES (1u << 3)
+#define CPUINFO_PMULL (1u << 4)
/* Initialized with a constructor. */
extern unsigned cpuinfo;
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index db79132778efd211..d7e7bd8b733122a8 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2157,6 +2157,30 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint64_t a, b;
int i;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_PMULL) {
+ aes_vec_t vv = *(aes_vec_t *)v, vs = *(aes_vec_t *)s;
+ aes_vec_t *vd = (aes_vec_t *)d;
+
+ switch (ctrl & 0x11) {
+ case 0x1:
+ asm("ext %0.16b, %0.16b, %0.16b, #8":"+w"(vv));
+ /* fallthrough */
+ case 0x0:
+ asm(".arch_extension aes\n"
+ "pmull %0.1q, %1.1d, %2.1d":"=w"(*vd):"w"(vv),"w"(vs));
+ break;
+ case 0x10:
+ asm("ext %0.16b, %0.16b, %0.16b, #8":"+w"(vv));
+ /* fallthrough */
+ case 0x11:
+ asm(".arch_extension aes\n"
+ "pmull2 %0.1q, %1.2d, %2.2d":"=w"(*vd):"w"(vv),"w"(vs));
+ }
+ return;
+ }
+#endif
+
for (i = 0; i < 1 << SHIFT; i += 2) {
a = v->Q(((ctrl & 1) != 0) + i);
b = s->Q(((ctrl & 16) != 0) + i);
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index 769cdfeb2fc32d5e..95ec1f4adfc829b9 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -57,6 +57,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
+ info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
#endif
#ifdef CONFIG_DARWIN
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
--
2.39.2