[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v2 2/2] target/i386: Implement AES instructions using AArch64 counterparts
From: Ard Biesheuvel
Subject: [PATCH v2 2/2] target/i386: Implement AES instructions using AArch64 counterparts
Date: Wed, 31 May 2023 13:22:39 +0200
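When available, use the AArch64 AES instructions to implement the x86
ones. They are not a 1:1 fit: AESE/AESD apply the round key before the
byte substitution, whereas the x86 instructions apply it at the end of
the round, so the helpers pass a zero key and XOR in the round key
separately. Even so, they are considerably more efficient than the
table-based implementation, and they have no data-dependent timing.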
For a typical benchmark (Linux tcrypt mode=500), this gives a 2-3x
speedup when running on a ThunderX2 host.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
host/include/aarch64/host/cpuinfo.h | 1 +
target/i386/ops_sse.h | 69 ++++++++++++++++++++
util/cpuinfo-aarch64.c | 1 +
3 files changed, 71 insertions(+)
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index 82227890b4b4db03..05feeb4f4369fc19 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -9,6 +9,7 @@
#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
#define CPUINFO_LSE (1u << 1)
#define CPUINFO_LSE2 (1u << 2)
+#define CPUINFO_AES (1u << 3)
/* Initialized with a constructor. */
extern unsigned cpuinfo;
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index fb63af7afa21588d..db79132778efd211 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -20,6 +20,11 @@
#include "crypto/aes.h"
+#ifdef __aarch64__
+#include "host/cpuinfo.h"
+typedef uint8_t aes_vec_t __attribute__((vector_size(16)));
+#endif
+
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
@@ -2165,6 +2170,20 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesd %0.16b, %1.16b \n"
+ " aesimc %0.16b, %0.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 2 << SHIFT ; i++) {
int j = i & 3;
d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
@@ -2180,6 +2199,19 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesd %0.16b, %1.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0; i < 8 << SHIFT; i++) {
d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
}
@@ -2191,6 +2223,20 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aese %0.16b, %1.16b \n"
+ " aesmc %0.16b, %0.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 2 << SHIFT ; i++) {
int j = i & 3;
d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
@@ -2206,6 +2252,19 @@ void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aese %0.16b, %1.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0; i < 8 << SHIFT; i++) {
d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
}
@@ -2217,6 +2276,16 @@ void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
int i;
Reg tmp = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesimc %0.16b, %1.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"(*(aes_vec_t *)s));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 4 ; i++) {
d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
AES_imc[tmp.B(4 * i + 1)][1] ^
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index f99acb788454e5ab..769cdfeb2fc32d5e 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -56,6 +56,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
unsigned long hwcap = qemu_getauxval(AT_HWCAP);
info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
+ info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
#endif
#ifdef CONFIG_DARWIN
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
--
2.39.2