[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 1/2] target/arm: Use x86 intrinsics to implement PMULL.P64
From: Ard Biesheuvel
Subject: Re: [PATCH 1/2] target/arm: Use x86 intrinsics to implement PMULL.P64
Date: Thu, 1 Jun 2023 17:28:54 +0200
On Thu, 1 Jun 2023 at 15:01, Peter Maydell <peter.maydell@linaro.org> wrote:
>
> On Thu, 1 Jun 2023 at 13:33, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> > host/include/i386/host/cpuinfo.h | 1 +
> > target/arm/tcg/vec_helper.c | 26 +++++++++++++++++++-
> > util/cpuinfo-i386.c | 1 +
> > 3 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/host/include/i386/host/cpuinfo.h
> > b/host/include/i386/host/cpuinfo.h
> > index 073d0a426f31487d..cf4ced844760d28f 100644
> > --- a/host/include/i386/host/cpuinfo.h
> > +++ b/host/include/i386/host/cpuinfo.h
> > @@ -27,6 +27,7 @@
> > #define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
> > #define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
> > #define CPUINFO_AES (1u << 18)
> > +#define CPUINFO_PMULL (1u << 19)
> >
> > /* Initialized with a constructor. */
> > extern unsigned cpuinfo;
> > diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
> > index f59d3b26eacf08f8..fb422627588439b3 100644
> > --- a/target/arm/tcg/vec_helper.c
> > +++ b/target/arm/tcg/vec_helper.c
> > @@ -25,6 +25,14 @@
> > #include "qemu/int128.h"
> > #include "vec_internal.h"
> >
> > +#ifdef __x86_64__
> > +#include "host/cpuinfo.h"
> > +#include <wmmintrin.h>
> > +#define TARGET_PMULL __attribute__((__target__("pclmul")))
> > +#else
> > +#define TARGET_PMULL
> > +#endif
> > +
> > /*
> > * Data for expanding active predicate bits to bytes, for byte elements.
> > *
> > @@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void
> > *vm, uint32_t desc)
> > * Because of the lanes are not accessed in strict columns,
> > * this probably cannot be turned into a generic helper.
> > */
> > -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
> > +void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm,
> > uint32_t desc)
> > {
> > intptr_t i, j, opr_sz = simd_oprsz(desc);
> > intptr_t hi = simd_data(desc);
> > uint64_t *d = vd, *n = vn, *m = vm;
> >
> > +#ifdef __x86_64__
> > + if (cpuinfo & CPUINFO_PMULL) {
> > + switch (hi) {
> > + case 0:
> > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm,
> > *(__m128i *)vn, 0x0);
> > + break;
> > + case 1:
> > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm,
> > *(__m128i *)vn, 0x11);
> > + break;
> > + default:
> > + g_assert_not_reached();
> > + }
> > + return;
> > + }
> > +#endif
>
> This needs to cope with the input vectors being more than
> just 128 bits wide, I think. Also you probably still
> need the clear_tail() to clear any high bits of the register.
>
Ah yes, I missed that completely.