
From: Richard Henderson
Subject: Re: [PATCH 28/37] target/i386: reimplement 0x0f 0x38, add AVX
Date: Tue, 13 Sep 2022 10:31:37 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0

On 9/12/22 00:04, Paolo Bonzini wrote:
+void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    uint64_t zf = 0, cf = 0;

uint32_t, to match the size of the operation.

+    int i;
+
+    for (i = 0; i < 2 << SHIFT; i++) {
+        zf |= (s->L(i) &  d->L(i));
+        cf |= (s->L(i) & ~d->L(i));
+    }


+void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
+                                        Reg *v, Reg *s, target_ulong a0)
+{
+    int i;
+
+    for (i = 0; i < (2 << SHIFT); i++) {
+        if (v->L(i) >> 31) {
+            cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
+        }
+    }
+}
+
+void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
+                                        Reg *v, Reg *s, target_ulong a0)
+{
+    int i;
+
+    for (i = 0; i < (1 << SHIFT); i++) {
+        if (v->Q(i) >> 63) {
+            cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
+        }
+    }
+}

Any idea if hw will write incomplete data if the pieces cross page boundaries, and the second page is invalid? We're not good at that for any other vector-sized write, though, so not critical.

+void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+    int i;
+
+    for (i = 0; i < (2 << SHIFT); i++) {
+        d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
+    }
+}

This is tcg_gen_cmpsel_vec(TCG_COND_LT, d, v, zero, s, zero).
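
For reference, a sketch of the expander that suggestion points at; the function name and the GVecGen3 wiring around it (with INDEX_op_cmpsel_vec in .opt_opc) are illustrative, not from the patch:

static void gen_vpmaskmovd_vec(unsigned vece, TCGv_vec d, TCGv_vec v, TCGv_vec s)
{
    TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0);

    /* d = (v < 0) ? s : 0; the sign bit of each mask element selects.  */
    tcg_gen_cmpsel_vec(TCG_COND_LT, vece, d, v, zero, s, zero);
}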

+void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
+        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+    int i;
+    for (i = 0; i < (2 << SHIFT); i++) {
+        if (v->L(i) >> 31) {
+            target_ulong addr = a0
+                + ((target_ulong)(int32_t)s->L(i) << scale);
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+        }
+        v->L(i) = 0;
+    }
+}

Better to not modify registers until all potential #GP are raised.
Also, some missing whitespace between functions.
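
One way to follow that suggestion is to gather into a scratch Reg and only commit the destination and mask once every load has been performed, e.g. (a sketch against the quoted helper, not the final code):

void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
{
    Reg tmp;
    int i;

    /* Do all loads first: a faulting element then leaves d and v intact.  */
    for (i = 0; i < (2 << SHIFT); i++) {
        if (v->L(i) >> 31) {
            target_ulong addr = a0
                + ((target_ulong)(int32_t)s->L(i) << scale);
            tmp.L(i) = cpu_ldl_data_ra(env, addr, GETPC());
        } else {
            tmp.L(i) = d->L(i);
        }
    }

    /* Commit results and clear the mask only after all loads succeeded.  */
    for (i = 0; i < (2 << SHIFT); i++) {
        d->L(i) = tmp.L(i);
        v->L(i) = 0;
    }
}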

+    [0x2f] = X86_OP_ENTRY3(,x,  vex4 cpuid(SSE41) avx2_256 p_66),

Whee! Mailer really chomped down on this series.

@@ -384,8 +484,8 @@ static const X86OpEntry opcodes_0F3A[256] = {
     [0x0b] = X86_OP_ENTRY4(VROUNDSD,   V,x,  H,x, W,sd, vex3 cpuid(SSE41) p_66),
     [0x0c] = X86_OP_ENTRY4(VBLENDPS,   V,x,  H,x,  W,x,  vex4 cpuid(SSE41) p_66),
     [0x0d] = X86_OP_ENTRY4(VBLENDPD,   V,x,  H,x,  W,x,  vex4 cpuid(SSE41) p_66),
-    [0x0e] = X86_OP_ENTRY4(VPBLENDW,   V,x,  H,x,  W,x,  vex4 cpuid(SSE41) p_66),
-    [0x0f] = X86_OP_ENTRY4(PALIGNR,    V,x,  H,x,  W,x,  vex4 cpuid(SSSE3) mmx p_00_66),
+    [0x0e] = X86_OP_ENTRY4(VPBLENDW,   V,x,  H,x,  W,x,  vex4 cpuid(SSE41) avx2_256 p_66),
+    [0x0f] = X86_OP_ENTRY4(PALIGNR,    V,x,  H,x,  W,x,  vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),

Squash back.

+    case X86_SPECIAL_AVXExtMov:
+        if (!decode.op[2].has_ea) {
+            decode.op[2].ot = s->vex_l ? MO_256 : MO_128;
+        } else if (s->vex_l) {
+            decode.op[2].ot++;
+        }

Clever.

+BINARY_INT_SSE(VPMINSB,    pminsb)
+BINARY_INT_SSE(VPMINUW,    pminuw)
+BINARY_INT_SSE(VPMINUD,    pminud)
+BINARY_INT_SSE(VPMINSD,    pminsd)
+BINARY_INT_SSE(VPMAXSB,    pmaxsb)
+BINARY_INT_SSE(VPMAXUW,    pmaxuw)
+BINARY_INT_SSE(VPMAXUD,    pmaxud)
+BINARY_INT_SSE(VPMAXSD,    pmaxsd)

tcg_gen_gvec_{u,s}{min,max}.
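
For instance, a VPMINSB generator could be reduced to something like the following sketch (assuming the same sse_vec_len/offset plumbing used by the broadcast generators below):

static void gen_VPMINSB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = sse_vec_len(s, decode);

    /* Element-wise signed minimum of the two source vectors.  */
    tcg_gen_gvec_smin(MO_8, decode->op[0].offset, decode->op[1].offset,
                      decode->op[2].offset, vec_len, vec_len);
}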

+/* Same as above, but with extra arguments to the helper.  */
+static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
+                                SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
+{
+    SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
+    SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
+    SSEFunc_0_epppti fn = s->rex_w ? q : d;
+    TCGv_i32 scale = tcg_const_i32(decode->mem.scale);

tcg_constant_i32.
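
I.e., a sketch of the substitution (constants are interned, so any matching tcg_temp_free_i32 goes away too):

    TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);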

+static void gen_VPBROADCASTB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_ld8u_i32(s->tmp2_i32, s->ptr2, 0);
+    tcg_gen_gvec_dup_i32(MO_8, decode->op[0].offset, vec_len, vec_len, s->tmp2_i32);
+}

This is better done with tcg_gen_gvec_dup_mem, where you pass the cpu_env offset of the source data. This lets the host use mem->reg broadcast, which turns out to be more available than reg->reg broadcast.
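
Roughly, a sketch of the suggested form (reusing the offsets the decoder already computed):

static void gen_VPBROADCASTB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = sse_vec_len(s, decode);

    /* Broadcast the low byte of the source straight from its cpu_env offset.  */
    tcg_gen_gvec_dup_mem(MO_8, decode->op[0].offset, decode->op[2].offset,
                         vec_len, vec_len);
}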

+static void gen_VPBROADCASTW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPBROADCASTD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPBROADCASTQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)

Likewise.

+static inline void gen_VBROADCASTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    tcg_gen_gvec_mov(MO_64, decode->op[0].offset,
+                     decode->op[2].offset, 16, 16);
+    tcg_gen_gvec_mov(MO_64, decode->op[0].offset + offsetof(YMMReg, YMM_X(1)),
+                     decode->op[2].offset, 16, 16);
+}

tcg_gen_gvec_dup_mem(MO_128, ...);
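
That is, something along these lines (sketch only):

static void gen_VBROADCASTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    /* Duplicate the 128-bit source into both halves of the 256-bit destination.  */
    tcg_gen_gvec_dup_mem(MO_128, decode->op[0].offset, decode->op[2].offset,
                         32, 32);
}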


r~
