[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC PATCH v4 74/75] target/i386: convert pshuf(w, lw, hw, d), shuf(pd, ps) helpers to gvec style
From: |
Jan Bobek |
Subject: |
[Qemu-devel] [RFC PATCH v4 74/75] target/i386: convert pshuf(w, lw, hw, d), shuf(pd, ps) helpers to gvec style |
Date: |
Wed, 21 Aug 2019 13:29:50 -0400 |
Make these helpers suitable for use with tcg_gen_gvec_* functions.
Signed-off-by: Jan Bobek <address@hidden>
---
target/i386/ops_sse.h | 141 ++++++++++++++++++++++++-----------
target/i386/ops_sse_header.h | 12 +--
target/i386/translate.c | 34 ++++-----
3 files changed, 119 insertions(+), 68 deletions(-)
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 8172324e34..2e50d91a25 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -551,70 +551,123 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *a, Reg *b,
}
#if SHIFT == 0
-void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- *d = r;
+ for (intptr_t i = 0; 4 * i * sizeof(uint16_t) < oprsz; ++i) {
+ const uint16_t t0 = a->W(4 * i + ((ctrl >> 0) & 3));
+ const uint16_t t1 = a->W(4 * i + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(4 * i + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(4 * i + ((ctrl >> 6) & 3));
+
+ d->W(4 * i + 0) = t0;
+ d->W(4 * i + 1) = t1;
+ d->W(4 * i + 2) = t2;
+ d->W(4 * i + 3) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
#else
-void helper_shufps(Reg *d, Reg *s, int order)
+void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
- r.L(0) = d->L(order & 3);
- r.L(1) = d->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+ const uint16_t t0 = a->W(8 * i + ((ctrl >> 0) & 3));
+ const uint16_t t1 = a->W(8 * i + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(8 * i + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(8 * i + ((ctrl >> 6) & 3));
+
+ d->W(8 * i + 0) = t0;
+ d->W(8 * i + 1) = t1;
+ d->W(8 * i + 2) = t2;
+ d->W(8 * i + 3) = t3;
+ d->Q(2 * i + 1) = a->Q(2 * i + 1);
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
-void helper_shufpd(Reg *d, Reg *s, int order)
+void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
+
+ for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+ const uint16_t t0 = a->W(8 * i + 4 + ((ctrl >> 0) & 3));
+ const uint16_t t1 = a->W(8 * i + 4 + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(8 * i + 4 + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(8 * i + 4 + ((ctrl >> 6) & 3));
- r.Q(0) = d->Q(order & 1);
- r.Q(1) = s->Q((order >> 1) & 1);
- *d = r;
+ d->Q(2 * i + 0) = a->Q(2 * i + 0);
+ d->W(8 * i + 4) = t0;
+ d->W(8 * i + 5) = t1;
+ d->W(8 * i + 6) = t2;
+ d->W(8 * i + 7) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
-void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
+
+ for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+ const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+ const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+ const uint32_t t2 = a->L(4 * i + ((ctrl >> 4) & 3));
+ const uint32_t t3 = a->L(4 * i + ((ctrl >> 6) & 3));
+
+ d->L(4 * i + 0) = t0;
+ d->L(4 * i + 1) = t1;
+ d->L(4 * i + 2) = t2;
+ d->L(4 * i + 3) = t3;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
-void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
{
- Reg r;
-
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- r.Q(1) = s->Q(1);
- *d = r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
+
+ for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+ const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+ const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+ const uint32_t t2 = b->L(4 * i + ((ctrl >> 4) & 3));
+ const uint32_t t3 = b->L(4 * i + ((ctrl >> 6) & 3));
+
+ d->L(4 * i + 0) = t0;
+ d->L(4 * i + 1) = t1;
+ d->L(4 * i + 2) = t2;
+ d->L(4 * i + 3) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
-void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
{
- Reg r;
-
- r.Q(0) = s->Q(0);
- r.W(4) = s->W(4 + (order & 3));
- r.W(5) = s->W(4 + ((order >> 2) & 3));
- r.W(6) = s->W(4 + ((order >> 4) & 3));
- r.W(7) = s->W(4 + ((order >> 6) & 3));
- *d = r;
+ const intptr_t oprsz = simd_oprsz(desc);
+ const intptr_t maxsz = simd_maxsz(desc);
+ const uint8_t ctrl = simd_data(desc);
+
+ for (intptr_t i = 0; 2 * i * sizeof(uint64_t) < oprsz; ++i) {
+ const uint64_t t0 = a->Q(2 * i + ((ctrl >> 0) & 1));
+ const uint64_t t1 = b->Q(2 * i + ((ctrl >> 1) & 1));
+
+ d->Q(2 * i + 0) = t0;
+ d->Q(2 * i + 1) = t1;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz);
}
#endif
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index ee8bd4c1af..207d41e248 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -78,13 +78,13 @@ DEF_HELPER_4(glue(psadbw, SUFFIX), void, Reg, Reg, Reg, i32)
DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
#if SHIFT == 0
-DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, i32)
#else
-DEF_HELPER_3(shufps, void, Reg, Reg, int)
-DEF_HELPER_3(shufpd, void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, i32)
#endif
#if SHIFT == 1
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 3554086336..bb4120a848 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2763,8 +2763,6 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq },
[0xc2] = SSE_FOP(cmpeq),
- [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
- (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
/* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */
[0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
@@ -6971,22 +6969,22 @@ DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_mmx, Pq, Pq, Qq)
DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_xmm, Vdq, Vdq, Wdq)
DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vdq, Hdq, Wdq)
DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vqq, Hqq, Wqq)
-DEF_GEN_INSN3_HELPER_PPI(pshufw, pshufw_mmx, Pq, Qq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufps, shufps, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vqq, Hqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufpd, shufpd, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vqq, Hqq, Wqq, Ib)
+DEF_GEN_INSN3_GVEC(pshufw, Pq, Qq, Ib, 2i_ool, MM_OPRSZ, MM_MAXSZ, pshufw_mmx)
+DEF_GEN_INSN3_GVEC(pshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(pshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(pshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN4_GVEC(shufps, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(shufpd, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
DEF_GEN_INSN4_HELPER_EPPI(blendps, blendps_xmm, Vdq, Vdq, Wdq, Ib)
DEF_GEN_INSN4_HELPER_EPPI(vblendps, blendps_xmm, Vdq, Hdq, Wdq, Ib)
--
2.20.1
- [Qemu-devel] [RFC PATCH v4 55/75] target/i386: introduce SSE4.2 vector instructions to sse-opcode.inc.h, (continued)
- [Qemu-devel] [RFC PATCH v4 55/75] target/i386: introduce SSE4.2 vector instructions to sse-opcode.inc.h, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 57/75] target/i386: introduce AES and PCLMULQDQ code generators, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 58/75] target/i386: introduce AES and PCLMULQDQ vector instructions to sse-opcode.inc.h, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 65/75] target/i386: remove obsoleted helpers, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 49/75] target/i386: introduce SSSE3 code generators, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 70/75] target/i386: convert pavgb/pavgw helpers to gvec style, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 71/75] target/i386: convert pmuludq/pmaddwd helpers to gvec style, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 73/75] target/i386: remove obsoleted helper_mov(l, q)_mm_T0, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 68/75] target/i386: convert ps((l, r)l(w, d, q), ra(w, d)) to helpers to gvec style, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 63/75] target/i386: introduce AVX2 code generators, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 74/75] target/i386: convert pshuf(w, lw, hw, d), shuf(pd, ps) helpers to gvec style,
Jan Bobek <=
- [Qemu-devel] [RFC PATCH v4 64/75] target/i386: introduce AVX2 vector instructions to sse-opcode.inc.h, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 60/75] target/i386: introduce AVX code generators, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 72/75] target/i386: convert psadbw helper to gvec style, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 75/75] target/i386: convert pmovmskb/movmskps/movmskpd helpers to gvec style, Jan Bobek, 2019/08/21
- [Qemu-devel] [RFC PATCH v4 61/75] target/i386: introduce AVX vector instructions to sse-opcode.inc.h, Jan Bobek, 2019/08/21