[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PULL 31/39] target/i386: Destructive vector helpers for AVX
From: |
Paolo Bonzini |
Subject: |
[PULL 31/39] target/i386: Destructive vector helpers for AVX |
Date: |
Thu, 1 Sep 2022 20:24:21 +0200 |
From: Paul Brook <paul@nowt.org>
These helpers need to take special care to avoid overwriting source values
before the wole result has been calculated. Currently they use a dummy
Reg typed variable to store the result then assign the whole register.
This will cause 128 bit operations to corrupt the upper half of the register,
so replace it with explicit temporaries and element assignments.
Signed-off-by: Paul Brook <paul@nowt.org>
Message-Id: <20220424220204.2493824-14-paul@nowt.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/ops_sse.h | 556 ++++++++++++++++++++----------------------
1 file changed, 262 insertions(+), 294 deletions(-)
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 557cc7ce7d..bb50ca6f8d 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -41,6 +41,7 @@
#endif
#define LANE_WIDTH (SHIFT ? 16 : 8)
+#define PACK_WIDTH (LANE_WIDTH / 2)
/*
* Copy the relevant parts of a Reg value around. In the case where
@@ -474,71 +475,81 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
}
#endif
+#define SHUFFLE4(F, a, b, offset) do { \
+ r0 = a->F((order & 3) + offset); \
+ r1 = a->F(((order >> 2) & 3) + offset); \
+ r2 = b->F(((order >> 4) & 3) + offset); \
+ r3 = b->F(((order >> 6) & 3) + offset); \
+ d->F(offset) = r0; \
+ d->F(offset + 1) = r1; \
+ d->F(offset + 2) = r2; \
+ d->F(offset + 3) = r3; \
+ } while (0)
+
#if SHIFT == 0
void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- MOVE(*d, r);
+ SHUFFLE4(W, s, s, 0);
}
#else
void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d;
+ uint32_t r0, r1, r2, r3;
+ int i;
- r.L(0) = d->L(order & 3);
- r.L(1) = d->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ SHUFFLE4(L, v, s, i);
+ }
}
void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d;
+ uint64_t r0, r1;
+ int i;
- r.Q(0) = d->Q(order & 1);
- r.Q(1) = s->Q((order >> 1) & 1);
- MOVE(*d, r);
+ for (i = 0; i < 1 << SHIFT; i += 2) {
+ r0 = v->Q(((order & 1) & 1) + i);
+ r1 = s->Q(((order >> 1) & 1) + i);
+ d->Q(i) = r0;
+ d->Q(i + 1) = r1;
+ order >>= 2;
+ }
}
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint32_t r0, r1, r2, r3;
+ int i;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ SHUFFLE4(L, s, s, i);
+ }
}
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
+ int i, j;
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- r.Q(1) = s->Q(1);
- MOVE(*d, r);
+ for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
+ SHUFFLE4(W, s, s, i);
+ d->Q(j) = s->Q(j);
+ }
}
void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
+ int i, j;
- r.Q(0) = s->Q(0);
- r.W(4) = s->W(4 + (order & 3));
- r.W(5) = s->W(4 + ((order >> 2) & 3));
- r.W(6) = s->W(4 + ((order >> 4) & 3));
- r.W(7) = s->W(4 + ((order >> 6) & 3));
- MOVE(*d, r);
+ for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
+ d->Q(j) = s->Q(j);
+ SHUFFLE4(W, s, s, i);
+ }
}
#endif
@@ -1091,156 +1102,132 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State
*env, Reg *s)
return val;
}
-void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satsb((int16_t)d->W(0));
- r.B(1) = satsb((int16_t)d->W(1));
- r.B(2) = satsb((int16_t)d->W(2));
- r.B(3) = satsb((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satsb((int16_t)d->W(4));
- r.B(5) = satsb((int16_t)d->W(5));
- r.B(6) = satsb((int16_t)d->W(6));
- r.B(7) = satsb((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satsb((int16_t)s->W(4));
- r.B(13) = satsb((int16_t)s->W(5));
- r.B(14) = satsb((int16_t)s->W(6));
- r.B(15) = satsb((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
+#define PACK_HELPER_B(name, F) \
+void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
+ Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int j, k; \
+ for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[k] = F((int16_t)v->W(j + k)); \
+ } \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
+ } \
+ for (k = 0; k < PACK_WIDTH * 2; k++) { \
+ d->B(2 * j + k) = r[k]; \
+ } \
+ } \
}
-void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satub((int16_t)d->W(0));
- r.B(1) = satub((int16_t)d->W(1));
- r.B(2) = satub((int16_t)d->W(2));
- r.B(3) = satub((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satub((int16_t)d->W(4));
- r.B(5) = satub((int16_t)d->W(5));
- r.B(6) = satub((int16_t)d->W(6));
- r.B(7) = satub((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satub((int16_t)s->W(4));
- r.B(13) = satub((int16_t)s->W(5));
- r.B(14) = satub((int16_t)s->W(6));
- r.B(15) = satub((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
-}
+PACK_HELPER_B(sswb, satsb)
+PACK_HELPER_B(uswb, satub)
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
- Reg r;
+ Reg *v = d;
+ uint16_t r[PACK_WIDTH];
+ int j, k;
- r.W(0) = satsw(d->L(0));
- r.W(1) = satsw(d->L(1));
-#if SHIFT == 1
- r.W(2) = satsw(d->L(2));
- r.W(3) = satsw(d->L(3));
-#endif
- r.W((2 << SHIFT) + 0) = satsw(s->L(0));
- r.W((2 << SHIFT) + 1) = satsw(s->L(1));
-#if SHIFT == 1
- r.W(6) = satsw(s->L(2));
- r.W(7) = satsw(s->L(3));
-#endif
- MOVE(*d, r);
+ for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[k] = satsw(v->L(j + k));
+ }
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
+ }
+ for (k = 0; k < PACK_WIDTH; k++) {
+ d->W(2 * j + k) = r[k];
+ }
+ }
}
#define UNPCK_OP(base_name, base) \
\
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int j, i; \
\
- r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
- r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
- r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
- r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
- r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
- r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
- r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
- r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
- XMM_ONLY( \
- r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
- r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
- r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
- r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
- r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
- r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
- r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
- r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 8 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH; \
+ for (i = 0; i < PACK_WIDTH; i++) { \
+ r[2 * i] = v->B(k + i); \
+ r[2 * i + 1] = s->B(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
+ d->B(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint16_t r[PACK_WIDTH]; \
+ int j, i; \
\
- r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
- r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
- r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
- r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
- XMM_ONLY( \
- r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
- r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
- r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
- r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 4 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH / 2; \
+ for (i = 0; i < PACK_WIDTH / 2; i++) { \
+ r[2 * i] = v->W(k + i); \
+ r[2 * i + 1] = s->W(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH; i++, j++) { \
+ d->W(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint32_t r[PACK_WIDTH / 2]; \
+ int j, i; \
\
- r.L(0) = d->L((base << SHIFT) + 0); \
- r.L(1) = s->L((base << SHIFT) + 0); \
- XMM_ONLY( \
- r.L(2) = d->L((base << SHIFT) + 1); \
- r.L(3) = s->L((base << SHIFT) + 1); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 2 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH / 4; \
+ for (i = 0; i < PACK_WIDTH / 4; i++) { \
+ r[2 * i] = v->L(k + i); \
+ r[2 * i + 1] = s->L(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
+ d->L(j) = r[i]; \
+ } \
+ } \
} \
\
XMM_ONLY( \
- \
+ CPUX86State *env, Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint64_t r[2]; \
+ int i; \
\
- r.Q(0) = d->Q(base); \
- r.Q(1) = s->Q(base); \
- MOVE(*d, r); \
+ for (i = 0; i < 1 << SHIFT; i += 2) { \
+ r[0] = v->Q(base + i); \
+ r[1] = s->Q(base + i); \
+ d->Q(i) = r[0]; \
+ d->Q(i + 1) = r[1]; \
+ } \
} \
)
UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
+#undef PACK_WIDTH
+#undef PACK_HELPER_B
+#undef UNPCK_OP
+
+
/* 3DNow! float ops */
#if SHIFT == 0
void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1393,122 +1380,86 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg
*s)
/* SSSE3 op helpers */
void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
+ Reg *v = d;
int i;
- Reg r;
+#if SHIFT == 0
+ uint8_t r[8];
- for (i = 0; i < (8 << SHIFT); i++) {
- r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
+ for (i = 0; i < 8; i++) {
+ r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
}
+ for (i = 0; i < 8; i++) {
+ d->B(i) = r[i];
+ }
+#else
+ uint8_t r[8 << SHIFT];
- MOVE(*d, r);
-}
-
-void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
-
- Reg r;
-
- r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
- r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
- XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
- XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
- r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
- r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
- XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
- XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
-
- MOVE(*d, r);
-}
-
-void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
- XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
- r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
- XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
-
- MOVE(*d, r);
-}
-
-void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
- r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
- XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
- XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
- r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
- r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
- XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
- XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
-
- MOVE(*d, r);
-}
-
-void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
- (int8_t)s->B(1) * (uint8_t)d->B(1));
- d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
- (int8_t)s->B(3) * (uint8_t)d->B(3));
- d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
- (int8_t)s->B(5) * (uint8_t)d->B(5));
- d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
- (int8_t)s->B(7) * (uint8_t)d->B(7));
-#if SHIFT == 1
- d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
- (int8_t)s->B(9) * (uint8_t)d->B(9));
- d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
- (int8_t)s->B(11) * (uint8_t)d->B(11));
- d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
- (int8_t)s->B(13) * (uint8_t)d->B(13));
- d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
- (int8_t)s->B(15) * (uint8_t)d->B(15));
+ for (i = 0; i < 8 << SHIFT; i++) {
+ int j = i & ~0xf;
+ r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
+ }
+ for (i = 0; i < 8 << SHIFT; i++) {
+ d->B(i) = r[i];
+ }
#endif
}
-void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
- r.W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
- XMM_ONLY(r.W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
- XMM_ONLY(r.W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
- r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
- r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
- XMM_ONLY(r.W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
- XMM_ONLY(r.W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
- MOVE(*d, r);
+#define SSE_HELPER_HW(name, F) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint16_t r[4 << SHIFT]; \
+ int i, j, k; \
+ for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
+ for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
+ r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
+ } \
+ for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
+ r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
+ } \
+ } \
+ for (i = 0; i < 4 << SHIFT; i++) { \
+ d->W(i) = r[i]; \
+ } \
}
-void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
- XMM_ONLY(r.L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
- r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
- XMM_ONLY(r.L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
- MOVE(*d, r);
+#define SSE_HELPER_HL(name, F) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint32_t r[2 << SHIFT]; \
+ int i, j, k; \
+ for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
+ for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
+ r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
+ } \
+ for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
+ r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
+ } \
+ } \
+ for (i = 0; i < 2 << SHIFT; i++) { \
+ d->L(i) = r[i]; \
+ } \
}
-void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
+SSE_HELPER_HW(phaddw, FADD)
+SSE_HELPER_HW(phsubw, FSUB)
+SSE_HELPER_HW(phaddsw, FADDSW)
+SSE_HELPER_HW(phsubsw, FSUBSW)
+SSE_HELPER_HL(phaddd, FADD)
+SSE_HELPER_HL(phsubd, FSUB)
- r.W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
- r.W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
- XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
- XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
- r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
- r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
- XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
- XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
- MOVE(*d, r);
+#undef SSE_HELPER_HW
+#undef SSE_HELPER_HL
+
+void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ Reg *v = d;
+ int i;
+ for (i = 0; i < 4 << SHIFT; i++) {
+ d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
+ (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
+ }
}
#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x)
@@ -1531,32 +1482,38 @@ SSE_HELPER_L(helper_psignd, FSIGNL)
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
int32_t shift)
{
- Reg r;
+ Reg *v = d;
+ int i;
/* XXX could be checked during translation */
- if (shift >= (16 << SHIFT)) {
- r.Q(0) = 0;
- XMM_ONLY(r.Q(1) = 0);
+ if (shift >= (SHIFT ? 32 : 16)) {
+ for (i = 0; i < (1 << SHIFT); i++) {
+ d->Q(i) = 0;
+ }
} else {
shift <<= 3;
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
- r.Q(0) = SHR(s->Q(0), shift - 0) |
- SHR(d->Q(0), shift - 64);
+ d->Q(0) = SHR(s->Q(0), shift - 0) |
+ SHR(v->Q(0), shift - 64);
#else
- r.Q(0) = SHR(s->Q(0), shift - 0) |
- SHR(s->Q(1), shift - 64) |
- SHR(d->Q(0), shift - 128) |
- SHR(d->Q(1), shift - 192);
- r.Q(1) = SHR(s->Q(0), shift + 64) |
- SHR(s->Q(1), shift - 0) |
- SHR(d->Q(0), shift - 64) |
- SHR(d->Q(1), shift - 128);
+ for (i = 0; i < (1 << SHIFT); i += 2) {
+ uint64_t r0, r1;
+
+ r0 = SHR(s->Q(i), shift - 0) |
+ SHR(s->Q(i + 1), shift - 64) |
+ SHR(v->Q(i), shift - 128) |
+ SHR(v->Q(i + 1), shift - 192);
+ r1 = SHR(s->Q(i), shift + 64) |
+ SHR(s->Q(i + 1), shift - 0) |
+ SHR(v->Q(i), shift - 64) |
+ SHR(v->Q(i + 1), shift - 128);
+ d->Q(i) = r0;
+ d->Q(i + 1) = r1;
+ }
#endif
#undef SHR
}
-
- MOVE(*d, r);
}
#define XMM0 (env->xmm_regs[0])
@@ -1681,17 +1638,23 @@ SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
- Reg r;
+ Reg *v = d;
+ uint16_t r[8];
+ int i, j, k;
- r.W(0) = satuw((int32_t) d->L(0));
- r.W(1) = satuw((int32_t) d->L(1));
- r.W(2) = satuw((int32_t) d->L(2));
- r.W(3) = satuw((int32_t) d->L(3));
- r.W(4) = satuw((int32_t) s->L(0));
- r.W(5) = satuw((int32_t) s->L(1));
- r.W(6) = satuw((int32_t) s->L(2));
- r.W(7) = satuw((int32_t) s->L(3));
- MOVE(*d, r);
+ for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
+ r[0] = satuw(v->L(j));
+ r[1] = satuw(v->L(j + 1));
+ r[2] = satuw(v->L(j + 2));
+ r[3] = satuw(v->L(j + 3));
+ r[4] = satuw(s->L(j));
+ r[5] = satuw(s->L(j + 1));
+ r[6] = satuw(s->L(j + 2));
+ r[7] = satuw(s->L(j + 3));
+ for (k = 0; k < 8; k++) {
+ d->W(i + k) = r[k];
+ }
+ }
}
#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
@@ -1947,20 +1910,25 @@ void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg
*d, Reg *s, uint32_t mask)
void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
uint32_t offset)
{
- int s0 = (offset & 3) << 2;
- int d0 = (offset & 4) << 0;
- int i;
- Reg r;
+ Reg *v = d;
+ int i, j;
+ uint16_t r[8];
- for (i = 0; i < 8; i++, d0++) {
- r.W(i) = 0;
- r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
- r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
- r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
- r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
+ for (j = 0; j < 4 << SHIFT; ) {
+ int s0 = (j * 2) + ((offset & 3) << 2);
+ int d0 = (j * 2) + ((offset & 4) << 0);
+ for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
+ r[i] = 0;
+ r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
+ r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
+ r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
+ r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
+ }
+ for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
+ d->W(j) = r[i];
+ }
+ offset >>= 3;
}
-
- MOVE(*d, r);
}
/* SSE4.2 op helpers */
--
2.37.2
- [PULL 20/39] target/i386: Rework sse_op_table6/7, (continued)
- [PULL 20/39] target/i386: Rework sse_op_table6/7, Paolo Bonzini, 2022/09/01
- [PULL 22/39] target/i386: check SSE table flags instead of hardcoding opcodes, Paolo Bonzini, 2022/09/01
- [PULL 18/39] target/i386: Add ZMM_OFFSET macro, Paolo Bonzini, 2022/09/01
- [PULL 23/39] target/i386: isolate MMX code more, Paolo Bonzini, 2022/09/01
- [PULL 26/39] target/i386: Add CHECK_NO_VEX, Paolo Bonzini, 2022/09/01
- [PULL 27/39] target/i386: rewrite destructive 3DNow operations, Paolo Bonzini, 2022/09/01
- [PULL 25/39] target/i386: do not cast gen_helper_* function pointers, Paolo Bonzini, 2022/09/01
- [PULL 24/39] target/i386: Add size suffix to vector FP helpers, Paolo Bonzini, 2022/09/01
- [PULL 30/39] target/i386: Misc integer AVX helper prep, Paolo Bonzini, 2022/09/01
- [PULL 33/39] target/i386: reimplement AVX comparison helpers, Paolo Bonzini, 2022/09/01
- [PULL 31/39] target/i386: Destructive vector helpers for AVX,
Paolo Bonzini <=
- [PULL 29/39] target/i386: Rewrite simple integer vector helpers, Paolo Bonzini, 2022/09/01
- [PULL 34/39] target/i386: Dot product AVX helper prep, Paolo Bonzini, 2022/09/01
- [PULL 37/39] target/i386: Rewrite blendv helpers, Paolo Bonzini, 2022/09/01
- [PULL 28/39] target/i386: Rewrite vector shift helper, Paolo Bonzini, 2022/09/01
- [PULL 32/39] target/i386: Floating point arithmetic helper AVX prep, Paolo Bonzini, 2022/09/01
- [PULL 38/39] target/i386: AVX pclmulqdq prep, Paolo Bonzini, 2022/09/01
- [PULL 35/39] target/i386: Destructive FP helpers for AVX, Paolo Bonzini, 2022/09/01
- [PULL 36/39] target/i386: Misc AVX helper prep, Paolo Bonzini, 2022/09/01
- [PULL 39/39] target/i386: AVX+AES helpers prep, Paolo Bonzini, 2022/09/01
- Re: [PULL 00/39] i386, SCSI, build system changes for 2022-09-01, Stefan Hajnoczi, 2022/09/05