From: Peter Maydell
Subject: Re: [PATCH v6 48/82] target/arm: Pass separate addend to {U, S}DOT helpers
Date: Thu, 13 May 2021 11:47:20 +0100
On Fri, 30 Apr 2021 at 21:57, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> For SVE, we potentially have a 4th argument coming from the
> movprfx instruction. Currently we do not optimize movprfx,
> so the problem is not visible.
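For anyone following the series: the functional change is just that the
accumulator input no longer has to be the destination register. A minimal
scalar sketch of the new helper shape (sdot_b_model is a made-up name for
illustration, not the real helper):

    #include <stdint.h>
    #include <stddef.h>

    /* d receives the result; the addend comes from a separate buffer a,
     * which for SVE may be the movprfx source rather than d itself. */
    static void sdot_b_model(int32_t *d, const int8_t *n, const int8_t *m,
                             const int32_t *a, size_t elems)
    {
        for (size_t i = 0; i < elems; i++) {
            d[i] = a[i]
                 + n[4 * i + 0] * m[4 * i + 0]
                 + n[4 * i + 1] * m[4 * i + 1]
                 + n[4 * i + 2] * m[4 * i + 2]
                 + n[4 * i + 3] * m[4 * i + 3];
        }
    }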
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> v4: Fix double addition (zhiwei).
> ---
> target/arm/helper.h | 20 +++---
> target/arm/sve.decode | 7 ++-
> target/arm/translate-a64.c | 15 ++++-
> target/arm/translate-neon.c | 10 +--
> target/arm/translate-sve.c | 13 ++--
> target/arm/vec_helper.c | 120 ++++++++++++++++++++----------------
> 6 files changed, 109 insertions(+), 76 deletions(-)
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index a3d80ecad0..f88e572132 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -375,71 +375,76 @@ void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
> * All elements are treated equally, no matter where they are.
> */
>
> -void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
> +void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
> {
> intptr_t i, opr_sz = simd_oprsz(desc);
> - uint32_t *d = vd;
> + int32_t *d = vd, *a = va;
Why the type change?
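(For what it's worth, the accumulating addition itself is bit-identical
either way on two's-complement targets; a tiny illustration, not from the
patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t  s = -5;
        uint32_t u = (uint32_t)s;
        int32_t  prod = -7 * 3;      /* a signed dot-product term */

        s += prod;                   /* signed accumulate */
        u += (uint32_t)prod;         /* unsigned accumulate */

        /* Both print -26: the lane type changes how we view the
         * bits, not the accumulated bit pattern. */
        printf("%d %d\n", s, (int32_t)u);
        return 0;
    }

so presumably this is for consistency with the signed products rather than
a behavioural difference.)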
> int8_t *n = vn, *m = vm;
>
> for (i = 0; i < opr_sz / 4; ++i) {
> - d[i] += n[i * 4 + 0] * m[i * 4 + 0]
> - + n[i * 4 + 1] * m[i * 4 + 1]
> - + n[i * 4 + 2] * m[i * 4 + 2]
> - + n[i * 4 + 3] * m[i * 4 + 3];
> + d[i] = (a[i] +
> + n[i * 4 + 0] * m[i * 4 + 0] +
> + n[i * 4 + 1] * m[i * 4 + 1] +
> + n[i * 4 + 2] * m[i * 4 + 2] +
> + n[i * 4 + 3] * m[i * 4 + 3]);
> }
> clear_tail(d, opr_sz, simd_maxsz(desc));
> }
> -void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
> {
> intptr_t i, opr_sz = simd_oprsz(desc);
> - uint64_t *d = vd;
> + int64_t *d = vd, *a = va;
Ditto.
> int16_t *n = vn, *m = vm;
>
> for (i = 0; i < opr_sz / 8; ++i) {
> - d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
> - + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
> - + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
> - + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
> + d[i] = (a[i] +
> + (int64_t)n[i * 4 + 0] * m[i * 4 + 0] +
> + (int64_t)n[i * 4 + 1] * m[i * 4 + 1] +
> + (int64_t)n[i * 4 + 2] * m[i * 4 + 2] +
> + (int64_t)n[i * 4 + 3] * m[i * 4 + 3]);
> }
> clear_tail(d, opr_sz, simd_maxsz(desc));
> }
>
> -void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
> +void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm,
> + void *va, uint32_t desc)
> {
> intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
> intptr_t index = simd_data(desc);
> - uint32_t *d = vd;
> + int32_t *d = vd, *a = va;
> int8_t *n = vn;
> int8_t *m_indexed = (int8_t *)vm + H4(index) * 4;
Ditto.
> -void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm,
> + void *va, uint32_t desc)
> {
> intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
> intptr_t index = simd_data(desc);
> - uint64_t *d = vd;
> + int64_t *d = vd, *a = va;
> int16_t *n = vn;
> int16_t *m_indexed = (int16_t *)vm + index * 4;
Ditto.
> @@ -509,30 +518,33 @@ void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
> * Process the entire segment all at once, writing back the results
> * only after we've consumed all of the inputs.
> */
> - for (i = 0; i < opr_sz_8 ; i += 2) {
> - uint64_t d0, d1;
> + for (i = 0; i < opr_sz_8; i += 2) {
> + int64_t d0, d1;
Ditto.
>
> - d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
> + d0 = a[i + 0];
> + d0 += n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
> d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
> d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
> d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
> - d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
> +
> + d1 = a[i + 1];
> + d1 += n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
> d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
> d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
> d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];
>
> - d[i + 0] += d0;
> - d[i + 1] += d1;
> + d[i + 0] = d0;
> + d[i + 1] = d1;
> }
> -
> clear_tail(d, opr_sz, simd_maxsz(desc));
> }
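The consume-all-inputs-then-write pattern quoted above matters because the
operands may overlap (e.g. vd == vm for the indexed forms). A minimal
sketch of the hazard it avoids, with made-up names:

    #include <stdint.h>

    /* If out aliases in, storing out[0] before reading in[1] would
     * corrupt the second result; buffering into locals first, as the
     * helper does with d0/d1, keeps all reads ahead of the writes. */
    static void write_back_after_reads(int64_t *out, const int64_t *in)
    {
        int64_t d0 = in[0] + 1;   /* read everything first... */
        int64_t d1 = in[1] + 1;
        out[0] = d0;              /* ...then write back */
        out[1] = d1;
    }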
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM