qemu-arm
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v4 20/45] target/arm: Implement SME LD1, ST1


From: Peter Maydell
Subject: Re: [PATCH v4 20/45] target/arm: Implement SME LD1, ST1
Date: Mon, 4 Jul 2022 11:39:47 +0100

On Tue, 28 Jun 2022 at 05:34, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We cannot reuse the SVE functions for LD[1-4] and ST[1-4],
> because those functions accept only a Zreg register number.
> For SME, we want to pass a pointer into ZA storage.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---


> +
> +/*
> + * Clear elements in a tile slice comprising len bytes.
> + */
> +
> +typedef void ClearFn(void *ptr, size_t off, size_t len);
> +
> +static void clear_horizontal(void *ptr, size_t off, size_t len)
> +{
> +    memset(ptr + off, 0, len);
> +}
> +
> +static void clear_vertical_b(void *vptr, size_t off, size_t len)
> +{
> +    uint8_t *ptr = vptr;
> +    size_t i;
> +
> +    for (i = 0; i < len; ++i) {
> +        ptr[(i + off) * sizeof(ARMVectorReg)] = 0;
> +    }
> +}
> +
> +static void clear_vertical_h(void *vptr, size_t off, size_t len)
> +{
> +    uint16_t *ptr = vptr;
> +    size_t i;
> +
> +    for (i = 0; i < len / 2; ++i) {
> +        ptr[(i + off) * sizeof(ARMVectorReg)] = 0;
> +    }

The code for the bigger-than-byte vertical actions seems wrong:
because 'ptr' is a uint16_t here this expression is mixing
byte offsets (off, the multiply by sizeof(ARMVectorReg) with
halfword offsets (i, the fact we're calculating an index value
for a uint16_t array).

> +}
> +
> +static void clear_vertical_s(void *vptr, size_t off, size_t len)
> +{
> +    uint32_t *ptr = vptr;
> +    size_t i;
> +
> +    for (i = 0; i < len / 4; ++i) {
> +        ptr[(i + off) * sizeof(ARMVectorReg)] = 0;
> +    }
> +}
> +
> +static void clear_vertical_d(void *vptr, size_t off, size_t len)
> +{
> +    uint64_t *ptr = vptr;
> +    size_t i;
> +
> +    for (i = 0; i < len / 8; ++i) {
> +        ptr[(i + off) * sizeof(ARMVectorReg)] = 0;
> +    }
> +}
> +
> +static void clear_vertical_q(void *vptr, size_t off, size_t len)
> +{
> +    Int128 *ptr = vptr, zero = int128_zero();
> +    size_t i;
> +
> +    for (i = 0; i < len / 16; ++i) {
> +        ptr[(i + off) * sizeof(ARMVectorReg)] = zero;
> +    }
> +}
> +
> +/*
> + * Copy elements from an array into a tile slice comprising len bytes.
> + */
> +
> +typedef void CopyFn(void *dst, const void *src, size_t len);
> +
> +static void copy_horizontal(void *dst, const void *src, size_t len)
> +{
> +    memcpy(dst, src, len);
> +}
> +
> +static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
> +{
> +    const uint8_t *src = vsrc;
> +    uint8_t *dst = vdst;
> +    size_t i;
> +
> +    for (i = 0; i < len; ++i) {
> +        dst[i * sizeof(ARMVectorReg)] = src[i];
> +    }
> +}
> +
> +static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
> +{
> +    const uint16_t *src = vsrc;
> +    uint16_t *dst = vdst;
> +    size_t i;
> +
> +    for (i = 0; i < len / 2; ++i) {
> +        dst[i * sizeof(ARMVectorReg)] = src[i];

Similarly the array index calculation for dst[] here looks wrong.

> +    }
> +}
> +
> +static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
> +{
> +    const uint32_t *src = vsrc;
> +    uint32_t *dst = vdst;
> +    size_t i;
> +
> +    for (i = 0; i < len / 4; ++i) {
> +        dst[i * sizeof(ARMVectorReg)] = src[i];
> +    }
> +}
> +
> +static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
> +{
> +    const uint64_t *src = vsrc;
> +    uint64_t *dst = vdst;
> +    size_t i;
> +
> +    for (i = 0; i < len / 8; ++i) {
> +        dst[i * sizeof(ARMVectorReg)] = src[i];
> +    }
> +}
> +
> +static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
> +{
> +    const Int128 *src = vsrc;
> +    Int128 *dst = vdst;
> +    size_t i;
> +
> +    for (i = 0; i < len / 16; ++i) {
> +        dst[i * sizeof(ARMVectorReg)] = src[i];
> +    }
> +}
> +
> +/*
> + * Host and TLB primitives for vertical tile slice addressing.
> + */
> +
> +#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
> +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
> +{                                                                           \
> +    TYPE val = HOST(host);                                                  \
> +    *(TYPE *)(za + off * sizeof(ARMVectorReg)) = val;                       \
> +}                                                                           \
> +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
> +                        intptr_t off, target_ulong addr, uintptr_t ra)      \
> +{                                                                           \
> +    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
> +    *(TYPE *)(za + off * sizeof(ARMVectorReg)) = val;                       \
> +}

So in these functions is 'za' pre-adjusted to the start address of the
vertical column? Is 'off' a byte offset here (in which case the
arithmetic is wrong for anything except byte columns) or has it
been adjusted in the caller to be the index within the column?
(I can't see anything doing the adjustment in the caller but I might
have got tangled up in the macros :-))

> +
> +#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
> +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
> +{                                                                           \
> +    TYPE val = *(TYPE *)(za + off * sizeof(ARMVectorReg));                  \
> +    HOST(host, val);                                                        \
> +}                                                                           \
> +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
> +                        intptr_t off, target_ulong addr, uintptr_t ra)      \
> +{                                                                           \
> +    TYPE val = *(TYPE *)(za + off * sizeof(ARMVectorReg));                  \
> +    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
> +}
> +
> +/*
> + * The ARMVectorReg elements are stored in host-endian 64-bit units.
> + * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
> + * corresponds to storing the two 64-bit pieces in little-endian order.
> + */
> +#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
> +static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
> +{                                                                           \
> +    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
> +    uint64_t *ptr = za + off;                                               \
> +    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
> +}                                                                           \
> +static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
> +{                                                                           \
> +    HNAME##_host(za, off * sizeof(ARMVectorReg), host);                     \
> +}                                                                           \
> +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
> +                               target_ulong addr, uintptr_t ra)             \
> +{                                                                           \
> +    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
> +    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
> +    uint64_t *ptr = za + off;                                               \
> +    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
> +}                                                                           \
> +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
> +                               target_ulong addr, uintptr_t ra)             \
> +{                                                                           \
> +    HNAME##_tlb(env, za, off * sizeof(ARMVectorReg), addr, ra);             \
> +}
> +
> +#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
> +static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
> +{                                                                           \
> +    uint64_t *ptr = za + off;                                               \
> +    HOST(host, ptr[BE]);                                                    \
> +    HOST(host + 1, ptr[!BE]);                                               \
> +}                                                                           \
> +static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
> +{                                                                           \
> +    HNAME##_host(za, off * sizeof(ARMVectorReg), host);                     \
> +}                                                                           \
> +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
> +                               target_ulong addr, uintptr_t ra)             \
> +{                                                                           \
> +    uint64_t *ptr = za + off;                                               \
> +    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
> +    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
> +}                                                                           \
> +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
> +                               target_ulong addr, uintptr_t ra)             \
> +{                                                                           \
> +    HNAME##_tlb(env, za, off * sizeof(ARMVectorReg), addr, ra);             \
> +}
> +

> +static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
> +{
> +    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
> +
> +    /*
> +     * Indexed by [esz][be][v][mte][st], which is (except for load/store)
> +     * also the order in which the elements appear in the function names,
> +     * and so how we must concatenate the pieces.
> +     */
> +
> +#define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
> +#define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
> +#define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
> +#define FN_END(L, B) { FN_HV(L), FN_HV(B) }
> +
> +    static GenLdSt1 * const fns[5][2][2][2][2] = {
> +        FN_END(b, b),
> +        FN_END(h_le, h_be),
> +        FN_END(s_le, s_be),
> +        FN_END(d_le, d_be),
> +        FN_END(q_le, q_be),
> +    };
> +
> +#undef FN_LS
> +#undef FN_MTE
> +#undef FN_HV
> +#undef FN_END
> +
> +    TCGv_ptr t_za, t_pg;
> +    TCGv_i64 addr;
> +    int svl, desc = 0;
> +    bool be = s->be_data == MO_BE;
> +    bool mte = s->mte_active[0];
> +
> +    if (!dc_isar_feature(aa64_sme, s)) {
> +        return false;
> +    }
> +    if (!sme_smza_enabled_check(s)) {
> +        return true;
> +    }
> +
> +    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
> +    t_pg = pred_full_reg_ptr(s, a->pg);
> +    addr = tcg_temp_new_i64();
> +
> +    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
> +    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

Theoretically we ought to call gen_check_sp_alignment() here
for rn == 31, but I guess we didn't do that for any of the
non-base-A64 instructions like SVE either.

> +
> +    if (mte) {
> +        desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
> +        desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
> +        desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
> +        desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
> +        desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
> +        desc <<= SVE_MTEDESC_SHIFT;
> +    } else {
> +        addr = clean_data_tbi(s, addr);
> +    }
> +    svl = streaming_vec_reg_size(s);
> +    desc = simd_desc(svl, svl, desc);
> +
> +    fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
> +                                      tcg_constant_i32(desc));
> +
> +    tcg_temp_free_ptr(t_za);
> +    tcg_temp_free_ptr(t_pg);
> +    tcg_temp_free_i64(addr);
> +    return true;
> +}
> --
> 2.34.1

thanks
-- PMM



reply via email to

[Prev in Thread] Current Thread [Next in Thread]