From: Alex Bennée
Subject: Re: [RFC 1/1 v3] target/riscv: use tcg ops generation to emulate whole reg rvv loads/stores.
Date: Wed, 22 Jan 2025 17:43:38 +0000
User-agent: mu4e 1.12.8; emacs 29.4
Paolo Savini <paolo.savini@embecosm.com> writes:
> This patch replaces the use of a helper function with direct tcg ops
> generation in order to emulate whole register loads and stores. This is
> done in order to improve the performance of QEMU.
Generally, having the frontend second-guess what the backend will do is
not recommended.
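
For instance (a sketch reusing t8, addr, s and atomicity from the patch
below), the usual style is to emit the width the guest operation wants
unconditionally and let TCG lower it for the host:

    /* works on both 32-bit and 64-bit hosts; a 32-bit backend
     * splits the access itself where it has to */
    tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx, MO_LE | MO_64 | atomicity);

The patch below branches on TCG_TARGET_REG_BITS instead, which is
presumably the kind of second-guessing meant here.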
> We still use the helper function when vstart is not 0 at the beginning of
> the emulation of the whole register load or store, or when we would end up
> generating partial loads or stores of vector elements (e.g. emulating 64
> bit element loads with pairs of 32 bit loads on hosts with 32 bit
> registers). The latter condition ensures that we are not surprised by a
> trap in mid-element and consequently that we can update vstart correctly.
This is what probe functions are for, so you can verify you won't fault
and then fully unroll the loop.
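
Something along these lines (an illustrative sketch only — probe_access()
is the real API, while the wrapper name and its arguments here are made up
for the example):

    /*
     * Probe the whole transfer up front so any fault is raised before
     * the first byte moves; probe_access() takes the guest fault path
     * (longjmp out of the helper) if a page is inaccessible.
     */
    static void probe_whole_reg(CPURISCVState *env, target_ulong addr,
                                uint32_t size, MMUAccessType access_type,
                                int mmu_idx, uintptr_t ra)
    {
        target_ulong end = addr + size;

        while (addr < end) {
            /* bytes remaining on the current guest page */
            target_ulong left = -(addr | TARGET_PAGE_MASK);

            probe_access(env, addr, MIN(left, end - addr),
                         access_type, mmu_idx, ra);
            addr += left;
        }
    }

Once every page of the transfer has been probed, no access in the
unrolled loop can fault, so vstart never needs updating mid-way.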
> We also use the helper function when it performs better than tcg for specific
> combinations of vector length, number of fields and element size.
>
> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
> ---
> target/riscv/insn_trans/trans_rvv.c.inc | 164 +++++++++++++++++-------
> 1 file changed, 119 insertions(+), 45 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index b9883a5d32..85935276de 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1100,25 +1100,99 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>  typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>
>  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             gen_helper_ldst_whole *fn,
> -                             DisasContext *s)
> +                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
> +                             DisasContext *s, bool is_load)
>  {
> -    TCGv_ptr dest;
> -    TCGv base;
> -    TCGv_i32 desc;
> +    mark_vs_dirty(s);
>
> -    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> -    data = FIELD_DP32(data, VDATA, VM, 1);
> -    dest = tcg_temp_new_ptr();
> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> -                                      s->cfg_ptr->vlenb, data));
> +    uint32_t vlen = s->cfg_ptr->vlenb << 3;
>
> -    base = get_gpr(s, rs1, EXT_NONE);
> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +    /*
> +     * Load/store multiple bytes per iteration.
> +     * When possible do this atomically.
> +     * Update vstart with the number of processed elements.
> +     * Use the helper function if either:
> +     * - vstart is not 0.
> +     * - the target has 32 bit registers and we are loading/storing 64 bit
> +     *   long elements. This is to ensure that we process every element with
> +     *   a single memory instruction.
> +     * - the helper function performs better:
> +     *   on x86 the helper function performs better with a few combinations
> +     *   of NF, ESZ and VLEN.
> +     *   Other architectures may have other combinations or conditions and
> +     *   they can be added here if necessary.
> +     */
>
> -    mark_vs_dirty(s);
> +    bool use_helper_fn = !s->vstart_eq_zero ||
> +                         (TCG_TARGET_REG_BITS == 32 && log2_esz == 3);
> +
> +#if defined(HOST_X86_64)
> +    use_helper_fn |= ((nf == 4) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 512))  ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 3) && (vlen == 1024));
> +#endif
Using host architecture ifdefs is generally discouraged except in a few places.
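
If host-specific tuning is kept at all, it is better contained in one
clearly-marked predicate than scattered through the translator. A sketch
(the function name is a placeholder, and the thresholds are simply the
values measured in the patch, not a suggested interface):

    /* Collect the host-measured cases where the out-of-line helper
     * beats inline tcg ops; everything host-specific lives here. */
    static bool ldst_whole_prefer_helper(uint32_t nf, uint32_t log2_esz,
                                         uint32_t vlen)
    {
    #if defined(HOST_X86_64)
        return (nf == 4 && log2_esz == 0 && vlen == 1024) ||
               (nf == 8 && log2_esz == 0 && (vlen == 512 || vlen == 1024)) ||
               (nf == 8 && log2_esz == 3 && vlen == 1024);
    #else
        return false;
    #endif
    }

That still embeds a single-machine measurement in the frontend, though,
which is the underlying objection.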
>
> -    fn(dest, base, tcg_env, desc);
> +    if (!use_helper_fn) {
> +        TCGv addr = tcg_temp_new();
> +        uint32_t size = s->cfg_ptr->vlenb * nf;
> +        TCGv_i64 t8 = tcg_temp_new_i64();
> +        TCGv_i32 t4 = tcg_temp_new_i32();
> +        MemOp atomicity = MO_ATOM_NONE;
> +        if (log2_esz == 0) {
> +            atomicity = MO_ATOM_NONE;
> +        } else {
> +            atomicity = MO_ATOM_IFALIGN_PAIR;
> +        }
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            for (int i = 0; i < size; i += 8) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                    tcg_gen_st_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                }
> +                if (i == size - 8) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 8 >> log2_esz);
> +                }
> +            }
> +        } else {
> +            for (int i = 0; i < size; i += 4) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                    tcg_gen_st_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                }
> +                if (i == size - 4) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 4 >> log2_esz);
> +                }
> +            }
> +        }
> +    } else {
> +        TCGv_ptr dest;
> +        TCGv base;
> +        TCGv_i32 desc;
> +        uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> +        data = FIELD_DP32(data, VDATA, VM, 1);
> +        dest = tcg_temp_new_ptr();
> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> +                                          s->cfg_ptr->vlenb, data));
> +        base = get_gpr(s, rs1, EXT_NONE);
> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +        fn(dest, base, tcg_env, desc);
> +    }
>
>      finalize_rvv_inst(s);
>      return true;
> @@ -1128,42 +1202,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>   * load and store whole register instructions ignore vtype and vl setting.
>   * Thus, we don't need to check vill bit. (Section 7.9)
>   */
> -#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                 \
> -static bool trans_##NAME(DisasContext *s, arg_##NAME * a)  \
> -{                                                          \
> -    if (require_rvv(s) &&                                  \
> -        QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                  \
> -        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,     \
> -                                gen_helper_##NAME, s);     \
> -    }                                                      \
> -    return false;                                          \
> -}
> -
> -GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
> +#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD)                  \
> +static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                   \
> +{                                                                           \
> +    if (require_rvv(s) &&                                                   \
> +        QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                   \
> +        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
> +                                gen_helper_##NAME, s, IS_LOAD);             \
> +    }                                                                       \
> +    return false;                                                           \
> +}
> +
> +GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
>
>  /*
>   * The vector whole register store instructions are encoded similar to
>   * unmasked unit-stride store of elements with EEW=8.
>   */
> -GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
> -GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
> -GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
> -GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
> +GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
> +GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
> +GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
> +GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
>
>  /*
>   *** Vector Integer Arithmetic Instructions
--
Alex Bennée
Virtualisation Tech Lead @ Linaro