[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v5 70/81] target/arm: Implement SVE2 LD1RO
From: Richard Henderson
Subject: [PATCH v5 70/81] target/arm: Implement SVE2 LD1RO
Date: Fri, 16 Apr 2021 14:02:29 -0700
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/sve.decode | 4 ++
target/arm/translate-sve.c | 97 ++++++++++++++++++++++++++++++++++++++
2 files changed, 101 insertions(+)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 17adb393ff..df870ce23b 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1077,11 +1077,15 @@ LD_zpri 1010010 .. nreg:2 0.... 111 ... .....
..... @rpri_load_msz
# SVE load and broadcast quadword (scalar plus scalar)
LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \
@rprr_load_msz nreg=0
+LD1RO_zprr 1010010 .. 01 ..... 000 ... ..... ..... \
+ @rprr_load_msz nreg=0
# SVE load and broadcast quadword (scalar plus immediate)
# LD1RQB, LD1RQH, LD1RQS, LD1RQD
LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \
@rpri_load_msz nreg=0
+LD1RO_zpri 1010010 .. 01 0.... 001 ... ..... ..... \
+ @rpri_load_msz nreg=0
# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
PRF 1000010 00 -1 ----- 0-- --- ----- 0 ----
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 04efa037f2..1cc98a1447 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -5586,6 +5586,103 @@ static bool trans_LD1RQ_zpri(DisasContext *s,
arg_rpri_load *a)
return true;
}
/*
 * Emit code for LD1RO: perform one 32-byte (octaword) predicated load
 * at @addr into vector register @zt, then replicate that octaword to
 * fill the rest of the vector.  The caller has already passed
 * sve_access_check(); this function must not fail back to the caller.
 */
static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype)
{
    unsigned vsz = vec_full_reg_size(s);
    unsigned vsz_r32;
    TCGv_ptr t_pg;
    TCGv_i32 t_desc;
    int desc, poff, doff;

    /* LD1RO requires a vector length of at least 256 bits. */
    if (vsz < 32) {
        /*
         * Note that this UNDEFINED check comes after CheckSVEEnabled()
         * in the ARM pseudocode, which is the sve_access_check() done
         * in our caller. We should not now return false from the caller.
         */
        unallocated_encoding(s);
        return;
    }

    /* Load the first octaword using the normal predicated load helpers. */

    poff = pred_full_reg_offset(s, pg);
    if (vsz > 32) {
        /*
         * Zero-extend the first 32 bits of the predicate into a temporary.
         * This avoids triggering an assert making sure we don't have bits
         * set within a predicate beyond VQ, but we have lowered VQ to 2
         * for this load operation.
         */
        TCGv_i64 tmp = tcg_temp_new_i64();
#ifdef HOST_WORDS_BIGENDIAN
        /*
         * On a big-endian host the wanted low 32 predicate bits sit in
         * the other half of the 64-bit word, hence the +4 byte offset.
         */
        poff += 4;
#endif
        tcg_gen_ld32u_i64(tmp, cpu_env, poff);

        /* Stage the narrowed predicate in the scratch slot preg_tmp. */
        poff = offsetof(CPUARMState, vfp.preg_tmp);
        tcg_gen_st_i64(tmp, cpu_env, poff);
        tcg_temp_free_i64(tmp);
    }

    t_pg = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_pg, cpu_env, poff);

    /* Describe a fixed 32-byte operation, regardless of the real VL. */
    desc = simd_desc(32, 32, zt);
    t_desc = tcg_const_i32(desc);

    /*
     * Pick the load helper by MTE state, target endianness and dtype.
     * NOTE(review): the final [0] index presumably selects the nreg=0
     * (single-register) variant — confirm against the ldr_fns table.
     */
    gen_helper_gvec_mem *fn
        = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0];
    fn(cpu_env, t_pg, addr, t_desc);

    tcg_temp_free_ptr(t_pg);
    tcg_temp_free_i32(t_desc);

    /*
     * Replicate that first octaword.
     * The replication happens in units of 32; if the full vector size
     * is not a multiple of 32, the final bits are zeroed.
     */
    doff = vec_full_reg_offset(s, zt);
    vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32);
    if (vsz >= 64) {
        /* vece=5 (2^5 = 32-byte elements): duplicate the octaword. */
        tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz - 32);
    } else if (vsz > vsz_r32) {
        /* Nop move, with side effect of clearing the tail. */
        tcg_gen_gvec_mov(MO_64, doff, doff, vsz_r32, vsz);
    }
}
+
+static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a)
+{
+ if (!dc_isar_feature(aa64_sve2_f64mm, s)) {
+ return false;
+ }
+ if (a->rm == 31) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ldro(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
+static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ if (!dc_isar_feature(aa64_sve2_f64mm, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32);
+ do_ldro(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
/* Load and broadcast element. */
static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a)
{
--
2.25.1
- [PATCH v5 60/81] target/arm: Implement SVE mixed sign dot product, (continued)
- [PATCH v5 60/81] target/arm: Implement SVE mixed sign dot product, Richard Henderson, 2021/04/16
- [PATCH v5 68/81] target/arm: Implement SVE2 FLOGB, Richard Henderson, 2021/04/16
- [PATCH v5 69/81] target/arm: Share table of sve load functions, Richard Henderson, 2021/04/16
- [PATCH v5 62/81] target/arm: Implement SVE2 crypto destructive binary operations, Richard Henderson, 2021/04/16
- [PATCH v5 66/81] target/arm: Implement SVE2 FCVTLT, Richard Henderson, 2021/04/16
- [PATCH v5 53/81] target/arm: Implement SVE2 integer multiply-add (indexed), Richard Henderson, 2021/04/16
- [PATCH v5 59/81] target/arm: Implement SVE mixed sign dot product (indexed), Richard Henderson, 2021/04/16
- [PATCH v5 65/81] target/arm: Implement SVE2 FCVTNT, Richard Henderson, 2021/04/16
- [PATCH v5 67/81] target/arm: Implement SVE2 FCVTXNT, FCVTX, Richard Henderson, 2021/04/16
- [PATCH v5 63/81] target/arm: Implement SVE2 crypto constructive binary operations, Richard Henderson, 2021/04/16
- [PATCH v5 70/81] target/arm: Implement SVE2 LD1RO, Richard Henderson <=
- [PATCH v5 71/81] target/arm: Implement 128-bit ZIP, UZP, TRN, Richard Henderson, 2021/04/16
- [PATCH v5 72/81] target/arm: Implement SVE2 bitwise shift immediate, Richard Henderson, 2021/04/16
- [PATCH v5 73/81] target/arm: Implement SVE2 fp multiply-add long, Richard Henderson, 2021/04/16
- [PATCH v5 75/81] target/arm: Split out do_neon_ddda_fpst, Richard Henderson, 2021/04/16
- [PATCH v5 74/81] target/arm: Implement aarch64 SUDOT, USDOT, Richard Henderson, 2021/04/16
- [PATCH v5 78/81] target/arm: Split decode of VSDOT and VUDOT, Richard Henderson, 2021/04/16
- [PATCH v5 76/81] target/arm: Remove unused fpst from VDOT_scalar, Richard Henderson, 2021/04/16
- [PATCH v5 77/81] target/arm: Fix decode for VDOT (indexed), Richard Henderson, 2021/04/16
- [PATCH v5 79/81] target/arm: Implement aarch32 VSUDOT, VUSDOT, Richard Henderson, 2021/04/16
- [PATCH v5 80/81] target/arm: Implement integer matrix multiply accumulate, Richard Henderson, 2021/04/16