qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v10 1/1] target/riscv: rvv: Use wider accesses for unit strid


From: Daniel Henrique Barboza
Subject: Re: [PATCH v10 1/1] target/riscv: rvv: Use wider accesses for unit stride load/store
Date: Wed, 15 Jan 2025 12:06:21 -0300
User-agent: Mozilla Thunderbird



On 1/9/25 1:01 PM, Craig Blackmore wrote:
Use atomic load/store functions to access multiple elements from host.

Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Craig Blackmore <craig.blackmore@embecosm.com>
---

Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

  target/riscv/vector_helper.c | 95 +++++++++++++++++++++++++++++++++---
  1 file changed, 87 insertions(+), 8 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..f9228270b1 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -206,29 +206,108 @@ vext_continus_ldst_tlb(CPURISCVState *env, 
vext_ldst_elem_fn_tlb *ldst_tlb,
      }
  }
+#if !HOST_BIG_ENDIAN
+/* Atomic operations for load/store */
+
+#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE)                             \
+static inline QEMU_ALWAYS_INLINE void                                     \
+vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host,  \
+                             bool is_load)                                \
+{                                                                         \
+    TYPE *vd_ptr = (TYPE *) (vd + byte_offset);                           \
+    if (is_load) {                                                        \
+        *vd_ptr = qatomic_read__nocheck(host);                            \
+    } else {                                                              \
+        qatomic_set__nocheck(host, *vd_ptr);                              \
+    }                                                                     \
+}                                                                         \
+
+GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
+GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
+#ifdef CONFIG_ATOMIC64
+GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
+#endif
+
+static inline QEMU_ALWAYS_INLINE void
+vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
+                       bool is_load)
+{
+    Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
+    if (is_load) {
+        *vd_ptr = atomic16_read_ro(host);
+    } else {
+        atomic16_set(host, *vd_ptr);
+    }
+}
+#endif
+
  static inline QEMU_ALWAYS_INLINE void
  vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
                          void *vd, uint32_t evl, uint32_t reg_start, void 
*host,
-                        uint32_t esz, bool is_load)
+                        uint32_t esz, bool is_load, uint32_t log2_esz)
  {
  #if HOST_BIG_ENDIAN
      for (; reg_start < evl; reg_start++, host += esz) {
          ldst_host(vd, reg_start, host);
      }
  #else
-    if (esz == 1) {
-        uint32_t byte_offset = reg_start * esz;
-        uint32_t size = (evl - reg_start) * esz;
+    uint32_t size = (evl - reg_start) * esz;
+    uint32_t test = (uintptr_t) host;
+ /* Misaligned load/stores do not require any atomicity */
+    if (esz == 1 || unlikely(test % esz != 0)) {
+        uint32_t byte_offset = reg_start * esz;
          if (is_load) {
              memcpy(vd + byte_offset, host, size);
          } else {
              memcpy(host, vd + byte_offset, size);
          }
-    } else {
-        for (; reg_start < evl; reg_start++, host += esz) {
-            ldst_host(vd, reg_start, host);
+        return;
+    }
+
+    /*
+     * At this point esz > 1 and host is aligned to at least esz, so the ldst
+     * can be completed in atomic chunks of at least esz if the atomic 
operation
+     * is available.
+     */
+
+    /* Test that both alignment and size are multiples of the atomic width. */
+    test |= size;
+
+    /*
+     * If !HAVE_ATOMIC128_RO, then atomic16_set may be implemented with a
+     * 16-byte compare and store loop, which is expensive, so prefer two 8-byte
+     * stores in this case.
+     */
+    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
+        && (test % 16 == 0)) {
+        for (; reg_start < evl; reg_start += 16 >> log2_esz, host += 16) {
+            vext_ldst_atom_16_host(vd, reg_start * esz, host, is_load);
+        }
+        return;
+    }
+#ifdef CONFIG_ATOMIC64
+    if (test % 8 == 0) {
+        for (; reg_start < evl; reg_start += 8 >> log2_esz, host += 8) {
+            vext_ldst_atom_8_host(vd, reg_start * esz, host, is_load);
+        }
+        return;
+    }
+#endif
+    if (test % 4 == 0) {
+        for (; reg_start < evl; reg_start += 4 >> log2_esz, host += 4) {
+            vext_ldst_atom_4_host(vd, reg_start * esz, host, is_load);
+        }
+        return;
+    }
+    if (test % 2 == 0) {
+        for (; reg_start < evl; reg_start += 2 >> log2_esz, host += 2) {
+            vext_ldst_atom_2_host(vd, reg_start * esz, host, is_load);
          }
+        return;
+    }
+    for (; reg_start < evl; reg_start++, host += esz) {
+        ldst_host(vd, reg_start, host);
      }
  #endif
  }
@@ -343,7 +422,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, 
target_ulong addr,
      if (flags == 0) {
          if (nf == 1) {
              vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, 
host,
-                                    esz, is_load);
+                                    esz, is_load, log2_esz);
          } else {
              for (i = env->vstart; i < evl; ++i) {
                  k = 0;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]