This patch optimizes the emulation of unit-stride load/store RVV instructions
when the data being loaded/stored per iteration amounts to 64 bytes or more.
The optimization consists of calling __builtin_memcpy on chunks of data of
128 and 256 bits (16 and 32 bytes) between the memory address of the simulated
vector register and the destination memory address and vice versa.
This is done only if we have direct access to the RAM of the host machine.
Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
---
target/riscv/vector_helper.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 4b444c6bc5..7674972784 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -486,7 +486,22 @@ vext_group_ldst_host(CPURISCVState *env, void
*vd, uint32_t byte_end,
}
fn = fns[is_load][group_size];
- fn(vd, byte_offset, host + byte_offset);
+
+ if (byte_offset + 32 < byte_end) {
+ group_size = MO_256;
+ if (is_load)
+ __builtin_memcpy((uint8_t *)(vd + byte_offset), (uint8_t
*)(host + byte_offset), 32);
+ else
+ __builtin_memcpy((uint8_t *)(host + byte_offset), (uint8_t
*)(vd + byte_offset), 32);
+ } else if (byte_offset + 16 < byte_end) {
+ group_size = MO_128;
+ if (is_load)
+ __builtin_memcpy((uint8_t *)(vd + byte_offset), (uint8_t
*)(host + byte_offset), 16);
+ else
+ __builtin_memcpy((uint8_t *)(host + byte_offset), (uint8_t
*)(vd + byte_offset), 16);
+ } else {
+ fn(vd, byte_offset, host + byte_offset);
+ }