From: Shu-Chun Weng
Subject: [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
Date: Fri, 24 Jul 2020 12:51:28 -0700
This patch introduces no functional change.
There is a potential link error with "undefined symbol:
qemu_build_not_reached" due to how `store_helper` is structured.
This does not reproduce at current QEMU head, but was reproducible at
v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.
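For context, the `qemu_build_not_reached` failure comes from a
link-time-assertion pattern: the symbol is declared but never defined,
so it may only be referenced on paths the optimizer can prove dead, and
any call that survives optimization shows up as an undefined symbol at
link time. A minimal sketch of that pattern, with illustrative names
rather than QEMU's actual declarations:

    #include <stdint.h>

    /*
     * Illustrative sketch only, not QEMU's code: the function below is
     * declared but deliberately never defined, so any call that
     * survives optimization becomes an "undefined symbol" link error.
     */
    extern void link_error_not_reached(void);

    static inline void store_sized(void *p, uint64_t val, int size)
    {
        switch (size) {
        case 1: *(uint8_t *)p  = val; break;
        case 2: *(uint16_t *)p = val; break;
        case 4: *(uint32_t *)p = val; break;
        case 8: *(uint64_t *)p = val; break;
        default:
            /* Dead once every caller is inlined with a literal size. */
            link_error_not_reached();
        }
    }

The scheme only holds together if every caller of `store_sized` is
inlined with a compile-time-constant size; in cputlb.c, `store_memop`
(the switch) and `store_helper` (its caller) play those two roles with
`MemOp op`.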
The current function structure is:
    inline QEMU_ALWAYS_INLINE
    store_memop() {
        switch () {
        ...
        default:
            qemu_build_not_reached();
        }
    }

    inline QEMU_ALWAYS_INLINE
    store_helper() {
        ...
        if (span_two_pages_or_io) {
            ...
            helper_ret_stb_mmu();
        }
        store_memop();
    }

    helper_ret_stb_mmu() {
        store_helper();
    }
Both `store_memop` and `store_helper` need to be inlined and allow
constant propagation to eliminate the `qemu_build_not_reached` call.
QEMU_ALWAYS_INLINE is a valiant effort to make that happen, but the
functions are still not guaranteed to be inlined. What happens with the
failing combination is that the compiler decides to inline the
`helper_ret_stb_mmu` call inside `store_helper`, making the latter
self-recursive and thus preventing constant propagation.
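To make the failure mode concrete, here is a hypothetical,
stripped-down analogue of the structure above (illustrative names
again, not the actual cputlb code):

    #include <stdint.h>

    extern void link_error_not_reached(void);  /* never defined */

    static inline void generic_store(char *p, uint64_t val, int size,
                                     int spans_pages);

    /* Out-of-line byte helper, in the role of helper_ret_stb_mmu(). */
    static void byte_store(char *p, uint64_t val)
    {
        generic_store(p, val, 1, 0);
    }

    /* In the role of store_helper(): slow path uses byte stores. */
    static inline void generic_store(char *p, uint64_t val, int size,
                                     int spans_pages)
    {
        if (size > 1 && spans_pages) {
            for (int i = 0; i < size; i++) {
                byte_store(p + i, val >> (i * 8));
            }
            return;
        }
        switch (size) {
        case 1:
            p[0] = (char)val;
            break;
        case 2:
            p[0] = (char)val;
            p[1] = (char)(val >> 8);
            break;
        default:
            /* Must be folded away for the program to link. */
            link_error_not_reached();
        }
    }

As long as `byte_store` stays out of line, every copy of the switch is
instantiated with a literal size and the default branch folds away. If
the compiler instead inlines `byte_store` back into `generic_store`,
the latter becomes self-recursive; the recursive copy can then be
compiled with `size` as a runtime value, the default branch is no
longer provably dead, and the undefined symbol can leak into the
object file.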
The new structure is:
    inline QEMU_ALWAYS_INLINE
    store_memop() {
        switch () {
        ...
        default:
            qemu_build_not_reached();
        }
    }

    inline QEMU_ALWAYS_INLINE
    store_helper_size_aligned() {
        ...
        if (span_two_pages_or_io) {
            return false;
        }
        store_memop();
        return true;
    }

    inline QEMU_ALWAYS_INLINE
    store_helper() {
        if (store_helper_size_aligned()) {
            return;
        }
        helper_ret_stb_mmu();
    }

    helper_ret_stb_mmu() {
        store_helper_size_aligned();
    }
Since a byte store can never span more than one page, having
`helper_ret_stb_mmu` call `store_helper_size_aligned` directly is
functionally a no-op. Moreover, there is no longer any recursion, which
makes the code more robust against potential compiler optimizations.
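In terms of the same illustrative sketch, the restructuring splits the
size-aligned fast path into a helper that reports failure instead of
recursing, so the byte helper never calls back into the generic one
(hypothetical names again):

    #include <stdint.h>

    extern void link_error_not_reached(void);  /* never defined */

    /* Returns 0 if the store is not size-aligned and nothing is stored. */
    static inline int generic_store_size_aligned(char *p, uint64_t val,
                                                 int size, int spans_pages)
    {
        if (size > 1 && spans_pages) {
            return 0;
        }
        switch (size) {
        case 1:
            p[0] = (char)val;
            break;
        case 2:
            p[0] = (char)val;
            p[1] = (char)(val >> 8);
            break;
        default:
            link_error_not_reached();
        }
        return 1;
    }

    /* A byte store is always size-aligned, so this never falls back. */
    static void byte_store(char *p, uint64_t val)
    {
        generic_store_size_aligned(p, val, 1, 0);
    }

    static inline void generic_store(char *p, uint64_t val, int size,
                                     int spans_pages)
    {
        if (generic_store_size_aligned(p, val, size, spans_pages)) {
            return;
        }
        for (int i = 0; i < size; i++) {
            byte_store(p + i, val >> (i * 8));
        }
    }

The call graph is now acyclic (generic_store -> byte_store ->
generic_store_size_aligned), so no inlining decision can reintroduce
self-recursion, and every instantiation of the switch still sees a
constant size.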
Signed-off-by: Shu-Chun Weng <scw@google.com>
---
v1: https://patchew.org/QEMU/20200318062057.224953-1-scw@google.com/
v1 -> v2:
Restructure the function instead of marking `helper_ret_stb_mmu` noinline.
accel/tcg/cputlb.c | 158 ++++++++++++++++++++++++++-------------------
1 file changed, 92 insertions(+), 66 deletions(-)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index d370aedb47..14324f08d2 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2010,9 +2010,10 @@ store_memop(void *haddr, uint64_t val, MemOp op)
}
}
-static inline void QEMU_ALWAYS_INLINE
-store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
- TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
+/* Returns false if the store is not size-aligned and no store is done. */
+static inline bool QEMU_ALWAYS_INLINE
+store_helper_size_aligned(CPUArchState *env, target_ulong addr, uint64_t val,
+ TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
{
uintptr_t mmu_idx = get_mmuidx(oi);
uintptr_t index = tlb_index(env, mmu_idx, addr);
@@ -2048,7 +2049,7 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
/* For anything that is unaligned, recurse through byte stores. */
if ((addr & (size - 1)) != 0) {
- goto do_unaligned_access;
+ return false;
}
iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
@@ -2066,12 +2067,12 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
if (tlb_addr & TLB_MMIO) {
io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
op ^ (need_swap * MO_BSWAP));
- return;
+ return true;
}
/* Ignore writes to ROM. */
if (unlikely(tlb_addr & TLB_DISCARD_WRITE)) {
- return;
+ return true;
}
/* Handle clean RAM pages. */
@@ -2091,82 +2092,107 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
} else {
store_memop(haddr, val, op);
}
- return;
+ return true;
}
- /* Handle slow unaligned access (it spans two pages or IO). */
if (size > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
>= TARGET_PAGE_SIZE)) {
- int i;
- uintptr_t index2;
- CPUTLBEntry *entry2;
- target_ulong page2, tlb_addr2;
- size_t size2;
+ /* Slow unaligned access (it spans two pages or IO). */
+ return false;
+ }
- do_unaligned_access:
- /*
- * Ensure the second page is in the TLB. Note that the first page
- * is already guaranteed to be filled, and that the second page
- * cannot evict the first.
- */
- page2 = (addr + size) & TARGET_PAGE_MASK;
- size2 = (addr + size) & ~TARGET_PAGE_MASK;
- index2 = tlb_index(env, mmu_idx, page2);
- entry2 = tlb_entry(env, mmu_idx, page2);
- tlb_addr2 = tlb_addr_write(entry2);
- if (!tlb_hit_page(tlb_addr2, page2)) {
- if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
- tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
- mmu_idx, retaddr);
- index2 = tlb_index(env, mmu_idx, page2);
- entry2 = tlb_entry(env, mmu_idx, page2);
- }
- tlb_addr2 = tlb_addr_write(entry2);
- }
+ haddr = (void *)((uintptr_t)addr + entry->addend);
+ store_memop(haddr, val, op);
+ return true;
+}
- /*
- * Handle watchpoints. Since this may trap, all checks
- * must happen before any store.
- */
- if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
- cpu_check_watchpoint(env_cpu(env), addr, size - size2,
- env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
- BP_MEM_WRITE, retaddr);
- }
- if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
- cpu_check_watchpoint(env_cpu(env), page2, size2,
- env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
- BP_MEM_WRITE, retaddr);
- }
+static inline void QEMU_ALWAYS_INLINE
+store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+ TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
+{
+ uintptr_t mmu_idx;
+ uintptr_t index;
+ CPUTLBEntry *entry;
+ target_ulong tlb_addr;
+ const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+ size_t size;
+ int i;
+ uintptr_t index2;
+ CPUTLBEntry *entry2;
+ target_ulong page2, tlb_addr2;
+ size_t size2;
- /*
- * XXX: not efficient, but simple.
- * This loop must go in the forward direction to avoid issues
- * with self-modifying code in Windows 64-bit.
- */
- for (i = 0; i < size; ++i) {
- uint8_t val8;
- if (memop_big_endian(op)) {
- /* Big-endian extract. */
- val8 = val >> (((size - 1) * 8) - (i * 8));
- } else {
- /* Little-endian extract. */
- val8 = val >> (i * 8);
- }
- helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
- }
+ if (store_helper_size_aligned(env, addr, val, oi, retaddr, op)) {
return;
}
- haddr = (void *)((uintptr_t)addr + entry->addend);
- store_memop(haddr, val, op);
+ mmu_idx = get_mmuidx(oi);
+
+ size = memop_size(op);
+ index = tlb_index(env, mmu_idx, addr);
+ entry = tlb_entry(env, mmu_idx, addr);
+ tlb_addr = tlb_addr_write(entry);
+
+ /* Handle slow unaligned access (it spans two pages or IO). */
+ /*
+ * Ensure the second page is in the TLB. Note that the first page
+ * is already guaranteed to be filled, and that the second page
+ * cannot evict the first.
+ */
+ page2 = (addr + size) & TARGET_PAGE_MASK;
+ size2 = (addr + size) & ~TARGET_PAGE_MASK;
+ index2 = tlb_index(env, mmu_idx, page2);
+ entry2 = tlb_entry(env, mmu_idx, page2);
+ tlb_addr2 = tlb_addr_write(entry2);
+ if (!tlb_hit_page(tlb_addr2, page2)) {
+ if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+ tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+ mmu_idx, retaddr);
+ index2 = tlb_index(env, mmu_idx, page2);
+ entry2 = tlb_entry(env, mmu_idx, page2);
+ }
+ tlb_addr2 = tlb_addr_write(entry2);
+ }
+
+ /*
+ * Handle watchpoints. Since this may trap, all checks
+ * must happen before any store.
+ */
+ if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+ env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+ BP_MEM_WRITE, retaddr);
+ }
+ if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), page2, size2,
+ env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+ BP_MEM_WRITE, retaddr);
+ }
+
+ /*
+ * XXX: not efficient, but simple.
+ * This loop must go in the forward direction to avoid issues
+ * with self-modifying code in Windows 64-bit.
+ */
+ for (i = 0; i < size; ++i) {
+ uint8_t val8;
+ if (memop_big_endian(op)) {
+ /* Big-endian extract. */
+ val8 = val >> (((size - 1) * 8) - (i * 8));
+ } else {
+ /* Little-endian extract. */
+ val8 = val >> (i * 8);
+ }
+ helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+ }
}
void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
TCGMemOpIdx oi, uintptr_t retaddr)
{
- store_helper(env, addr, val, oi, retaddr, MO_UB);
+ /* Byte store is always size-aligned. */
+ store_helper_size_aligned(env, addr, val, oi, retaddr, MO_UB);
}
void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
--
2.28.0.rc0.142.g3c755180ce-goog