qemu-arm
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] tcg: Special case split barriers before/after load


From: Philippe Mathieu-Daudé
Subject: Re: [PATCH] tcg: Special case split barriers before/after load
Date: Mon, 30 May 2022 17:10:47 +0200
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Thunderbird/91.9.1

Hi Richard,

On 1/5/22 01:45, Richard Henderson wrote:
When st:ld is not required by the guest but ld:st is, we can
put ld:ld+ld:st barriers after loads, and then st:st barriers
before stores to enforce all required barriers.

The st:st barrier is often special cased by hosts, and that
is expected to be more efficient than a full barrier.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

Redha, I expect this to produce exactly the same barriers as you
did with your 'fix guest memory ordering enforcement' patch.

While this compiles, it does not fix the failures that I see
occasionally with our private gitlab runner.  The standalone
version of this failure is

   export QTEST_QEMU_BINARY=./qemu-system-i386
   for i in `seq 1 100`; do
     ./tests/qtest/ahci-test > /dev/null &
   done
   wait

About 10 to 15% of the runs will fail with

ERROR:../src/tests/qtest/ahci-test.c:92:verify_state: assertion failed (ahci_fingerprint == ahci->fingerprint): (0xe0000000 == 0x29228086)

Note that this test never seems to fail unless the system is under
load, thus starting 100 tests on my 80 core neoverse-n1 system.


r~


---
  tcg/tcg-op.c | 55 +++++++++++++++++++++++++++++++++++++++++++++-------
  1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 5d48537927..4c568a2592 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2834,9 +2834,6 @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
static void tcg_gen_req_mo(TCGBar type)
  {
-#ifdef TCG_GUEST_DEFAULT_MO
-    type &= TCG_GUEST_DEFAULT_MO;
-#endif
      type &= ~TCG_TARGET_DEFAULT_MO;
      if (type) {
          tcg_gen_mb(type | TCG_BAR_SC);
@@ -2868,12 +2865,49 @@ static void plugin_gen_mem_callbacks(TCGv vaddr, MemOpIdx oi,
  #endif
  }
+typedef enum {
+    BAR_LD_BEFORE,
+    BAR_LD_AFTER,
+    BAR_ST_BEFORE,
+} ChooseBarrier;
+
+static TCGBar choose_barrier(ChooseBarrier which)
+{
+#ifdef TCG_GUEST_DEFAULT_MO
+    const TCGBar guest_mo = TCG_GUEST_DEFAULT_MO;
+#else
+    const TCGBar guest_mo = TCG_MO_ALL;
+#endif
+    TCGBar ret[3];
+
+    if (guest_mo == 0) {
+        return 0;
+    }

This part ...:

+    /*
+     * Special case for i386 and s390x.  Because store-load is not
+     * required by the guest, we can split the barriers such that we
+     * wind up with a store-store barrier, which is expected to be
+     * quicker on some hosts.
+     */
+    if (guest_mo == (TCG_MO_ALL & ~TCG_MO_ST_LD)) {
+        ret[BAR_LD_BEFORE] = 0;
+        ret[BAR_LD_AFTER]  = TCG_MO_LD_LD | TCG_MO_LD_ST;
+        ret[BAR_ST_BEFORE] = TCG_MO_ST_ST;
+    } else {

... could deserve another patch.

+        ret[BAR_LD_BEFORE] = (TCG_MO_LD_LD | TCG_MO_ST_LD) & guest_mo;
+        ret[BAR_ST_BEFORE] = (TCG_MO_LD_ST | TCG_MO_ST_ST) & guest_mo;
+        ret[BAR_LD_AFTER]  = 0;
+    }
+    return ret[which];
+}
+
  void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
  {
      MemOp orig_memop;
      MemOpIdx oi;
- tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(choose_barrier(BAR_LD_BEFORE));
+
      memop = tcg_canonicalize_memop(memop, 0, 0);
      oi = make_memop_idx(memop, idx);
@@ -2904,6 +2938,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
              g_assert_not_reached();
          }
      }
+
+    tcg_gen_req_mo(choose_barrier(BAR_LD_AFTER));
  }
void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -2911,7 +2947,8 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
      TCGv_i32 swap = NULL;
      MemOpIdx oi;
- tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(choose_barrier(BAR_ST_BEFORE));
+
      memop = tcg_canonicalize_memop(memop, 0, 1);
      oi = make_memop_idx(memop, idx);
@@ -2959,7 +2996,8 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
          return;
      }
- tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(choose_barrier(BAR_LD_BEFORE));
+
      memop = tcg_canonicalize_memop(memop, 1, 0);
      oi = make_memop_idx(memop, idx);
@@ -2994,6 +3032,8 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
              g_assert_not_reached();
          }
      }
+
+    tcg_gen_req_mo(choose_barrier(BAR_LD_AFTER));
  }
void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -3006,7 +3046,8 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
          return;
      }
- tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(choose_barrier(BAR_ST_BEFORE));
+
      memop = tcg_canonicalize_memop(memop, 1, 1);
      oi = make_memop_idx(memop, idx);

Redha, could you test this patch?

Regards,

Phil.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]