[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PULL 30/35] util/bufferiszero: Reorganize for early test for acceleration
From: |
Richard Henderson |
Subject: |
[PULL 30/35] util/bufferiszero: Reorganize for early test for acceleration |
Date: |
Mon, 8 Apr 2024 07:49:24 -1000 |
From: Alexander Monakov <amonakov@ispras.ru>
Test for length >= 256 inline, where it is often a constant.
Before calling into the accelerated routine, sample three bytes
from the buffer, which handles most non-zero buffers.
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
Message-Id: <20240206204809.9859-3-amonakov@ispras.ru>
[rth: Use __builtin_constant_p; move the indirect call out of line.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/qemu/cutils.h | 32 ++++++++++++++++-
util/bufferiszero.c | 84 +++++++++++++++++--------------------------
2 files changed, 63 insertions(+), 53 deletions(-)
diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 92c927a6a3..741dade7cf 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -187,9 +187,39 @@ char *freq_to_str(uint64_t freq_hz);
/* used to print char* safely */
#define STR_OR_NULL(str) ((str) ? (str) : "null")
-bool buffer_is_zero(const void *buf, size_t len);
+/*
+ * Check if a buffer is all zeroes.
+ */
+
+bool buffer_is_zero_ool(const void *vbuf, size_t len);
+bool buffer_is_zero_ge256(const void *vbuf, size_t len);
bool test_buffer_is_zero_next_accel(void);
+static inline bool buffer_is_zero_sample3(const char *buf, size_t len)
+{
+ /*
+ * For any reasonably sized buffer, these three samples come from
+ * three different cachelines. In qemu-img usage, we find that
+ * each byte eliminates more than half of all buffer testing.
+ * It is therefore critical to performance that the byte tests
+ * short-circuit, so that we do not pull in additional cache lines.
+ * Do not "optimize" this to !(a | b | c).
+ */
+ return !buf[0] && !buf[len - 1] && !buf[len / 2];
+}
+
+#ifdef __OPTIMIZE__
+static inline bool buffer_is_zero(const void *buf, size_t len)
+{
+ return (__builtin_constant_p(len) && len >= 256
+ ? buffer_is_zero_sample3(buf, len) &&
+ buffer_is_zero_ge256(buf, len)
+ : buffer_is_zero_ool(buf, len));
+}
+#else
+#define buffer_is_zero buffer_is_zero_ool
+#endif
+
/*
* Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
* Input is limited to 14-bit numbers
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 641d5f9b9e..972f394cbd 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -26,8 +26,9 @@
#include "qemu/bswap.h"
#include "host/cpuinfo.h"
-static bool
-buffer_zero_int(const void *buf, size_t len)
+static bool (*buffer_is_zero_accel)(const void *, size_t);
+
+static bool buffer_is_zero_integer(const void *buf, size_t len)
{
if (unlikely(len < 8)) {
/* For a very small buffer, simply accumulate all the bytes. */
@@ -128,60 +129,38 @@ buffer_zero_avx2(const void *buf, size_t len)
}
#endif /* CONFIG_AVX2_OPT */
-/*
- * Make sure that these variables are appropriately initialized when
- * SSE2 is enabled on the compiler command-line, but the compiler is
- * too old to support CONFIG_AVX2_OPT.
- */
-#if defined(CONFIG_AVX2_OPT)
-# define INIT_USED 0
-# define INIT_LENGTH 0
-# define INIT_ACCEL buffer_zero_int
-#else
-# ifndef __SSE2__
-# error "ISA selection confusion"
-# endif
-# define INIT_USED CPUINFO_SSE2
-# define INIT_LENGTH 64
-# define INIT_ACCEL buffer_zero_sse2
-#endif
-
-static unsigned used_accel = INIT_USED;
-static unsigned length_to_accel = INIT_LENGTH;
-static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
-
static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
/* Array is sorted in order of algorithm preference. */
static const struct {
unsigned bit;
- unsigned len;
bool (*fn)(const void *, size_t);
} all[] = {
#ifdef CONFIG_AVX2_OPT
- { CPUINFO_AVX2, 128, buffer_zero_avx2 },
+ { CPUINFO_AVX2, buffer_zero_avx2 },
#endif
- { CPUINFO_SSE2, 64, buffer_zero_sse2 },
- { CPUINFO_ALWAYS, 0, buffer_zero_int },
+ { CPUINFO_SSE2, buffer_zero_sse2 },
+ { CPUINFO_ALWAYS, buffer_is_zero_integer },
};
for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
if (info & all[i].bit) {
- length_to_accel = all[i].len;
- buffer_accel = all[i].fn;
+ buffer_is_zero_accel = all[i].fn;
return all[i].bit;
}
}
return 0;
}
-#if defined(CONFIG_AVX2_OPT)
+static unsigned used_accel;
+
static void __attribute__((constructor)) init_accel(void)
{
used_accel = select_accel_cpuinfo(cpuinfo_init());
}
-#endif /* CONFIG_AVX2_OPT */
+
+#define INIT_ACCEL NULL
bool test_buffer_is_zero_next_accel(void)
{
@@ -194,36 +173,37 @@ bool test_buffer_is_zero_next_accel(void)
used_accel |= used;
return used;
}
-
-static bool select_accel_fn(const void *buf, size_t len)
-{
- if (likely(len >= length_to_accel)) {
- return buffer_accel(buf, len);
- }
- return buffer_zero_int(buf, len);
-}
-
#else
-#define select_accel_fn buffer_zero_int
bool test_buffer_is_zero_next_accel(void)
{
return false;
}
+
+#define INIT_ACCEL buffer_is_zero_integer
#endif
-/*
- * Checks if a buffer is all zeroes
- */
-bool buffer_is_zero(const void *buf, size_t len)
+static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;
+
+bool buffer_is_zero_ool(const void *buf, size_t len)
{
if (unlikely(len == 0)) {
return true;
}
+ if (!buffer_is_zero_sample3(buf, len)) {
+ return false;
+ }
+ /* All bytes are covered for any len <= 3. */
+ if (unlikely(len <= 3)) {
+ return true;
+ }
- /* Fetch the beginning of the buffer while we select the accelerator. */
- __builtin_prefetch(buf);
-
- /* Use an optimized zero check if possible. Note that this also
- includes a check for an unrolled loop over 64-bit integers. */
- return select_accel_fn(buf, len);
+ if (likely(len >= 256)) {
+ return buffer_is_zero_accel(buf, len);
+ }
+ return buffer_is_zero_integer(buf, len);
+}
+
+bool buffer_is_zero_ge256(const void *buf, size_t len)
+{
+ return buffer_is_zero_accel(buf, len);
}
--
2.34.1
- [PULL 19/35] tcg: Add TCGContext.emit_before_op, (continued)
- [PULL 19/35] tcg: Add TCGContext.emit_before_op, Richard Henderson, 2024/04/08
- [PULL 20/35] accel/tcg: Add insn_start to DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 21/35] target/arm: Use insn_start from DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 23/35] target/i386: Preserve DisasContextBase.insn_start across rewind, Richard Henderson, 2024/04/08
- [PULL 24/35] target/microblaze: Use insn_start from DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 25/35] target/riscv: Use insn_start from DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 26/35] target/s390x: Use insn_start from DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 27/35] accel/tcg: Improve can_do_io management, Richard Henderson, 2024/04/08
- [PULL 29/35] util/bufferiszero: Remove AVX512 variant, Richard Henderson, 2024/04/08
- [PULL 28/35] util/bufferiszero: Remove SSE4.1 variant, Richard Henderson, 2024/04/08
- [PULL 30/35] util/bufferiszero: Reorganize for early test for acceleration,
Richard Henderson <=
- [PULL 22/35] target/hppa: Use insn_start from DisasContextBase, Richard Henderson, 2024/04/08
- [PULL 33/35] util/bufferiszero: Improve scalar variant, Richard Henderson, 2024/04/08
- [PULL 31/35] util/bufferiszero: Remove useless prefetches, Richard Henderson, 2024/04/08
- [PULL 34/35] util/bufferiszero: Introduce biz_accel_fn typedef, Richard Henderson, 2024/04/08
- [PULL 35/35] util/bufferiszero: Simplify test_buffer_is_zero_next_accel, Richard Henderson, 2024/04/08
- [PULL 32/35] util/bufferiszero: Optimize SSE2 and AVX2 variants, Richard Henderson, 2024/04/08
- Re: [PULL 00/35] misc patch queue, Peter Maydell, 2024/04/09