From ffce0cb65c6eea926b0efbdb6b0a1a665ef4d317 Mon Sep 17 00:00:00 2001 From: Evgeny Nizhibitsky Date: Sun, 31 Mar 2024 12:23:32 +0100 Subject: [PATCH] wc: speed-up by simplifying AVX code * src/wc_avx2.c (_mm256_sad_epu8): Change from _mm256_sub_epi8() + _mm256_sad_epu8() to _mm256_movemask_epi8() + __builtin_popcount(). Also adjust the I/O size from 16 to 256 KiB. --- src/wc_avx2.c | 56 +++++++++++---------------------------------------- 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/src/wc_avx2.c b/src/wc_avx2.c index cc0454a46..4afc28a8e 100644 --- a/src/wc_avx2.c +++ b/src/wc_avx2.c @@ -22,10 +22,9 @@ #include -/* This must be below 16 KB (16384) or else the accumulators can - theoretically overflow, producing wrong result. This is 2*32 bytes below, - so there is no single bytes in the optimal case. */ -#define BUFSIZE (16320) +/* 256 KB buffer delivers 10-15% speedup over the old 16 KB one on 7800X3D. + The speedup beyond this buffer size was negligible. */ +#define BUFSIZE (256 * 1024) /* Read FD and return a summary. */ extern struct wc_lines @@ -34,21 +33,11 @@ wc_lines_avx2 (int fd) intmax_t lines = 0; intmax_t bytes = 0; - __m256i - zeroes = _mm256_setzero_si256 (), - endlines = _mm256_set1_epi8 ('\n'); + __m256i endlines = _mm256_set1_epi8 ('\n'); while (true) { - /* Using two parallel accumulators gave a good performance increase. - Adding a third gave no additional benefit, at least on an - Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector - execution engines it would be a win. */ - __m256i - accumulator = _mm256_setzero_si256 (), - accumulator2 = _mm256_setzero_si256 (), - avx_buf[BUFSIZE / sizeof (__m256i)]; - + __m256i avx_buf[BUFSIZE / sizeof (__m256i)]; ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf); if (bytes_read <= 0) return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; @@ -56,37 +45,16 @@ wc_lines_avx2 (int fd) bytes += bytes_read; __m256i *datap = avx_buf; - while (bytes_read >= 64) + while (bytes_read >= 32) { - __m256i - to_match = _mm256_load_si256 (datap), - to_match2 = _mm256_load_si256 (datap + 1), - matches = _mm256_cmpeq_epi8 (to_match, endlines), - matches2 = _mm256_cmpeq_epi8 (to_match2, endlines); - - /* Compare will set each 8 bit integer in the register to 0xFF - on match. When we subtract it the 8 bit accumulators - will underflow, so this is equal to adding 1. */ - accumulator = _mm256_sub_epi8 (accumulator, matches); - accumulator2 = _mm256_sub_epi8 (accumulator2, matches2); - - datap += 2; - bytes_read -= 64; + __m256i to_match = _mm256_load_si256 (datap); + __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines); + int mask = _mm256_movemask_epi8 (matches); + lines += __builtin_popcount (mask); + datap += 1; + bytes_read -= 32; } - /* Horizontally add all 8 bit integers in the register. */ - accumulator = _mm256_sad_epu8 (accumulator, zeroes); - lines += _mm256_extract_epi16 (accumulator, 0) - + _mm256_extract_epi16 (accumulator, 4) - + _mm256_extract_epi16 (accumulator, 8) - + _mm256_extract_epi16 (accumulator, 12); - - accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes); - lines += _mm256_extract_epi16 (accumulator2, 0) - + _mm256_extract_epi16 (accumulator2, 4) - + _mm256_extract_epi16 (accumulator2, 8) - + _mm256_extract_epi16 (accumulator2, 12); - /* Finish up any left over bytes */ char *end = (char *) datap + bytes_read; for (char *p = (char *) datap; p < end; p++) -- 2.44.0