>From 07057dbcd61ab4cbda4bef110bb30c70f4d7f22f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristoffer=20Br=C3=A5nemyr?= Date: Sat, 20 Feb 2021 12:27:17 +0100 Subject: [PATCH 1/2] wc: use avx2 optimization when counting only lines Use cpuid to detect CPU support for avx2 instructions. Performance was seen to improve by 5x for a file with only newlines, while the performance for a file with no such characters is unchanged. * configure.ac [USE_AVX2_WC_LINECOUNT]: A new conditional, set when __get_cpuid_count() and avx2 compiler intrinsics are supported. * src/wc.c (avx2_supported): A new function using __get_cpuid_count() to determine if avx2 instructions are supported. (wc_lines): A new function refactored from wc(), which implements the standard line counting logic, and provides the fallback implementation for when avx2 is not supported. * src/wc_avx2.c: A new module to implement using avx2 intrinsics. * src/local.mk: Reference the new module. Note we build as a separate lib so that it can be portably built with separate -mavx2 etc. flags. --- configure.ac | 49 ++++++++++++++++ src/local.mk | 9 +++ src/wc.c | 157 ++++++++++++++++++++++++++++++++++++-------------- src/wc_avx2.c | 122 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 294 insertions(+), 43 deletions(-) create mode 100644 src/wc_avx2.c diff --git a/configure.ac b/configure.ac index 02291a4ae..f0fbbd9b7 100644 --- a/configure.ac +++ b/configure.ac @@ -575,6 +575,55 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32], test "x$pclmul_intrinsic_exists" = "xyes"]) CFLAGS=$ac_save_CFLAGS +AC_MSG_CHECKING([if __get_cpuid_count exists]) +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + #include + + int main(void) + { + unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; + __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + return 1; + } + ]]) + ],[ + AC_MSG_RESULT([yes]) + get_cpuid_count_exists=yes + ],[ + AC_MSG_RESULT([no]) + ]) + +CFLAGS="-mavx2 $CFLAGS" +AC_MSG_CHECKING([if avx2 intrinstics exists]) +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + #include + + int main(void) + { + __m256i a, b; + a = _mm256_sad_epu8(a, b); + return 1; + } + ]]) + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX2_INTRINSIC], [1], [avx2 intrinsics exists]) + avx2_intrinsic_exists=yes + ],[ + AC_MSG_RESULT([no]) + ]) +if test "x$get_cpuid_count_exists" = "xyes" && + test "x$avx2_intrinsic_exists" = "xyes"; then + AC_DEFINE([USE_AVX2_WC_LINECOUNT], [1], [Counting lines with AVX2 enabled]) +fi +AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT], + [test "x$get_cpuid_count_exists" = "xyes" && + test "x$avx2_intrinsic_exists" = "xyes"]) + +CFLAGS=$ac_save_CFLAGS + ############################################################################ dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script. diff --git a/src/local.mk b/src/local.mk index 8c8479a53..c6555dafb 100644 --- a/src/local.mk +++ b/src/local.mk @@ -427,6 +427,15 @@ src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS) src_expand_SOURCES = src/expand.c src/expand-common.c src_unexpand_SOURCES = src/unexpand.c src/expand-common.c +src_wc_SOURCES = src/wc.c +if USE_AVX2_WC_LINECOUNT +noinst_LIBRARIES += src/libwc_avx2.a +src_libwc_avx2_a_SOURCES = src/wc_avx2.c +wc_avx2_ldadd = src/libwc_avx2.a +src_wc_LDADD += $(wc_avx2_ldadd) +src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS) +endif + # Ensure we don't link against libcoreutils.a as that lib is # not compiled with -fPIC which causes issues on 64 bit at least src_libstdbuf_so_LDADD = $(LIBINTL) diff --git a/src/wc.c b/src/wc.c index d635e5214..35a865719 100644 --- a/src/wc.c +++ b/src/wc.c @@ -37,6 +37,9 @@ #include "safe-read.h" #include "stat-size.h" #include "xbinary-io.h" +#ifdef USE_AVX2_WC_LINECOUNT +# include +#endif #if !defined iswspace && !HAVE_ISWSPACE # define iswspace(wc) \ @@ -53,6 +56,20 @@ /* Size of atomic reads. */ #define BUFFER_SIZE (16 * 1024) +static bool +wc_lines (char const *file, int fd, uintmax_t *lines_out, + uintmax_t *bytes_out); +#ifdef USE_AVX2_WC_LINECOUNT +/* From wc_avx2.c */ +extern bool +wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, + uintmax_t *bytes_out); +#endif +static bool +(*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out, + uintmax_t *bytes_out) = wc_lines; + + /* Cumulative number of lines, words, chars and bytes in all files so far. max_line_length is the maximum over all files processed so far. */ static uintmax_t total_lines; @@ -108,6 +125,33 @@ static struct option const longopts[] = {NULL, 0, NULL, 0} }; +#ifdef USE_AVX2_WC_LINECOUNT +static bool +avx2_supported (void) +{ + unsigned int eax = 0; + unsigned int ebx = 0; + unsigned int ecx = 0; + unsigned int edx = 0; + + if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx)) + return false; + + if (! (ecx & bit_OSXSAVE)) + return false; + + eax = ebx = ecx = edx = 0; + + if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)) + return false; + + if (! (ebx & bit_AVX2)) + return false; + + return true; +} +#endif + void usage (int status) { @@ -208,6 +252,70 @@ write_counts (uintmax_t lines, putchar ('\n'); } +static bool +wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) +{ + size_t bytes_read; + uintmax_t lines, bytes; + char buf[BUFFER_SIZE + 1]; + bool long_lines = false; + + if (!lines_out || !bytes_out) + { + return false; + } + + lines = bytes = 0; + + while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) + { + + if (bytes_read == SAFE_READ_ERROR) + { + error (0, errno, "%s", quotef (file)); + return false; + } + + bytes += bytes_read; + + char *p = buf; + char *end = buf + bytes_read; + uintmax_t plines = lines; + + if (! long_lines) + { + /* Avoid function call overhead for shorter lines. */ + while (p != end) + lines += *p++ == '\n'; + } + else + { + /* memchr is more efficient with longer lines. */ + while ((p = memchr (p, '\n', end - p))) + { + ++p; + ++lines; + } + } + + /* If the average line length in the block is >= 15, then use + memchr for the next block, where system specific optimizations + may outweigh function call overhead. + FIXME: This line length was determined in 2015, on both + x86_64 and ppc64, but it's worth re-evaluating in future with + newer compilers, CPUs, or memchr() implementations etc. */ + if (lines - plines <= bytes_read / 15) + long_lines = true; + else + long_lines = false; + } + + *bytes_out = bytes; + *lines_out = lines; + + return true; +} + /* Count words. FILE_X is the name of the file (or NULL for standard input) that is open on descriptor FD. *FSTATUS is its status. CURRENT_POS is the current file offset if known, negative if unknown. @@ -312,49 +420,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { /* Use a separate loop when counting only lines or lines and bytes -- but not chars or words. */ - bool long_lines = false; - while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) - { - if (bytes_read == SAFE_READ_ERROR) - { - error (0, errno, "%s", quotef (file)); - ok = false; - break; - } - - bytes += bytes_read; - - char *p = buf; - char *end = p + bytes_read; - uintmax_t plines = lines; - - if (! long_lines) - { - /* Avoid function call overhead for shorter lines. */ - while (p != end) - lines += *p++ == '\n'; - } - else - { - /* memchr is more efficient with longer lines. */ - while ((p = memchr (p, '\n', end - p))) - { - ++p; - ++lines; - } - } - - /* If the average line length in the block is >= 15, then use - memchr for the next block, where system specific optimizations - may outweigh function call overhead. - FIXME: This line length was determined in 2015, on both - x86_64 and ppc64, but it's worth re-evaluating in future with - newer compilers, CPUs, or memchr() implementations etc. */ - if (lines - plines <= bytes_read / 15) - long_lines = true; - else - long_lines = false; - } + ok = wc_lines_p (file, fd, &lines, &bytes); } #if MB_LEN_MAX > 1 # define SUPPORT_OLD_MBRTOWC 1 @@ -706,6 +772,11 @@ main (int argc, char **argv) print_linelength = false; total_lines = total_words = total_chars = total_bytes = max_line_length = 0; +#ifdef USE_AVX2_WC_LINECOUNT + if (avx2_supported ()) + wc_lines_p = wc_lines_avx2; +#endif + while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1) switch (optc) { diff --git a/src/wc_avx2.c b/src/wc_avx2.c new file mode 100644 index 000000000..634c1bbb0 --- /dev/null +++ b/src/wc_avx2.c @@ -0,0 +1,122 @@ +/* wc_avx - Count the number of newlines with avx2 instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "system.h" +#include "error.h" +#include "safe-read.h" + +#include + +/* This must be below 16 KB (16384) or else the accumulators can + theoretically overflow, producing wrong result. This is 2*32 bytes below, + so there is no single bytes in the optimal case. */ +#define BUFSIZE (16320) + +extern bool +wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, + uintmax_t *bytes_out); + +extern bool +wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, + uintmax_t *bytes_out) +{ + __m256i accumulator; + __m256i accumulator2; + __m256i zeroes; + __m256i endlines; + __m256i avx_buf[BUFSIZE / sizeof (__m256i)]; + __m256i *datap; + uintmax_t lines = 0; + uintmax_t bytes = 0; + size_t bytes_read = 0; + + + if (!lines_out || !bytes_out) + return false; + + /* Using two parallel accumulators gave a good performance increase. + Adding a third gave no additional benefit, at least on an + Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector + execution engines it would be a win. */ + accumulator = _mm256_setzero_si256 (); + accumulator2 = _mm256_setzero_si256 (); + zeroes = _mm256_setzero_si256 (); + endlines = _mm256_set1_epi8 ('\n'); + + while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0) + { + __m256i to_match; + __m256i to_match2; + __m256i matches; + __m256i matches2; + + if (bytes_read == SAFE_READ_ERROR) + { + error (0, errno, "%s", quotef (file)); + return false; + } + + bytes += bytes_read; + + datap = avx_buf; + char *end = ((char *)avx_buf) + bytes_read; + + while (bytes_read >= 64) + { + to_match = _mm256_load_si256 (datap); + to_match2 = _mm256_load_si256 (datap + 1); + + matches = _mm256_cmpeq_epi8 (to_match, endlines); + matches2 = _mm256_cmpeq_epi8 (to_match2, endlines); + /* Compare will set each 8 bit integer in the register to 0xFF + on match. When we subtract it the 8 bit accumulators + will underflow, so this is equal to adding 1. */ + accumulator = _mm256_sub_epi8 (accumulator, matches); + accumulator2 = _mm256_sub_epi8 (accumulator2, matches2); + + datap += 2; + bytes_read -= 64; + } + + /* Horizontally add all 8 bit integers in the register, + and then reset it */ + accumulator = _mm256_sad_epu8 (accumulator, zeroes); + lines += _mm256_extract_epi16 (accumulator, 0) + + _mm256_extract_epi16 (accumulator, 4) + + _mm256_extract_epi16 (accumulator, 8) + + _mm256_extract_epi16 (accumulator, 12); + accumulator = _mm256_setzero_si256 (); + + accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes); + lines += _mm256_extract_epi16 (accumulator2, 0) + + _mm256_extract_epi16 (accumulator2, 4) + + _mm256_extract_epi16 (accumulator2, 8) + + _mm256_extract_epi16 (accumulator2, 12); + accumulator2 = _mm256_setzero_si256 (); + + /* Finish up any left over bytes */ + char *p = (char *)datap; + while (p != end) + lines += *p++ == '\n'; + } + + *lines_out = lines; + *bytes_out = bytes; + + return true; +} -- 2.26.2 >From 498a707873a33a99086fb38f0bfd821dd9795ed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sat, 1 May 2021 20:02:02 +0100 Subject: [PATCH 2/2] wc: add --debug to diagnose which implementation used * src/wc.c: (main): Handle the new --debug option. Only call avx2_supported if needed. (avx2_supported): Diagnose various failures and attempts. * NEWS: Mention the new wc improvement and --debug option. --- NEWS | 4 ++++ src/wc.c | 67 ++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 090fbc728..beb34bba5 100644 --- a/NEWS +++ b/NEWS @@ -96,6 +96,10 @@ GNU coreutils NEWS -*- outline -*- timeout now supports sub-second timeouts on macOS. + wc is up to 5 times faster when counting only new line characters, + where avx2 instructions are supported. + A new --debug option will indicate if avx2 is being used. + * Noteworthy changes in release 8.32 (2020-03-05) [stable] diff --git a/src/wc.c b/src/wc.c index 35a865719..bdb51928d 100644 --- a/src/wc.c +++ b/src/wc.c @@ -69,6 +69,7 @@ static bool (*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) = wc_lines; +static bool debug; /* Cumulative number of lines, words, chars and bytes in all files so far. max_line_length is the maximum over all files processed so far. */ @@ -109,7 +110,8 @@ struct fstatus non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - FILES0_FROM_OPTION = CHAR_MAX + 1 + DEBUG_PROGRAM_OPTION = CHAR_MAX + 1, + FILES0_FROM_OPTION, }; static struct option const longopts[] = @@ -118,6 +120,7 @@ static struct option const longopts[] = {"chars", no_argument, NULL, 'm'}, {"lines", no_argument, NULL, 'l'}, {"words", no_argument, NULL, 'w'}, + {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION}, {"files0-from", required_argument, NULL, FILES0_FROM_OPTION}, {"max-line-length", no_argument, NULL, 'L'}, {GETOPT_HELP_OPTION_DECL}, @@ -133,22 +136,48 @@ avx2_supported (void) unsigned int ebx = 0; unsigned int ecx = 0; unsigned int edx = 0; + bool getcpuid_ok = false; + bool avx_enabled = false; - if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx)) - return false; - - if (! (ecx & bit_OSXSAVE)) - return false; + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)) + { + getcpuid_ok = true; + if (ecx & bit_OSXSAVE) + avx_enabled = true; /* Support is not disabled. */ + } - eax = ebx = ecx = edx = 0; - if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)) - return false; + if (avx_enabled) + { + eax = ebx = ecx = edx = 0; + if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)) + getcpuid_ok = false; + else + { + if (! (ebx & bit_AVX2)) + avx_enabled = false; /* Hardware doesn't support it. */ + } + } - if (! (ebx & bit_AVX2)) - return false; - return true; + if (! getcpuid_ok) + { + if (debug) + error (0, 0, "%s", _("failed to get cpuid")); + return false; + } + else if (! avx_enabled) + { + if (debug) + error (0, 0, "%s", _("avx2 support not detected")); + return false; + } + else + { + if (debug) + error (0, 0, "%s", _("using avx2 hardware support")); + return true; + } } #endif @@ -418,6 +447,11 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) } else if (!count_chars && !count_complicated) { +#ifdef USE_AVX2_WC_LINECOUNT + if (avx2_supported ()) + wc_lines_p = wc_lines_avx2; +#endif + /* Use a separate loop when counting only lines or lines and bytes -- but not chars or words. */ ok = wc_lines_p (file, fd, &lines, &bytes); @@ -772,11 +806,6 @@ main (int argc, char **argv) print_linelength = false; total_lines = total_words = total_chars = total_bytes = max_line_length = 0; -#ifdef USE_AVX2_WC_LINECOUNT - if (avx2_supported ()) - wc_lines_p = wc_lines_avx2; -#endif - while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1) switch (optc) { @@ -800,6 +829,10 @@ main (int argc, char **argv) print_linelength = true; break; + case DEBUG_PROGRAM_OPTION: + debug = true; + break; + case FILES0_FROM_OPTION: files_from = optarg; break; -- 2.26.2