From b0cb0ed373f02ede2046fa79a64e276e5bc8d269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?=
Date: Thu, 17 May 2018 21:41:46 -0700 Subject: [PATCH] wc: optimize processing of ASCII in multi byte locales MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ===== Benchmark setup (on GNU/Linux) ==== $ yes áááááááááááááááááááá | head -n100000 > mbc.txt $ yes 12345678901234567890 | head -n100000 > num.txt ===== Before ==== $ time src/wc -Lm < mbc.txt real 0m0.186s $ time src/wc -m < mbc.txt real 0m0.186s $ time src/wc -Lm < num.txt real 0m0.055s $ time src/wc -m < num.txt real 0m0.056s ==== After ==== $ time src/wc -Lm < mbc.txt real 0m0.196s $ time src/wc -m < mbc.txt real 0m0.173s $ time src/wc -Lm < num.txt real 0m0.031s $ time src/wc -m < num.txt real 0m0.028s * src/wc.c (wc): Only call wide variant functions like iswprint() and wcwidth() for non is_basic() characters. I.E. non ISO C "basic character set" characters. This is especially significant on OSX where wcwidth() is very expensive (about 10x in tests). * NEWS: Mention the improvement. Suggested by Eric Fishcer. --- NEWS | 3 +++ src/wc.c | 30 +++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 4c63b6d..f447d6d 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,9 @@ GNU coreutils NEWS -*- outline -*- version of XFS. stat -f --format=%T now reports the file system type, and tail -f uses inotify. + wc avoids redundant processing of ASCII text in multibyte locales, + which is especially significant on macOS. + * Noteworthy changes in release 8.29 (2017-12-27) [stable] diff --git a/src/wc.c b/src/wc.c index 0c72042..2034c42 100644 --- a/src/wc.c +++ b/src/wc.c @@ -379,6 +379,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { wchar_t wide_char; size_t n; + bool wide = true; if (!in_shift && is_basic (*p)) { @@ -386,6 +387,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) mbrtowc(). */ n = 1; wide_char = *p; + wide = false; } else { @@ -419,9 +421,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) n = 1; } } - p += n; - bytes_read -= n; - chars++; + switch (wide_char) { case '\n': @@ -445,17 +445,33 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) in_word = false; break; default: - if (iswprint (wide_char)) + if (wide && iswprint (wide_char)) { - int width = wcwidth (wide_char); - if (width > 0) - linepos += width; + /* wcwidth can be expensive on OSX for example, + so avoid if uneeded. */ + if (print_linelength) + { + int width = wcwidth (wide_char); + if (width > 0) + linepos += width; + } if (iswspace (wide_char)) goto mb_word_separator; in_word = true; } + else if (!wide && isprint (to_uchar (*p))) + { + linepos++; + if (isspace (to_uchar (*p))) + goto mb_word_separator; + in_word = true; + } break; } + + p += n; + bytes_read -= n; + chars++; } while (bytes_read > 0); -- 2.9.3