From 907824115bc076e0c60b0d9a955b1d1bc830c59b Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Tue, 5 Jul 2016 07:03:30 -0700 Subject: [PATCH] . --- bootstrap.conf | 1 + lib/.gitignore | 6 ++ m4/.gitignore | 3 + po/POTFILES.in | 1 + testsuite/init.cfg | 8 +-- testsuite/test-mbrtowc.c | 171 +++++++++++++++++++++++------------------------ 6 files changed, 100 insertions(+), 90 deletions(-) diff --git a/bootstrap.conf b/bootstrap.conf index 7744230..f53fe2f 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -25,6 +25,7 @@ acl alloca btowc c-ctype +closeout extensions fwriting getdelim diff --git a/lib/.gitignore b/lib/.gitignore index 303403b..7e85de9 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -210,3 +210,9 @@ xstrndup.h /progname.h /ignore-value.h /alignof.h +/close-stream.c +/close-stream.h +/closeout.c +/closeout.h +/fpending.c +/fpending.h diff --git a/m4/.gitignore b/m4/.gitignore index ac2c250..7ac433e 100644 --- a/m4/.gitignore +++ b/m4/.gitignore @@ -181,3 +181,6 @@ xstrndup.m4 /warnings.m4 /obstack.m4 /stdalign.m4 +/close-stream.m4 +/closeout.m4 +/fpending.m4 diff --git a/po/POTFILES.in b/po/POTFILES.in index 637bb7a..db55651 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -1,3 +1,4 @@ +lib/closeout.c lib/copy-acl.c lib/error.c lib/getopt.c diff --git a/testsuite/init.cfg b/testsuite/init.cfg index 64e5df2..9eacda4 100644 --- a/testsuite/init.cfg +++ b/testsuite/init.cfg @@ -72,25 +72,25 @@ require_ja_shiftjis_locale_() } # Ensure the implementation of mbrtowc can detect invalid -# multibyte shiftjis sequences. Otherwise, skip the test to avoid +# multibyte shiftjis sequences. Otherwise, skip the test, to avoid # false-alarms. # "$1" should be the name of the SHIFT-JIS locale # (as set by 'require_ja_shiftjis_locale_' above) require_valid_ja_shiftjis_locale_() { path_prepend_ . 
- n=$(printf '\203:' | LC_ALL="$1" test-mbrtowc) + local n=$(printf '\203:' | LC_ALL="$1" test-mbrtowc) test "x$n" = "x-2,-1" || skip_ "locale '$1' is buggy" } # Ensure the implementation of mbrtowc can detect invalid -# multibyte eucJP sequences. Otherwise, skip the test to avoid +# multibyte eucJP sequences. Otherwise, skip the test, to avoid # false-alarms. # "$1" should be the name of the ja_JP.eucJP locale # (as set in $LOCALE_JA by m4/locale-ja.m4) require_valid_ja_eucjp_locale_() { path_prepend_ . - n=$(printf '\262C' | LC_ALL="$1" test-mbrtowc) + local n=$(printf '\262C' | LC_ALL="$1" test-mbrtowc) test "x$n" = "x-2,-1" || skip_ "locale '$1' is buggy" } diff --git a/testsuite/test-mbrtowc.c b/testsuite/test-mbrtowc.c index 1a7a7bb..04c20fc 100644 --- a/testsuite/test-mbrtowc.c +++ b/testsuite/test-mbrtowc.c @@ -16,121 +16,123 @@ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ -/* -Test the operating-system's native mbrtowc(3) function, -by feeding it multibyte seqeunces one byte at a time, -and reporting the result. +/* Test the operating-system's native mbrtowc(3) function, + by feeding it multibyte sequences one byte at a time, + and reporting the result. -The program prints the following values after each mbrtowc invocation, -separated by commas: - -2 the octet is contributes to a valid yet incomplete multibyte seqeunce - in the current locale. + The program prints the following values after each mbrtowc invocation, + separated by commas: - -1 the octet causes an encoding error. + -2 the octet contributes to a valid yet incomplete multibyte sequence + in the current locale. - 0 the octet represents a NUL byte + -1 the octet causes an encoding error. - 1 the octet is a valid single-byte character, OR - completes a valid multibyte sequence. -Because the program invokes mbrtowc(3) byte-by-byte, the reported -result should never be larger than 1. 
+ 1 the octet is a valid single-byte character, OR + completes a valid multibyte sequence. -Example of typical output with UTF-8 encoding ---------------------------------------------- + Because the program invokes mbrtowc(3) byte-by-byte, the reported + result should never be larger than 1. -The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as: - hex: 0xE2 0x88 0x91 - oct: 342 210 211 + Example of typical output with UTF-8 encoding + --------------------------------------------- -Decoding the valid sequence byte-by-byte gives: - $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc - -2,-2,1 + The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as: + hex: 0xE2 0x88 0x91 + oct: 342 210 211 -Value '\210' is not a valid leading byte in UTF-8, -thus the first byte gives -1, and the 'X' is treated -as a valid single-byte character: + Decoding the valid sequence byte-by-byte gives: + $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc + -2,-2,1 - $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc - -1,1 + '\210' is not a valid leading byte in UTF-8, + thus the first byte gives -1, and the 'X' is treated + as a valid single-byte character: -Value '\342' is valid yet incomplete multibyte sequence. -Passing it to mbrtowc results in value '-2'. -The following value 'X' gives an encoding error '-1' -(as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence): + $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc + -1,1 - $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc - -2,-1 + '\342' is a valid yet incomplete multibyte sequence. + Passing it to mbrtowc results in value '-2'. + The following value 'X' gives an encoding error '-1' + (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence): + $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc + -2,-1 -Detecting implementation bugs in mbrtowc ----------------------------------------- -UTF-8 implementation is correct on most operating systems. 
-Other multibyte locales might present more difficulties. -An example is the Japanese SHIFT-JIS locale under Mac OS X. -NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis' -under Ubuntu. 'ja_JP.sjis' was also found on some systems. + Detecting implementation bugs in mbrtowc + ---------------------------------------- -Using unicode character 'KATAKANA LETTER ZE' (U+30BC) - UTF-8: hex: 0xE3 0x82 0xBC - Shift-jis hex: 0x83 0x5B - oct: 203 133 + UTF-8 implementation is correct on most operating systems. + Other multibyte locales might present more difficulties. + An example is the Japanese SHIFT-JIS locale under Mac OS X. + NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis' + under Ubuntu. 'ja_JP.sjis' was also found on some systems. -The following is a valid multibyte sequence in SHIFT-JIS, -the first byte should result in '-2' (valid yet incomplete), -and the second byte shoudl result in '1' (a valid multibyte sequence -completed): + Using unicode character 'KATAKANA LETTER ZE' (U+30BC) + UTF-8: hex: 0xE3 0x82 0xBC + Shift-jis hex: 0x83 0x5B + oct: 203 133 - $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc - -2,1 + The following is a valid multibyte sequence in SHIFT-JIS, + the first byte should result in '-2' (valid yet incomplete), + and the second byte should result in '1' (a valid multibyte sequence + completed): -The follwing is an INVALID multibyte sequence in SHIFT-JIS -(The character ':' is not valid appear as a second octet). -Buggy implementations will accept this as a valid multibyte sequence: + $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 - # NOTE: this result indicates a buggy mbrtowc - $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc - -2,1 + The following is an INVALID multibyte sequence in SHIFT-JIS + (The byte ':' is not valid as a second octet). + Buggy implementations will accept this as a valid multibyte sequence: -A correct implementations should report '-1' for the second byte (i.e. 
-an encoding error): + # NOTE: this result indicates a buggy mbrtowc + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 - $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc - -2,-1 + A correct implementation should report '-1' for the second byte (i.e. + an encoding error): + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,-1 -Expected results with correct implementations ---------------------------------------------- -In GNU Sed some tests purposely use invalid multibyte sequences -to test sed's behaviour. A buggy implemetation of mbrtowc -will result in false-alarm failures. + Expected results with correct implementations + --------------------------------------------- -The following are expected results in correct implementations: -(locale names are from Mac OS X): + In GNU Sed some tests purposely use invalid multibyte sequences + to test sed's behaviour. A buggy implementation of mbrtowc + would result in false-alarm failures. - $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc - -2,1 - $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc - -2,-1 - $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc - -2,-1 + The following are expected results in correct implementations: + (locale names are from Mac OS X): + + $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,-1 + $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc + -2,-1 +*/ - */ #include #include #include #include #include +#include "closeout.h" +#include "error.h" #include "progname.h" /* stub replacement for non-standard err(3) */ static int -die (const char* msg) +die (const char *msg) { - fprintf (stderr, "%s: error: %s\n", program_name, msg); + error (0, 0, "error: %s", msg); /* error() itself prints the program name and the trailing newline */ exit (EXIT_FAILURE); } @@ -138,25 +140,23 @@ int main (int argc, char **argv) { int c; - char ch; - wchar_t wc; - int i; int first = 1; set_program_name (argv[0]); if (!setlocale (LC_ALL, "")) die ("failed to set locale"); - while ( 
(c=getchar ()) != EOF ) + while ((c = getchar ()) != EOF) { - ch = (unsigned char)c; - i = (int)mbrtowc (&wc, &ch, 1, NULL); + wchar_t wc; + char ch = (unsigned char) c; + int i = (int) mbrtowc (&wc, &ch, 1, NULL); if (!first) putchar (','); first = 0; - printf ("%d",i); + printf ("%d", i); } if (first) @@ -164,10 +164,9 @@ main (int argc, char **argv) putchar ('\n'); - if (ferror(stdin)) + if (ferror (stdin)) die ("read error"); - if (ferror(stdout)) - die ("write error"); + close_stdout (); exit (EXIT_SUCCESS); } -- 2.8.0-rc2