diff --git a/doc/sed.texi b/doc/sed.texi index e00eb36..2cfae22 100644 --- a/doc/sed.texi +++ b/doc/sed.texi @@ -3391,8 +3391,8 @@ $ printf 'a\u03A3b' | od -tx1 -An @codequotebacktick off @noindent -To force @command{sed} to process octets separately, use @code{C} locale -(also known as @code{POSIX} locale): +To force @command{sed} to process octets separately, use the @code{C} locale +(also known as the @code{POSIX} locale): @codequoteundirected on @codequotebacktick on @@ -3405,7 +3405,7 @@ XXXX @subsection Invalid multibyte characters -@command{sed}'s regular expressions @emph{will not} match +@command{sed}'s regular expressions @emph{do not} match invalid multibyte sequences in a multibyte locale. @noindent @@ -3431,7 +3431,7 @@ $ printf 'a\xCEc\n' | sed 's/./X/g' | od -tx1c -An @codequoteundirected off @codequotebacktick off -@noindent Similarly, the 'catch-all' regular expression @samp{.*} will not +@noindent Similarly, the 'catch-all' regular expression @samp{.*} does not match the entire line: @codequoteundirected on @@ -3447,9 +3447,9 @@ $ printf 'a\xCEc\n' | sed 's/.*//' | od -tx1c -An @codequotebacktick off @noindent -@command{sed} offers the special @command{z} which can clear the +@command{sed} offers the special @command{z} command to clear the current pattern space regardless of invalid multibyte characters -(i.e. it works like @code{s/.*//} but will also remove invalid multibyte +(i.e. it works like @code{s/.*//} but also removes invalid multibyte characters): @codequoteundirected on @@ -3485,16 +3485,16 @@ $ printf 'a\xCEc\n' | LC_ALL=C sed 's/.*//' | od -tx1c -An can be used to detect such invalid sequences in a file. In the following examples, the @code{\xCE\xCE} is an invalid multibyte sequence, while @code{\xCE\A3} is a valid multibyte sequence -(of the Greeg Sigma character). +(of the Greek Sigma character).
@noindent -The following @command{sed} program replaces removes all valid +The following @command{sed} program removes all valid characters using @code{s/.//g}. Any content left in the pattern space (the invalid characters) are added to the hold space using the @code{H} command. On the last line (@code{$}), the hold space is retrieved (@code{x}), newlines are removed (@code{s/\n//g}), and any remaining octets are printed unambiguously (@code{l}). Thus, any invalid -multibyte sequences will be printed as octal values: +multibyte sequences are printed as octal values: @codequoteundirected on @codequotebacktick on @@ -3515,9 +3515,9 @@ $ sed -n 's/.//g ; H ; $@{x;s/\n//g;l@}' invalid.txt @codequoteundirected off @codequotebacktick off -@noindent With few more commands, @command{sed} can print -the exact line number which contains the invalid characters (line 3). -These characters can then be removed by forcing @code{C} locale +@noindent With a few more commands, @command{sed} can print +the exact line number corresponding to each invalid character (line 3). +These characters can then be removed by forcing the @code{C} locale and using octal escape sequences: @codequoteundirected on