>From 525eb72b150ed34d3bfcfe453d1494fe28a824b7 Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Thu, 14 Feb 2013 15:29:08 -0500 Subject: [PATCH] join: Add -z option * NEWS: Mention join's new option: --zero-terminated (-z). * src/join.c: Add new option, --zero-terminated (-z), to make join use the NUL byte as separator/delimiter rather than newline. (get_line): Use readlinebuffer_delim in place of readlinebuffer. (main): Handle the new option. (usage): Describe new option the same way sort does. * doc/coreutils.texi (join invocation): Describe the new option. * tests/misc/join.pl: Add tests for -z option. --- NEWS | 6 ++++++ doc/coreutils.texi | 17 +++++++++++++++++ src/join.c | 19 +++++++++++++++---- tests/misc/join.pl | 20 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 37bcdf7..618c1da 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,12 @@ GNU coreutils NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** New features + + join accepts a new option: --zero-terminated (-z). As with the sort and uniq + options of the same name, this makes join consume and produce NUL-terminated + lines rather than newline-terminated lines. + * Noteworthy changes in release 8.21 (2013-02-14) [stable] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 2c16dc4..a72d9ce 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -6059,6 +6059,10 @@ available; the sort order can be any order that considers two fields to be equal if and only if the sort comparison described above considers them to be equal. For example: +Input and output lines are terminated with a newline character unless the +@option{--zero-terminated} (@option{-z}) option is used, in which case lines are +@sc{nul} terminated. + @example $ cat file1 a a1 @@ -6181,6 +6185,19 @@ character is used to delimit the fields. Print a line for each unpairable line in file @var{file-number} (either @samp{1} or @samp{2}), instead of the normal output. 
+@item -z +@itemx --zero-terminated +@opindex -z +@opindex --zero-terminated +@cindex join zero-terminated lines +Treat the input as a set of lines, each terminated by a null character +(ASCII @sc{nul}) instead of a line feed +(ASCII @sc{lf}). +This option can be useful in conjunction with @samp{sort -z}, @samp{uniq -z}, +@samp{perl -0} or @samp{find -print0} and @samp{xargs -0} which do the same in +order to reliably handle arbitrary file names (even those containing blanks +or other special characters). + @end table @exitstatus diff --git a/src/join.c b/src/join.c index 11e647c..1810ac2 100644 --- a/src/join.c +++ b/src/join.c @@ -161,6 +161,7 @@ static struct option const longopts[] = {"ignore-case", no_argument, NULL, 'i'}, {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, + {"zero-terminated", no_argument, NULL, 'z'}, {"header", no_argument, NULL, HEADER_LINE_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, @@ -177,6 +178,9 @@ static bool ignore_case; join them without checking for ordering */ static bool join_header_lines; +/* The character marking end of line. Default to \n. */ +static char eolchar = '\n'; + void usage (int status) { @@ -213,6 +217,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --header treat the first line in each file as field headers,\n\ print them without trying to pair them\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated end lines with 0 byte, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -445,7 +452,7 @@ get_line (FILE *fp, struct line **linep, int which) else line = init_linep (linep); - if (! readlinebuffer (&line->buf, fp)) + if (! 
readlinebuffer_delim (&line->buf, fp, eolchar)) { if (ferror (fp)) error (EXIT_FAILURE, errno, _("read error")); @@ -614,7 +621,7 @@ prjoin (struct line const *line1, struct line const *line2) break; putchar (output_separator); } - putchar ('\n'); + putchar (eolchar); } else { @@ -636,7 +643,7 @@ prjoin (struct line const *line1, struct line const *line2) prfields (line1, join_field_1, autocount_1); prfields (line2, join_field_2, autocount_2); - putchar ('\n'); + putchar (eolchar); } } @@ -1017,7 +1024,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z", longopts, NULL)) != -1) { @@ -1107,6 +1114,10 @@ main (int argc, char **argv) } break; + case 'z': + eolchar = 0; + break; + case NOCHECK_ORDER_OPTION: check_input_order = CHECK_ORDER_DISABLED; break; diff --git a/tests/misc/join.pl b/tests/misc/join.pl index 9b93794..c467054 100755 --- a/tests/misc/join.pl +++ b/tests/misc/join.pl @@ -275,6 +275,26 @@ my @tv = ( [ "ID1 Name\n1 A\n", ""], "ID1 Name\n1 A\n", 0], +# Zero-terminated lines +['z1', '-z', + ["a\0c\0e\0", "a\0b\0c\0"], "a\0c\0", 0], + +# not zero-terminated, but related to the code change: +# the old readlinebuffer() auto-added '\n' to the last line. +# the new readlinebuffer_delim() does not. +# Ensure it doesn't matter. +['z2', '', + ["a\nc\ne\n", "a\nb\nc"], "a\nc\n", 0], +['z3', '', + ["a\nc\ne", "a\nb\nc"], "a\nc\n", 0], +# missing last NUL at the end of the last line (=end of file) +['z4', '-z', + ["a\0c\0e", "a\0b\0c"], "a\0c\0", 0], +# edge-case: the embedded newlines should be treated as +# part of the nul-terminated line +['z5', '-z -a1 -a2', + ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"], + ); # Convert the above old-style test vectors to the newer -- 1.7.7.4