From d7ca694819f0f6be59807d441c35a047b1852c6d Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Wed, 7 Jan 2015 18:30:28 -0500 Subject: [PATCH] split: new -t option to select record separator * src/split.c (eolchar): A new variable to hold the separator character (unibyte for now). This is reference throughout rather than hardcoding '\n'. (usage): Describe the new --separator option, and mention records along with lines so there is no ambiguity that all options treat lines and records equivalently. (main): Have -t update eolchar, or default to '\n'. * tests/split/lines-sep.sh: New test case. * tests/local.mk: Reference the new test. * doc/coreutils.texi (split invocation): Document the new option. * NEWS: Mention the new feature. --- NEWS | 3 ++ doc/coreutils.texi | 11 +++++++ src/split.c | 68 +++++++++++++++++++++++++++++++---------- tests/local.mk | 5 ++-- tests/split/lines-sep.sh | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 148 insertions(+), 17 deletions(-) create mode 100755 tests/split/lines-sep.sh diff --git a/NEWS b/NEWS index f59bfc1..e0a2893 100644 --- a/NEWS +++ b/NEWS @@ -45,6 +45,9 @@ GNU coreutils NEWS -*- outline -*- dd accepts a new status=progress level to print data transfer statistics on stderr approximately every second. + split accepts a new --separator option to select a record separator character + other than the default newline character. + ** Changes in behavior df no longer suppresses separate exports of the same remote device, as diff --git a/doc/coreutils.texi b/doc/coreutils.texi index f6aef2d..86007a9 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3505,6 +3505,17 @@ than the number requested, or if a line is so long as to completely span a chunk. The output file sequence numbers, always run consecutively even when this option is specified. +@item -t @var{separator} +@itemx --separator=@var{separator} +@opindex -t +@opindex --separator +@cindex line separator character +@cindex record separator character +Use character @var{separator} as the record separator instead of the default +newline character (ASCII LF). +To specify ASCII NUL as the separator, use the two-character string @samp{\0}, +e.g., @samp{split -t '\0'}. + @item -u @itemx --unbuffered @opindex -u diff --git a/src/split.c b/src/split.c index ef672f4..d17616c 100644 --- a/src/split.c +++ b/src/split.c @@ -16,10 +16,9 @@ /* By address@hidden, with rms. - To do: - * Implement -t CHAR or -t REGEX to specify break characters other - than newline. */ - + TODO: + * support -p REGEX as in BSD's split. + * support --suppress-matched as in csplit. */ #include #include @@ -108,6 +107,9 @@ static bool elide_empty_files; input to output, which is much slower, so disabled by default. */ static bool unbuffered; +/* The character marking end of line. Defaults to \n below. */ +static int eolchar = -1; + /* The split mode to use. */ enum Split_type { @@ -139,6 +141,7 @@ static struct option const longopts[] = {"numeric-suffixes", optional_argument, NULL, 'd'}, {"filter", required_argument, NULL, FILTER_OPTION}, {"verbose", no_argument, NULL, VERBOSE_OPTION}, + {"separator", required_argument, NULL, 't'}, {"-io-blksize", required_argument, NULL, IO_BLKSIZE_OPTION}, /* do not document */ {GETOPT_HELP_OPTION_DECL}, @@ -216,13 +219,15 @@ is -, read standard input.\n\ -a, --suffix-length=N generate suffixes of length N (default %d)\n\ --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\ -b, --bytes=SIZE put SIZE bytes per output file\n\ - -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ + -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\ -d, --numeric-suffixes[=FROM] use numeric suffixes instead of alphabetic;\n\ FROM changes the start value (default 0)\n\ -e, --elide-empty-files do not generate empty output files with '-n'\n\ --filter=COMMAND write to shell COMMAND; file name is $FILE\n\ - -l, --lines=NUMBER put NUMBER lines per output file\n\ + -l, --lines=NUMBER put NUMBER lines/records per output file\n\ -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\ + -t, --separator=SEP use SEP instead of newline as the record separator;\n\ + '\\0' (zero) specifies the NUL character\n\ -u, --unbuffered immediately copy input to output with '-n r/...'\n\ "), DEFAULT_SUFFIX_LENGTH); fputs (_("\ @@ -236,8 +241,8 @@ is -, read standard input.\n\ CHUNKS may be:\n\ N split into N files based on size of input\n\ K/N output Kth of N to stdout\n\ - l/N split into N files without splitting lines\n\ - l/K/N output Kth of N to stdout without splitting lines\n\ + l/N split into N files without splitting lines/records\n\ + l/K/N output Kth of N to stdout without splitting lines/records\n\ r/N like 'l' but use round robin distribution\n\ r/K/N likewise but only output Kth of N to stdout\n\ "), stdout); @@ -630,10 +635,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) error (EXIT_FAILURE, errno, "%s", infile); bp = bp_out = buf; eob = bp + n_read; - *eob = '\n'; + *eob = eolchar; while (true) { - bp = memchr (bp, '\n', eob - bp + 1); + bp = memchr (bp, eolchar, eob - bp + 1); if (bp == eob) { if (eob != bp_out) /* do not write 0 bytes! */ @@ -692,10 +697,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) /* Have enough for split. */ split_rest = n_bytes - n_out - n_hold; eoc = sob + split_rest - 1; - eol = memrchr (sob, '\n', split_rest); + eol = memrchr (sob, eolchar, split_rest); } else - eol = memrchr (sob, '\n', n_left); + eol = memrchr (sob, eolchar, n_left); /* Output hold space if possible. */ if (n_hold && !(!eol && n_out)) @@ -833,7 +838,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize, /* Begin looking for '\n' at last byte of chunk. */ off_t skip = MIN (n_read, MAX (0, chunk_end - n_written)); - char *bp_out = memchr (bp + skip, '\n', n_read - skip); + char *bp_out = memchr (bp + skip, eolchar, n_read - skip); if (bp_out++) next = true; else @@ -1080,7 +1085,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize) bool next = false; /* Find end of line. */ - char *bp_out = memchr (bp, '\n', eob - bp); + char *bp_out = memchr (bp, eolchar, eob - bp); if (bp_out) { bp_out++; @@ -1224,7 +1229,7 @@ main (int argc, char **argv) int this_optind = optind ? optind : 1; char *slash; - c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u", + c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u", longopts, NULL); if (c == -1) break; @@ -1303,6 +1308,36 @@ main (int argc, char **argv) unbuffered = true; break; + case 't': + { + char neweol = optarg[0]; + if (! neweol) + error (EXIT_FAILURE, 0, _("empty record separator")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + neweol = '\0'; + else + { + /* Provoke with 'split -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + error (EXIT_FAILURE, 0, _("multi-character separator %s"), + quote (optarg)); + } + } + /* Make it explicit we don't support multiple separators. */ + if (0 <= eolchar && neweol != eolchar) + { + error (EXIT_FAILURE, 0, + _("multiple separator characters specified")); + } + + eolchar = neweol; + } + break; + case '0': case '1': case '2': @@ -1398,6 +1433,9 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } + if (eolchar < 0) + eolchar = '\n'; + set_suffix_length (n_units, split_type); /* Get out the filename arguments. */ diff --git a/tests/local.mk b/tests/local.mk index 6fc8599..54e9e73 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -355,6 +355,7 @@ all_tests = \ tests/split/b-chunk.sh \ tests/split/fail.sh \ tests/split/lines.sh \ + tests/split/lines-sep.sh \ tests/split/line-bytes.sh \ tests/split/l-chunk.sh \ tests/split/r-chunk.sh \ @@ -402,7 +403,7 @@ all_tests = \ tests/misc/xattr.sh \ tests/tail-2/wait.sh \ tests/tail-2/retry.sh \ - tests/tail-2/symlink.sh \ + tests/tail-2/symlink.sh \ tests/tail-2/tail-c.sh \ tests/chmod/c-option.sh \ tests/chmod/equal-x.sh \ @@ -483,7 +484,7 @@ all_tests = \ tests/dd/ascii.sh \ tests/dd/direct.sh \ tests/dd/misc.sh \ - tests/dd/no-allocate.sh \ + tests/dd/no-allocate.sh \ tests/dd/nocache.sh \ tests/dd/not-rewound.sh \ tests/dd/reblock.sh \ diff --git a/tests/split/lines-sep.sh b/tests/split/lines-sep.sh new file mode 100755 index 0000000..234e519 --- /dev/null +++ b/tests/split/lines-sep.sh @@ -0,0 +1,78 @@ +#!/bin/sh +# test split with custom record separators + +# Copyright (C) 2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ split + +NL=' +' + +for sep in "$NL" '\0' ':'; do + + test "$sep" = "$NL" && tr='\n' || tr="$sep" + + for mode in '--lines=2' '--line-bytes=4' '--number=l/3' '--number=r/3'; do + + # Generate in default mode for comparison + printf '1\n2\n3\n4\n5\n' > in || framework_failure_ + split $mode in || return 1 + tr '\n' "$tr" < xaa > exp1 + tr '\n' "$tr" < xab > exp2 + tr '\n' "$tr" < xac > exp3 + + rm -f x?? + + # Generate output with specified --separator + printf '1\n2\n3\n4\n5\n' | tr '\n' "$tr" > in || framework_failure_ + split $mode -t "$sep" in || return 1 + + compare exp1 xaa || return 1 + compare exp2 xab || return 1 + compare exp3 xac || return 1 + test -f xad && return 1 + done + +done + + +# +# Test usage edge cases +# + +# Should fail: '-t' requires an argument +{ split -t /dev/null 2>/dev/null || test $? -ne 1; } && + { warn_ "-t without argument did not trigger an error" ; fail=1 ; } + +# should fail: multi-character separator +{ split -txx /dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-txx did not trigger an error" ; fail=1 ; } + +# should fail: different separators used +{ split -ta -tb /dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-ta -tb did not trigger an error" ; fail=1 ; } + +# should fail: different separators used, including default +{ split -t"$NL" -tb /dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-t\$NL -tb did not trigger an error" ; fail=1 ; } + +# should not fail: same separator used multiple times +split -t: -t: /dev/null 2>&1 || + { warn_ "-t: -t: triggered an error" ; fail=1 ; } + + +Exit $fail -- 2.1.0