>From 49f43214ebfa41fa1f67e7001d8467288ff34837 Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Wed, 6 Mar 2013 15:53:16 -0500 Subject: [PATCH] csplit: new option, --suppress-matched FIXME: Currently works only with REGEXP patterns. With --suppress-matched, the lines that match the pattern will not be printed in the output files. * src/csplit.c: implement --suppress-matched. process_regexp(), process_line_count(): skip the matched lines without printing. Since csplit always does "up to but not including" matched lines, the first line (in the next group) is the matched line - just skip it. main(): handle new option. usage(): mention new option. * NEWS: mention new option. * doc/coreutils.texi: mention new option, add examples. * tests/misc/csplit-suppress-matched.sh: test new option. * tests/local.mk: add new test script. --- NEWS | 3 + doc/coreutils.texi | 25 ++++ src/csplit.c | 26 ++++- tests/local.mk | 1 + tests/misc/csplit-suppress-matched.sh | 233 +++++++++++++++++++++++++++++++++ 5 files changed, 287 insertions(+), 1 deletions(-) create mode 100755 tests/misc/csplit-suppress-matched.sh diff --git a/NEWS b/NEWS index 5b28c92..2385be7 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,9 @@ GNU coreutils NEWS -*- outline -*- uniq accepts a new option: --group to print all items, while separating unique groups with empty lines. + csplit accepts a new option: --suppress-matched (-m). Lines matching + the specified patterns will not be printed. + * Noteworthy changes in release 8.21 (2013-02-14) [stable] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index fe4c3ad..4f7da4c 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3608,6 +3608,12 @@ long instead of the default 2. @opindex --keep-files Do not remove output files when errors are encountered. +@item -m +@itemx --suppress-matched +@opindex -m +@opindex --suppress-matched +Do not output lines matching the specified @var{pattern}. 
+ @item -z @itemx --elide-empty-files @opindex -z @@ -3684,6 +3690,25 @@ $ head xx* 14 @end example +Example of splitting input by empty lines: + +@example +$ csplit --suppress-matched @var{input.txt} '/^$/' '@{*@}' +@end example + +@c +@c TODO: "uniq" already supports "--group". +@c When it gets the "--key" option, uncomment this example. +@c +@c Example of splitting input file, based on the value of column 2: +@c +@c @example +@c $ cat @var{input.txt} | +@c sort -k2,2 | +@c uniq --group -k2,2 | +@c csplit -m '/^$/' '@{*@}' +@c @end example + @node Summarizing files @chapter Summarizing files diff --git a/src/csplit.c b/src/csplit.c index 22f3ad4..664b567 100644 --- a/src/csplit.c +++ b/src/csplit.c @@ -166,6 +166,9 @@ static bool volatile remove_files; /* If true, remove all output files which have a zero length. */ static bool elide_empty_files; +/* If true, suppress the lines that match the PATTERN. */ +static bool suppress_matched; + /* The compiled pattern arguments, which determine how to split the input file. 
*/ static struct control *controls; @@ -185,6 +188,7 @@ static struct option const longopts[] = {"elide-empty-files", no_argument, NULL, 'z'}, {"prefix", required_argument, NULL, 'f'}, {"suffix-format", required_argument, NULL, 'b'}, + {"suppress-matched", no_argument, NULL, 'm'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -721,6 +725,15 @@ process_line_count (const struct control *p, uintmax_t repetition) create_output_file (); +#if 0 + /* FIXME: this doesn't work when the last line is the matched line + * e.g.: + * $ seq 1 6 | ./src/csplit -m - 2 4 6 + */ + if (suppress_matched) + line = remove_line (); +#endif + linenum = get_first_line_in_buffer (); while (linenum++ < last_line_to_save) @@ -778,6 +791,9 @@ process_regexp (struct control *p, uintmax_t repetition) if (!ignore) create_output_file (); + if (suppress_matched && current_line > 0) + line = remove_line (); + /* If there is no offset for the regular expression, or it is positive, then it is not necessary to buffer the lines. 
*/ @@ -1324,9 +1340,10 @@ main (int argc, char **argv) control_used = 0; suppress_count = false; remove_files = true; + suppress_matched = false; prefix = DEFAULT_PREFIX; - while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "f:b:kmn:sqz", longopts, NULL)) != -1) switch (optc) { case 'f': @@ -1341,6 +1358,10 @@ main (int argc, char **argv) remove_files = false; break; + case 'm': + suppress_matched = true; + break; + case 'n': if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK || MIN (INT_MAX, SIZE_MAX) < val) @@ -1465,6 +1486,9 @@ and output byte counts of each piece to standard output.\n\ -k, --keep-files do not remove output files on errors\n\ "), stdout); fputs (_("\ + -m, --suppress-matched suppress the lines matching PATTERN\n\ +"), stdout); + fputs (_("\ -n, --digits=DIGITS use specified number of digits instead of 2\n\ -s, --quiet, --silent do not print counts of output file sizes\n\ -z, --elide-empty-files remove empty output files\n\ diff --git a/tests/local.mk b/tests/local.mk index 607ddc4..fc53c75 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -260,6 +260,7 @@ all_tests = \ tests/misc/csplit.sh \ tests/misc/csplit-1000.sh \ tests/misc/csplit-heap.sh \ + tests/misc/csplit-suppress-matched.sh \ tests/misc/date-sec.sh \ tests/misc/dircolors.pl \ tests/misc/dirname.pl \ diff --git a/tests/misc/csplit-suppress-matched.sh b/tests/misc/csplit-suppress-matched.sh new file mode 100755 index 0000000..070284a --- /dev/null +++ b/tests/misc/csplit-suppress-matched.sh @@ -0,0 +1,233 @@ +#!/bin/sh +# Test csplit's --suppress-matched option + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ csplit + +printf "%s\n" a a YY '' XX b b YY '' \ + XX c YY '' XX d d d > in1 || framework_failure_ + +# Expected output of test 1: +# the newline (matched line) appears in the output files (exp1/2/3) +printf "a\na\nYY\n" > test1_exp0 || framework_failure_ +printf "\nXX\nb\nb\nYY\n" > test1_exp1 || framework_failure_ +printf "\nXX\nc\nYY\n" > test1_exp2 || framework_failure_ +printf "\nXX\nd\nd\nd\n" > test1_exp3 || framework_failure_ + +# Expected output of test 2: +# the newline (matched line) does not appear in the output files +printf "a\na\nYY\n" > test2_exp0 || framework_failure_ +printf "XX\nb\nb\nYY\n" > test2_exp1 || framework_failure_ +printf "XX\nc\nYY\n" > test2_exp2 || framework_failure_ +printf "XX\nd\nd\nd\n" > test2_exp3 || framework_failure_ + +# Expected output of test 3: +# the XX (matched line + offset 1) does not appear in the output files. 
+# the newline appears in the files (before each split, at the end of the file) +printf "a\na\nYY\n\n" > test3_exp0 || framework_failure_ +printf "b\nb\nYY\n\n" > test3_exp1 || framework_failure_ +printf "c\nYY\n\n" > test3_exp2 || framework_failure_ +printf "d\nd\nd\n" > test3_exp3 || framework_failure_ + +# Expected output of test 4: +# the YY (matched line + offset of -1) does not appear in the output files +# the newline appears in the files (as the first line of the new split) +printf "a\na\n" > test4_exp0 || framework_failure_ +printf "\nXX\nb\nb\n" > test4_exp1 || framework_failure_ +printf "\nXX\nc\n" > test4_exp2 || framework_failure_ +printf "\nXX\nd\nd\nd\n" > test4_exp3 || framework_failure_ + +seq 1 6 > in2 || framework_failure_ + +# Expected output of test 5 +# The matched lines (2/4/6) appear as the first line of new files. +printf "1\n" > test5_exp0 || framework_failure_ +printf "2\n3\n" > test5_exp1 || framework_failure_ +printf "4\n5\n" > test5_exp2 || framework_failure_ +printf "6\n" > test5_exp3 || framework_failure_ + +# Expected output of test 6 +# The matched lines (2/4/6) are not present +printf "1\n" > test6_exp0 || framework_failure_ +printf "3\n" > test6_exp1 || framework_failure_ +printf "5\n" > test6_exp2 || framework_failure_ + + +# Test two consecutive matched lines +printf "%s\n" a '' '' b > in3 || framework_failure_ + +# Expected output of test 7: +# suppress-matched will cause the second group to be an empty file. 
+# (without --suppress-matched it should contain a single newline) +printf "a\n" > test7_exp0 || framework_failure_ +printf "" > test7_exp1 || framework_failure_ +printf "b\n" > test7_exp2 || framework_failure_ + +# Expected output of test 8: +# suppress-matched + elide-empty-files +# should create just two files +printf "a\n" > test8_exp0 || framework_failure_ +printf "b\n" > test8_exp1 || framework_failure_ + + +# A matched-line as the last line +printf "%s\n" a '' b '' > in4 || framework_failure_ + +# Expected output of test 9: +# suppress-matched should create just three files +# (as the last line which matched should be suppressed, but still start a +# new file) +printf "a\n" > test9_exp0 || framework_failure_ +printf "b\n" > test9_exp1 || framework_failure_ +printf "" > test9_exp2 || framework_failure_ + +# Expected output of test 10: +# suppress-matched + elide-empty-files should create just two files +printf "a\n" > test10_exp0 || framework_failure_ +printf "b\n" > test10_exp1 || framework_failure_ + + + +## +## Test 1: +## regexp baseline without --suppress-matched +## +csplit --prefix=t1_ in1 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 2 3 ; +do + compare test1_exp$i t1_0$i || { fail=1 ; echo "test1_exp$i failed" 1>&2 ; } +done + +## +## Test 2: +## suppress-matched + regexp +## +csplit --prefix=t2_ --suppress-matched \ + in1 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 2 3 ; +do + compare test2_exp$i t2_0$i || { fail=1 ; echo "test2_exp$i failed" 1>&2 ; } +done + +## +## Test 3: +## suppress-matched + regexp + offset=1 +## +csplit --prefix=t3_ --suppress-matched \ + in1 '/^$/1' '{*}' > /dev/null || fail=1 +for i in 0 1 2 3 ; +do + compare test3_exp$i t3_0$i || { fail=1 ; echo "test3_exp$i failed" 1>&2 ; } +done + +## +## Test 4: +## suppress-matched + regexp + offset=-1 +## +csplit --prefix=t4_ --suppress-matched \ + in1 '/^$/-1' '{*}' > /dev/null || fail=1 +for i in 0 1 2 3 ; +do + compare test4_exp$i t4_0$i || { fail=1 ; echo "test4_exp$i 
failed" 1>&2 ; } +done + +## +## Test 5: +## INTEGER baseline without --suppress-matched +## +csplit --prefix=t5_ in2 2 4 6 > /dev/null || fail=1 +for i in 0 1 2 3 ; +do + compare test5_exp$i t5_0$i || { fail=1 ; echo "test5_exp$i failed" 1>&2 ; } +done + +## +## TODO: FIX BUG when last line is the matched line +## +if false ; then + ## + ## Test 6: + ## INTEGER with --suppress-matched + ## + csplit --suppress-matched --prefix=t6_ in2 2 4 6 > /dev/null || fail=1 + for i in 0 1 2 ; + do + compare test6_exp$i t6_0$i || { fail=1 ; echo "test6_exp$i failed" 1>&2 ; } + done + #Extra check: the last file (containing only "6") should not be created at all + test -e t6_03 && { fail=1 ; echo "test6_exp3 - failed" 1>&2 ; } +fi + + +## +## Test 7: +## suppress-matched + two consecutive groups +## +csplit --prefix=t7_ --suppress-matched \ + in3 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 2 ; +do + compare test7_exp$i t7_0$i || { fail=1 ; echo "test7_exp$i failed" 1>&2 ; } +done + +## +## Test 8: +## suppress-matched + two consecutive groups + elide-empty-files +## +csplit --prefix=t8_ --elide-empty-files --suppress-matched \ + in3 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 ; +do + compare test8_exp$i t8_0$i || { fail=1 ; echo "test8_exp$i failed" 1>&2 ; } +done +#Extra check: there should not be a third file +test -e t8_02 && { fail=1 ; echo "test8_exp2 - failed" 1>&2 ; } + + +## +## Test 9: +## suppress-matched + matched-line as last line +## +csplit --prefix=t9_ --suppress-matched \ + in4 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 2 ; +do + compare test9_exp$i t9_0$i || { fail=1 ; echo "test9_exp$i failed" 1>&2 ; } +done + +## +## Test 10: +## suppress-matched + matched last line + elide-empty-files +## +csplit --prefix=t10_ --elide-empty-files --suppress-matched \ + in4 '/^$/' '{*}' > /dev/null || fail=1 +for i in 0 1 ; +do + compare test10_exp$i t10_0$i || { fail=1 ; echo "test10_exp$i failed" 1>&2 ; } +done +#Extra check: there should not be a third file 
+test -e t10_02 && { fail=1 ; echo "test10_exp2 - failed" 1>&2 ; } + + + + + + + + +Exit $fail -- 1.7.7.4