>From eec5cf679824ed67c8b751ecb90565a22fc51719 Mon Sep 17 00:00:00 2001 From: Assaf Gordon
Date: Wed, 6 Mar 2013 15:53:16 -0500 Subject: [PATCH] csplit: new option --suppress-matched With --suppress-matched, the lines that match the pattern will not be printed in the output files. * src/csplit.c: implement --suppress-matched. process_regexp(),process_line_count(): skip the matched lines without printing. Since csplit always does "up to but not including" matched lines, the first line (in the next group) is the matched line - just skip it. main(): handle new option. usage(): mention new option. * NEWS: mention new option. * doc/coreutils.texi: mention new option, add examples. * tests/misc/csplit-suppress-matched.pl: test new option. * tests/local.mk: add new test script. --- NEWS | 3 + doc/coreutils.texi | 25 ++++ src/csplit.c | 29 ++++- tests/local.mk | 1 + tests/misc/csplit-suppress-matched.pl | 213 +++++++++++++++++++++++++++++++++ 5 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 tests/misc/csplit-suppress-matched.pl diff --git a/NEWS b/NEWS index 0c2daad..896512d 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,9 @@ GNU coreutils NEWS -*- outline -*- uniq accepts a new option: --group to print all items, while separating unique groups with empty lines. + csplit accepts a new option: --suppress-matched (-m). Lines matching + the specified patterns will not be printed. + ** Improvements stat and tail work better with EFIVARFS, EXOFS, F2FS and UBIFS. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index dfa9b1c..7dfe724 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3607,6 +3607,12 @@ long instead of the default 2. @opindex --keep-files Do not remove output files when errors are encountered. +@item -m +@itemx --suppress-matched +@opindex -m +@opindex --suppress-matched +Do not output lines matching the specified @var{pattern}. 
+ @item -z @itemx --elide-empty-files @opindex -z @@ -3683,6 +3689,25 @@ $ head xx* 14 @end example +Example of splitting input by empty lines: + +@example +$ csplit --suppress-matched @var{input.txt} '/^$/' '@{*@}' +@end example + +@c +@c TODO: "uniq" already supports "--group". +@c when it gets the "--key" option, uncomment this example. +@c +@c Example of splitting input file, based on the value of column 2: +@c +@c @example +@c $ cat @var{input.txt} | +@c sort -k2,2 | +@c uniq --group -k2,2 | +@c csplit -m '/^$/' '@{*@}' +@c @end example + @node Summarizing files @chapter Summarizing files diff --git a/src/csplit.c b/src/csplit.c index 22f3ad4..4ae2de2 100644 --- a/src/csplit.c +++ b/src/csplit.c @@ -166,6 +166,9 @@ static bool volatile remove_files; /* If true, remove all output files which have a zero length. */ static bool elide_empty_files; +/* If true, suppress the lines that match the PATTERN */ +static bool suppress_matched; + /* The compiled pattern arguments, which determine how to split the input file. */ static struct control *controls; @@ -185,6 +188,7 @@ static struct option const longopts[] = {"elide-empty-files", no_argument, NULL, 'z'}, {"prefix", required_argument, NULL, 'f'}, {"suffix-format", required_argument, NULL, 'b'}, + {"suppress-matched", no_argument, NULL, 'm'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -721,8 +725,13 @@ process_line_count (const struct control *p, uintmax_t repetition) create_output_file (); - linenum = get_first_line_in_buffer (); + /* Ensure that the line number specified is not 1 greater than + the number of lines in the file. + When suppressing matched lines, check before the loop. 
*/ + if (no_more_lines () && suppress_matched) + handle_line_error (p, repetition); + linenum = get_first_line_in_buffer (); while (linenum++ < last_line_to_save) { line = remove_line (); @@ -733,9 +742,12 @@ process_line_count (const struct control *p, uintmax_t repetition) close_output_file (); + if (suppress_matched) + line = remove_line (); + /* Ensure that the line number specified is not 1 greater than the number of lines in the file. */ - if (no_more_lines ()) + if (no_more_lines () && !suppress_matched) handle_line_error (p, repetition); } @@ -778,6 +790,9 @@ process_regexp (struct control *p, uintmax_t repetition) if (!ignore) create_output_file (); + if (suppress_matched && current_line > 0) + line = remove_line (); + /* If there is no offset for the regular expression, or it is positive, then it is not necessary to buffer the lines. */ @@ -1324,9 +1339,10 @@ main (int argc, char **argv) control_used = 0; suppress_count = false; remove_files = true; + suppress_matched = false; prefix = DEFAULT_PREFIX; - while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "f:b:kmn:sqz", longopts, NULL)) != -1) switch (optc) { case 'f': @@ -1341,6 +1357,10 @@ main (int argc, char **argv) remove_files = false; break; + case 'm': + suppress_matched = true; + break; + case 'n': if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK || MIN (INT_MAX, SIZE_MAX) < val) @@ -1465,6 +1485,9 @@ and output byte counts of each piece to standard output.\n\ -k, --keep-files do not remove output files on errors\n\ "), stdout); fputs (_("\ + -m, --suppress-matched suppress the lines matching PATTERN\n\ +"), stdout); + fputs (_("\ -n, --digits=DIGITS use specified number of digits instead of 2\n\ -s, --quiet, --silent do not print counts of output file sizes\n\ -z, --elide-empty-files remove empty output files\n\ diff --git a/tests/local.mk b/tests/local.mk index dc87ef4..e3a72ab 100644 --- a/tests/local.mk +++ 
b/tests/local.mk @@ -260,6 +260,7 @@ all_tests = \ tests/misc/csplit.sh \ tests/misc/csplit-1000.sh \ tests/misc/csplit-heap.sh \ + tests/misc/csplit-suppress-matched.pl \ tests/misc/date-sec.sh \ tests/misc/dircolors.pl \ tests/misc/dirname.pl \ diff --git a/tests/misc/csplit-suppress-matched.pl b/tests/misc/csplit-suppress-matched.pl new file mode 100644 index 0000000..512bdaa --- /dev/null +++ b/tests/misc/csplit-suppress-matched.pl @@ -0,0 +1,213 @@ +#!/usr/bin/perl + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see