Index: doc/stamp-vti =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/stamp-vti,v retrieving revision 1.4 diff -u -r1.4 stamp-vti --- doc/stamp-vti 2001/01/28 04:40:19 1.4 +++ doc/stamp-vti 2001/04/08 11:42:02 @@ -1,3 +1,3 @@ -@set UPDATED 28 January 2001 +@set UPDATED 8 April 2001 @set EDITION 2.0 @set VERSION 2.0 Index: doc/textutils.info =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.info,v retrieving revision 1.5 diff -u -r1.5 textutils.info --- doc/textutils.info 2001/01/28 04:40:19 1.5 +++ doc/textutils.info 2001/04/08 11:42:02 @@ -2621,6 +2621,10 @@ `-j FIELD' Equivalent to `-1 FIELD -2 FIELD'. +`-n' + Use numerical order when joining FILE1 and FILE2. They must be + sorted numerically beforehand. + `-o FIELD-LIST...' Construct each output line according to the format in FIELD-LIST. Each element in FIELD-LIST is either the single character `0' or @@ -3773,6 +3777,7 @@ * -M: sort invocation. * -m <1>: sort invocation. * -m: pr invocation. +* -n <1>: join invocation. * -n: cut invocation. * -N: uniq invocation. * -n <1>: sort invocation. 
@@ -4069,23 +4074,23 @@ Node: cut invocation98383 Node: paste invocation100298 Node: join invocation101152 -Node: Operating on characters104536 -Node: tr invocation104982 -Node: Character sets106099 -Node: Translating109690 -Node: Squeezing111487 -Node: Warnings in tr113394 -Node: expand invocation114527 -Node: unexpand invocation115844 -Node: Opening the software toolbox117279 -Node: Toolbox introduction117967 -Node: I/O redirection120689 -Node: The who command123525 -Node: The cut command124413 -Node: The sort command125288 -Node: The uniq command125992 -Node: Putting the tools together126721 -Ref: Putting the tools together-Footnote-1138549 -Node: Index138711 +Node: Operating on characters104647 +Node: tr invocation105093 +Node: Character sets106210 +Node: Translating109801 +Node: Squeezing111598 +Node: Warnings in tr113505 +Node: expand invocation114638 +Node: unexpand invocation115955 +Node: Opening the software toolbox117390 +Node: Toolbox introduction118078 +Node: I/O redirection120800 +Node: The who command123636 +Node: The cut command124524 +Node: The sort command125399 +Node: The uniq command126103 +Node: Putting the tools together126832 +Ref: Putting the tools together-Footnote-1138660 +Node: Index138822  End Tag Table Index: doc/textutils.texi =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.texi,v retrieving revision 1.5 diff -u -r1.5 textutils.texi --- doc/textutils.texi 2001/01/28 04:40:19 1.5 +++ doc/textutils.texi 2001/04/08 11:41:33 @@ -3297,6 +3297,11 @@ @item -j @var{field} Equivalent to @samp{-1 @var{field} -2 @var{field}}. address@hidden -n address@hidden -n +Use numerical order when joining @var{file1} and @var{file2}. They must be +sorted numerically beforehand. + @item -o @address@hidden Construct each output line according to the format in @var{field-list}. 
Each element in @var{field-list} is either the single character @samp{0} or Index: doc/version.texi =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/version.texi,v retrieving revision 1.4 diff -u -r1.4 version.texi --- doc/version.texi 2001/01/28 04:40:19 1.4 +++ doc/version.texi 2001/04/08 11:42:02 @@ -1,3 +1,3 @@ -@set UPDATED 28 January 2001 +@set UPDATED 8 April 2001 @set EDITION 2.0 @set VERSION 2.0 Index: man/join.1 =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/man/join.1,v retrieving revision 1.1.1.1 diff -u -r1.1.1.1 join.1 --- man/join.1 1999/08/06 19:24:08 1.1.1.1 +++ man/join.1 2001/04/08 11:37:19 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.012. -.TH JOIN "1" "August 1999" "GNU textutils 2.0" FSF +.TH JOIN "1" "April 2001" "GNU textutils 2.0" FSF .SH NAME join \- join lines of two files on a common field .SH SYNOPSIS @@ -23,6 +23,7 @@ \fB\-j\fR FIELD (obsolescent) equivalent to `-1 FIELD \fB\-2\fR FIELD' \fB\-j1\fR FIELD (obsolescent) equivalent to `-1 FIELD' \fB\-j2\fR FIELD (obsolescent) equivalent to `-2 FIELD' +\fB\-n\fR input files are sorted numerically \fB\-o\fR FORMAT obey FORMAT while constructing output line \fB\-t\fR CHAR use CHAR as input and output field separator \fB\-v\fR SIDE like \fB\-a\fR SIDE, but suppress joined output lines Index: po/cat-id-tbl.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/cat-id-tbl.c,v retrieving revision 1.2 diff -u -r1.2 cat-id-tbl.c --- po/cat-id-tbl.c 2000/06/28 11:20:30 1.2 +++ po/cat-id-tbl.c 2001/04/08 11:37:19 @@ -210,6 +210,7 @@ -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\ -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\ -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\ + -n input files are 
sorted numerically\n\ -o FORMAT obey FORMAT while constructing output line\n\ -t CHAR use CHAR as input and output field separator\n\ -v SIDE like -a SIDE, but suppress joined output lines\n\ Index: po/textutils.pot =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/textutils.pot,v retrieving revision 1.2 diff -u -r1.2 textutils.pot --- po/textutils.pot 2000/06/28 11:20:30 1.2 +++ po/textutils.pot 2001/04/08 11:37:19 @@ -6,7 +6,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" -"POT-Creation-Date: 2000-06-28 21:17+1000\n" +"POT-Creation-Date: 2001-04-08 21:37+1000\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" "Language-Team: LANGUAGE <LL@li.org>\n" @@ -15,7 +15,7 @@ "Content-Transfer-Encoding: ENCODING\n" #: src/cat.c:84 src/cksum.c:265 src/comm.c:70 src/csplit.c:1502 src/cut.c:193 -#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:141 +#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:161 #: src/md5sum.c:100 src/nl.c:171 src/od.c:262 src/paste.c:405 src/pr.c:2772 #: src/ptx.c:1854 src/sort.c:251 src/split.c:83 src/sum.c:57 src/tac.c:124 #: src/tail.c:208 src/tr.c:321 src/tsort.c:88 src/unexpand.c:357 @@ -58,7 +58,7 @@ msgstr "" #: src/cat.c:117 src/cksum.c:279 src/comm.c:87 src/csplit.c:1533 src/cut.c:225 -#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:175 +#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:196 #: src/md5sum.c:126 src/nl.c:213 src/od.c:327 src/paste.c:424 src/pr.c:2859 #: src/sort.c:298 src/split.c:106 src/sum.c:75 src/tac.c:142 src/tail.c:261 #: src/tr.c:383 src/unexpand.c:377 src/uniq.c:136 src/wc.c:100 @@ -70,7 +70,7 @@ #: src/cat.c:177 src/cat.c:259 src/cat.c:312 src/cat.c:840 src/comm.c:220 #: src/csplit.c:1493 src/cut.c:799 src/expand.c:392 src/fmt.c:416 #: src/fold.c:225 src/fold.c:306 src/head.c:139 src/head.c:169 src/head.c:387 -#: 
src/join.c:871 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485 +#: src/join.c:1131 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485 #: src/pr.c:1158 src/tac.c:715 src/tail.c:285 src/tail.c:1518 src/tr.c:1670 #: src/tr.c:1916 src/tr.c:2024 src/tr.c:2031 src/tsort.c:485 #: src/unexpand.c:454 @@ -452,12 +452,12 @@ msgid "unrecognized option `-%c'" msgstr "" -#: src/join.c:145 +#: src/join.c:165 #, c-format msgid "Usage: %s [OPTION]... FILE1 FILE2\n" msgstr "" -#: src/join.c:149 +#: src/join.c:169 msgid "" "For each pair of input lines with identical join fields, write a line to\n" "standard output. The default join field is the first, delimited\n" @@ -469,6 +469,7 @@ " -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n" " -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n" " -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n" +" -n input files are sorted numerically\n" " -o FORMAT obey FORMAT while constructing output line\n" " -t CHAR use CHAR as input and output field separator\n" " -v SIDE like -a SIDE, but suppress joined output lines\n" @@ -486,40 +487,40 @@ msgstr "" #. `0' must be all alone -- no `.FIELD'. 
-#: src/join.c:640 +#: src/join.c:879 #, c-format msgid "invalid field specifier: `%s'" msgstr "" -#: src/join.c:654 src/join.c:767 src/join.c:803 +#: src/join.c:893 src/join.c:1023 src/join.c:1063 #, c-format msgid "invalid field number: `%s'" msgstr "" -#: src/join.c:667 +#: src/join.c:906 #, c-format msgid "invalid file number in field spec: `%s'" msgstr "" -#: src/join.c:787 +#: src/join.c:1047 #, c-format msgid "invalid field number for file 1: `%s'" msgstr "" -#: src/join.c:796 +#: src/join.c:1056 #, c-format msgid "invalid field number for file 2: `%s'" msgstr "" -#: src/join.c:828 +#: src/join.c:1088 msgid "too many non-option arguments" msgstr "" -#: src/join.c:850 +#: src/join.c:1110 msgid "too few non-option arguments" msgstr "" -#: src/join.c:861 +#: src/join.c:1121 msgid "both files cannot be standard input" msgstr "" Index: src/join.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/src/join.c,v retrieving revision 1.1.1.1 diff -u -r1.1.1.1 join.c --- src/join.c 1999/07/04 10:38:02 1.1.1.1 +++ src/join.c 2001/04/08 11:35:34 @@ -62,7 +62,7 @@ /* A field of a line. */ struct field { - const unsigned char *beg; /* First character in field. */ + unsigned char *beg; /* First character in field. */ size_t len; /* The length of the field. */ }; @@ -87,9 +87,25 @@ /* The name this program was run with. */ char *program_name; +#define C_DECIMAL_POINT '.' +#define NEGATION_SIGN '-' +#define NUMERIC_ZERO '0' + #ifdef ENABLE_NLS + +static char decimal_point; +static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */ + /* Nonzero if the LC_COLLATE locale is hard. */ static int hard_LC_COLLATE; + +# define IS_THOUSANDS_SEP(x) ((x) == th_sep) + +#else + +# define decimal_point C_DECIMAL_POINT +# define IS_THOUSANDS_SEP(x) 0 + #endif /* If nonzero, print unpairable lines in file 1 or 2. 
*/ @@ -123,6 +139,7 @@ {"j", required_argument, NULL, 'j'}, {"j1", required_argument, NULL, '1'}, {"j2", required_argument, NULL, '2'}, + {"n", no_argument, NULL, 'n'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -134,6 +151,9 @@ /* If nonzero, ignore case when comparing join fields. */ static int ignore_case; +/* If nonzero, do numeric comparison. */ +static int numeric; + void usage (int status) { @@ -157,6 +177,7 @@ -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\ -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\ -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\ + -n input files are sorted numerically\n\ -o FORMAT obey FORMAT while constructing output line\n\ -t CHAR use CHAR as input and output field separator\n\ -v SIDE like -a SIDE, but suppress joined output lines\n\ @@ -178,7 +199,7 @@ } static void -ADD_FIELD (struct line *line, const unsigned char *field, size_t len) +ADD_FIELD (struct line *line, unsigned char *field, size_t len) { if (line->nfields >= line->nfields_allocated) { @@ -314,6 +335,213 @@ free ((char *) seq->lines); } +/* Compare strings A and B containing decimal fractions < 1. Each string + should begin with a decimal point followed immediately by the digits + of the fraction. Strings not of this form are considered to be zero. */ + +/* The goal here, is to take two numbers a and b... compare these + in parallel. Instead of converting each, and then comparing the + outcome. Most likely stopping the comparison before the conversion + is complete. The algorithm used, in the old sort: + + Algorithm: fraccompare + Action : compare two decimal fractions + accepts : char *a, char *b + returns : -1 if a<b, 0 if a=b, 1 if a>b. + implement: + + if *a == decimal_point AND *b == decimal_point + find first character different in a and b. + if both are digits, return the difference *a - *b. 
+ if *a is a digit + skip past zeros + if digit return 1, else 0 + if *b is a digit + skip past zeros + if digit return -1, else 0 + if *a is a decimal_point + skip past decimal_point and zeros + if digit return 1, else 0 + if *b is a decimal_point + skip past decimal_point and zeros + if digit return -1, else 0 + return 0 */ + +static int +fraccompare (register const unsigned char *a, register const unsigned char *b) +{ + if (*a == decimal_point && *b == decimal_point) + { + while (*++a == *++b) + if (! ISDIGIT (*a)) + return 0; + if (ISDIGIT (*a) && ISDIGIT (*b)) + return *a - *b; + if (ISDIGIT (*a)) + goto a_trailing_nonzero; + if (ISDIGIT (*b)) + goto b_trailing_nonzero; + return 0; + } + else if (*a++ == decimal_point) + { + a_trailing_nonzero: + while (*a == NUMERIC_ZERO) + a++; + return ISDIGIT (*a); + } + else if (*b++ == decimal_point) + { + b_trailing_nonzero: + while (*b == NUMERIC_ZERO) + b++; + return - ISDIGIT (*b); + } + return 0; +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. 
*/ + +static int +numcompare (register const unsigned char *a, register const unsigned char *b) +{ + register int tmpa, tmpb, loga, logb, tmp; + + tmpa = *a; + tmpb = *b; + + while (ISBLANK (tmpa)) + tmpa = *++a; + while (ISBLANK (tmpb)) + tmpb = *++b; + + if (tmpa == NEGATION_SIGN) + { + do + tmpa = *++a; + while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa)); + if (tmpb != NEGATION_SIGN) + { + if (tmpa == decimal_point) + do + tmpa = *++a; + while (tmpa == NUMERIC_ZERO); + if (ISDIGIT (tmpa)) + return -1; + while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb)) + tmpb = *++b; + if (tmpb == decimal_point) + do + tmpb = *++b; + while (tmpb == NUMERIC_ZERO); + if (ISDIGIT (tmpb)) + return -1; + return 0; + } + do + tmpb = *++b; + while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb)); + + while (tmpa == tmpb && ISDIGIT (tmpa)) + { + do + tmpa = *++a; + while (IS_THOUSANDS_SEP (tmpa)); + do + tmpb = *++b; + while (IS_THOUSANDS_SEP (tmpb)); + } + + if ((tmpa == decimal_point && !ISDIGIT (tmpb)) + || (tmpb == decimal_point && !ISDIGIT (tmpa))) + return -fraccompare (a, b); + + tmp = tmpb - tmpa; + + for (loga = 0; ISDIGIT (tmpa); ++loga) + do + tmpa = *++a; + while (IS_THOUSANDS_SEP (tmpa)); + + for (logb = 0; ISDIGIT (tmpb); ++logb) + do + tmpb = *++b; + while (IS_THOUSANDS_SEP (tmpb)); + + if (logb - loga != 0) + return logb - loga; + + if (!loga) + return 0; + + return tmp; + } + else if (tmpb == NEGATION_SIGN) + { + do + tmpb = *++b; + while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb)); + if (tmpb == decimal_point) + do + tmpb = *++b; + while (tmpb == NUMERIC_ZERO); + if (ISDIGIT (tmpb)) + return 1; + while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa)) + tmpa = *++a; + if (tmpa == decimal_point) + do + tmpa = *++a; + while (tmpa == NUMERIC_ZERO); + if (ISDIGIT (tmpa)) + return 1; + return 0; + } + else + { + while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa)) + tmpa = *++a; + while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb)) + tmpb = *++b; + 
+ while (tmpa == tmpb && ISDIGIT (tmpa)) + { + do + tmpa = *++a; + while (IS_THOUSANDS_SEP (tmpa)); + do + tmpb = *++b; + while (IS_THOUSANDS_SEP (tmpb)); + } + + if ((tmpa == decimal_point && !ISDIGIT (tmpb)) + || (tmpb == decimal_point && !ISDIGIT (tmpa))) + return fraccompare (a, b); + + tmp = tmpa - tmpb; + + for (loga = 0; ISDIGIT (tmpa); ++loga) + do + tmpa = *++a; + while (IS_THOUSANDS_SEP (tmpa)); + + for (logb = 0; ISDIGIT (tmpb); ++logb) + do + tmpb = *++b; + while (IS_THOUSANDS_SEP (tmpb)); + + if (loga - logb != 0) + return loga - logb; + + if (!loga) + return 0; + + return tmp; + } +} + /* Return <0 if the join field in LINE1 compares less than the one in LINE2; >0 if it compares greater; 0 if it compares equal. */ @@ -321,7 +549,7 @@ keycmp (struct line *line1, struct line *line2) { /* Start of field to compare in each file. */ - const unsigned char *beg1, *beg2; + unsigned char *beg1, *beg2; int len1, len2; /* Length of fields to compare. */ int diff; @@ -356,8 +584,19 @@ /* Use an if-statement here rather than a function variable to avoid portability hassles of getting a non-conflicting declaration of memcmp. */ - if (ignore_case) + if (numeric) { + unsigned char save1, save2; + + save1 = beg1[len1]; + save2 = beg2[len2]; + beg1[len1] = beg2[len2] = '\0'; + diff = numcompare(beg1, beg2); + beg1[len1] = save1; + beg2[len2] = save2; + } + else if (ignore_case) + { /* FIXME: ignore_case does not work with NLS (in particular, with multibyte chars). */ diff = memcasecmp (beg1, beg2, min (len1, len2)); @@ -738,6 +977,23 @@ #ifdef ENABLE_NLS hard_LC_COLLATE = hard_locale (LC_COLLATE); + + /* Let's get locale's representation of the decimal point */ + { + struct lconv *lconvp = localeconv (); + + /* If the locale doesn't define a decimal point, or if the decimal + point is multibyte, use the C decimal point. We don't support + multibyte decimal points yet. */ + decimal_point = *lconvp->decimal_point; + if (! 
decimal_point || lconvp->decimal_point[1]) + decimal_point = C_DECIMAL_POINT; + + /* We don't support multibyte thousands separators yet. */ + th_sep = *lconvp->thousands_sep; + if (! th_sep || lconvp->thousands_sep[1]) + th_sep = CHAR_MAX + 1; + } #endif /* Initialize this before parsing options. In parsing options, @@ -747,7 +1003,7 @@ nfiles = 0; print_pairables = 1; - while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts, + while ((optc = getopt_long_only (argc, argv, "-a:e:in1:2:o:t:v:", longopts, NULL)) != -1) { long int val; @@ -777,6 +1033,10 @@ case 'i': ignore_case = 1; + break; + + case 'n': + numeric = 1; break; case '1':