join can fail on large files

bug-textutils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

join can fail on large files

From:	Andrew D Jewell
Subject:	join can fail on large files
Date:	Thu, 14 Feb 2002 13:23:13 -0500

The way join works is this :

For a given match, read all the matching lines from both files intomemory, and then print out the join results.


The problem :

If you're joining really big files (hundreds of gigabytes), thematching lines from one file or the other can exceed your processlimit (no more than 2 gig, usually)


The solution :

Read the two lists in parallel, until one is done, and then use theother, one line at a time, from disk.


The result :

No performance hit that I can find.
Peak memory requirement vastly reduced.
Some joins work that previously crashed.

The patch (from join.c in 2.0.20) :

--- src/join.c-orig  Wed Feb 13 09:35:13 2002
+++ src/join.c      Thu Feb 14 10:05:00 2002
@@ -494,6 +494,7 @@
   struct seq seq1, seq2;
   struct line line;
   int diff, i, j, eof1, eof2;
+  int end1, end2;

   /* Read the first line of each file.  */
   initseq (&seq1);
@@ -523,35 +524,85 @@
          continue;
        }

-      /* Keep reading lines from file1 as long as they continue to
-         match the current line from file2.  */

+ /* Read lines from file1 and file2 until one of them stopsmatching the other */

       eof1 = 0;
-      do
-       if (!getseq (fp1, &seq1))
-         {
-           eof1 = 1;
-           ++seq1.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
-      /* Keep reading lines from file2 as long as they continue to
-         match the current line from file1.  */
       eof2 = 0;
-      do
-       if (!getseq (fp2, &seq2))
-         {
-           eof2 = 1;
-           ++seq2.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+      end1 = 0;
+      end2 = 0;
+
+      while (1)
+       {
+         if (!getseq (fp1, &seq1))
+           {
+             eof1 = 1;
+             end1 = 1;
+             ++seq1.count;
+             break;
+           }
+
+         if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+           {
+             end1 = 1;
+             break;
+           }

-      if (print_pairables)
+         if (!getseq (fp2, &seq2))
+           {
+             eof2 = 1;
+             end2 = 1;
+             ++seq2.count;
+             break;
+           }
+         if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+           {
+             end2 = 1;
+             break;
+           }
+       }
+
+      if (end1)
+       {
+         for (i = 0; i < seq2.count; ++i)
+           {
+             for (j = 0; j < seq1.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+             freeline (&seq2.lines[i]);
+           }
+         while (1) {
+           seq2.count = 0;
+           if (!getseq (fp2, &seq2))
+             {
+               eof2 = 1;
+               ++seq2.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+           for (j = 0; j < seq1.count - 1; ++j)
+             if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+           freeline (&seq2.lines[0]);
+         }
+       }
+      else /* end2 */
        {
-         for (i = 0; i < seq1.count - 1; ++i)
+         for (i = 0; i < seq1.count; ++i)
+           {
+             for (j = 0; j < seq2.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+             freeline (&seq1.lines[i]);
+           }
+         while (1) {
+           seq1.count = 0;
+           if (!getseq (fp1, &seq1))
+             {
+               eof1 = 1;
+               ++seq1.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
            for (j = 0; j < seq2.count - 1; ++j)
-             prjoin (&seq1.lines[i], &seq2.lines[j]);
+             if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+           freeline (&seq1.lines[0]);
+         }
        }

       for (i = 0; i < seq1.count - 1; ++i)

[Prev in Thread]

Current Thread

[Next in Thread]

join can fail on large files, Andrew D Jewell <=

Prev by Date: Re: md5sum suggestion
Next by Date: UNIX Utilities For NT
Previous by thread: md5sum suggestion
Next by thread: UNIX Utilities For NT
Index(es):
- Date
- Thread