bug-textutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

join can fail on large files


From: Andrew D Jewell
Subject: join can fail on large files
Date: Thu, 14 Feb 2002 13:23:13 -0500

The way join works is this :

For a given match, read all the matching lines from both files into memory, and then print out the join results.

The problem :

If you're joining really big files (hundreds of gigabytes), the matching lines from one file or the other can exceed your process limit (no more than 2 gig, usually)

The solution :

Read the two lists in parallel, until one is done, and then use the other, one line at a time, from disk.

The result :

No performance hit that I can find.
Peak memory requirement vastly reduced.
Some joins work that previously crashed.

The patch (from join.c in 2.0.20) :

--- src/join.c-orig  Wed Feb 13 09:35:13 2002
+++ src/join.c      Thu Feb 14 10:05:00 2002
@@ -494,6 +494,7 @@
   struct seq seq1, seq2;
   struct line line;
   int diff, i, j, eof1, eof2;
+  int end1, end2;

   /* Read the first line of each file.  */
   initseq (&seq1);
@@ -523,35 +524,85 @@
          continue;
        }

-      /* Keep reading lines from file1 as long as they continue to
-         match the current line from file2.  */
+ /* Read lines from file1 and file2 until one of them stops matching the other */
       eof1 = 0;
-      do
-       if (!getseq (fp1, &seq1))
-         {
-           eof1 = 1;
-           ++seq1.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
-      /* Keep reading lines from file2 as long as they continue to
-         match the current line from file1.  */
       eof2 = 0;
-      do
-       if (!getseq (fp2, &seq2))
-         {
-           eof2 = 1;
-           ++seq2.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+      end1 = 0;
+      end2 = 0;
+
+      while (1)
+       {
+         if (!getseq (fp1, &seq1))
+           {
+             eof1 = 1;
+             end1 = 1;
+             ++seq1.count;
+             break;
+           }
+
+         if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+           {
+             end1 = 1;
+             break;
+           }

-      if (print_pairables)
+         if (!getseq (fp2, &seq2))
+           {
+             eof2 = 1;
+             end2 = 1;
+             ++seq2.count;
+             break;
+           }
+         if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+           {
+             end2 = 1;
+             break;
+           }
+       }
+
+      if (end1)
+       {
+         for (i = 0; i < seq2.count; ++i)
+           {
+             for (j = 0; j < seq1.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+             freeline (&seq2.lines[i]);
+           }
+         while (1) {
+           seq2.count = 0;
+           if (!getseq (fp2, &seq2))
+             {
+               eof2 = 1;
+               ++seq2.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+           for (j = 0; j < seq1.count - 1; ++j)
+             if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+           freeline (&seq2.lines[0]);
+         }
+       }
+      else /* end2 */
        {
-         for (i = 0; i < seq1.count - 1; ++i)
+         for (i = 0; i < seq1.count; ++i)
+           {
+             for (j = 0; j < seq2.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+             freeline (&seq1.lines[i]);
+           }
+         while (1) {
+           seq1.count = 0;
+           if (!getseq (fp1, &seq1))
+             {
+               eof1 = 1;
+               ++seq1.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
            for (j = 0; j < seq2.count - 1; ++j)
-             prjoin (&seq1.lines[i], &seq2.lines[j]);
+             if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+           freeline (&seq1.lines[0]);
+         }
        }

       for (i = 0; i < seq1.count - 1; ++i)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]