bug-textutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

join uses too much RAM sometimes


From: Andy Jewell
Subject: join uses too much RAM sometimes
Date: Wed, 25 Oct 2000 21:11:24 -0400

When joining large files where one file has many, many, many instances of the same matching field, memory use is excessive (since join reads all matching lines from both files into RAM before proceeding).

Below are diffs that solve this problem.

Andy Jewell
address@hidden


--- textutils-2.0/src/join.c    Sun Jul  4 03:38:02 1999
+++ join.c      Wed Oct 25 11:27:21 2000
@@ -486,6 +501,7 @@
   struct seq seq1, seq2;
   struct line line;
   int diff, i, j, eof1, eof2;
+  int end1, end2;

   /* Read the first line of each file.  */
   initseq (&seq1);
@@ -515,35 +531,85 @@
          continue;
        }

-      /* Keep reading lines from file1 as long as they continue to
-         match the current line from file2.  */
+ /* Read lines from file1 and file2 until one of them stops matching the other */
       eof1 = 0;
-      do
-       if (!getseq (fp1, &seq1))
-         {
-           eof1 = 1;
-           ++seq1.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
-      /* Keep reading lines from file2 as long as they continue to
-         match the current line from file1.  */
       eof2 = 0;
-      do
-       if (!getseq (fp2, &seq2))
-         {
-           eof2 = 1;
-           ++seq2.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+      end1 = 0;
+      end2 = 0;

-      if (print_pairables)
+      while (1)
        {
-         for (i = 0; i < seq1.count - 1; ++i)
+         if (!getseq (fp1, &seq1))
+           {
+             eof1 = 1;
+             end1 = 1;
+             ++seq1.count;
+             break;
+           }
+
+         if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+           {
+             end1 = 1;
+             break;
+           }
+
+         if (!getseq (fp2, &seq2))
+           {
+             eof2 = 1;
+             end2 = 1;
+             ++seq2.count;
+             break;
+           }
+         if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+           {
+             end2 = 1;
+             break;
+           }
+       }
+
+      if (end1)
+       {
+         for (i = 0; i < seq2.count; ++i)
+           {
+             for (j = 0; j < seq1.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+             freeline (&seq2.lines[i]);
+           }
+         while (1) {
+           seq2.count = 0;
+           if (!getseq (fp2, &seq2))
+             {
+               eof2 = 1;
+               ++seq2.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+           for (j = 0; j < seq1.count - 1; ++j)
+             if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+           freeline (&seq2.lines[0]);
+         }
+       }
+      else /* end2 */
+       {
+         for (i = 0; i < seq1.count; ++i)
+           {
+             for (j = 0; j < seq2.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+             freeline (&seq1.lines[i]);
+           }
+         while (1) {
+           seq1.count = 0;
+           if (!getseq (fp1, &seq1))
+             {
+               eof1 = 1;
+               ++seq1.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
            for (j = 0; j < seq2.count - 1; ++j)
-             prjoin (&seq1.lines[i], &seq2.lines[j]);
+             if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+           freeline (&seq1.lines[0]);
+         }
        }

       for (i = 0; i < seq1.count - 1; ++i)
@@ -555,7 +621,7 @@
        }
       else
        seq1.count = 0;
-
+
       for (i = 0; i < seq2.count - 1; ++i)
        freeline (&seq2.lines[i]);
       if (!eof2)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]