[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
join uses too much RAM sometimes
From: |
Andy Jewell |
Subject: |
join uses too much RAM sometimes |
Date: |
Wed, 25 Oct 2000 21:11:24 -0400 |
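When joining large files where one file has a very large number of lines
with the same matching field, memory use is excessive (since join reads
all matching lines from both files into RAM before proceeding).
Below are diffs that solve this problem. With the patch, join reads
alternately from both files until one file's run of matching lines ends,
keeps only that shorter run in memory, and then reads the remaining
matching lines of the other file one at a time, printing and freeing each.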
Andy Jewell
address@hidden
--- textutils-2.0/src/join.c Sun Jul 4 03:38:02 1999
+++ join.c Wed Oct 25 11:27:21 2000
@@ -486,6 +501,7 @@
struct seq seq1, seq2;
struct line line;
int diff, i, j, eof1, eof2;
+ int end1, end2;
/* Read the first line of each file. */
initseq (&seq1);
@@ -515,35 +531,85 @@
continue;
}
- /* Keep reading lines from file1 as long as they continue to
- match the current line from file2. */
+ /* Read lines from file1 and file2 until one of them stops
+ matching the other */
eof1 = 0;
- do
- if (!getseq (fp1, &seq1))
- {
- eof1 = 1;
- ++seq1.count;
- break;
- }
- while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
- /* Keep reading lines from file2 as long as they continue to
- match the current line from file1. */
eof2 = 0;
- do
- if (!getseq (fp2, &seq2))
- {
- eof2 = 1;
- ++seq2.count;
- break;
- }
- while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+ end1 = 0;
+ end2 = 0;
- if (print_pairables)
+ while (1)
{
- for (i = 0; i < seq1.count - 1; ++i)
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ end1 = 1;
+ ++seq1.count;
+ break;
+ }
+
+ if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+ {
+ end1 = 1;
+ break;
+ }
+
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ end2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+ {
+ end2 = 1;
+ break;
+ }
+ }
+
+ if (end1)
+ {
+ for (i = 0; i < seq2.count; ++i)
+ {
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+ freeline (&seq2.lines[i]);
+ }
+ while (1) {
+ seq2.count = 0;
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+ freeline (&seq2.lines[0]);
+ }
+ }
+ else /* end2 */
+ {
+ for (i = 0; i < seq1.count; ++i)
+ {
+ for (j = 0; j < seq2.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+ freeline (&seq1.lines[i]);
+ }
+ while (1) {
+ seq1.count = 0;
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ ++seq1.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
for (j = 0; j < seq2.count - 1; ++j)
- prjoin (&seq1.lines[i], &seq2.lines[j]);
+ if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+ freeline (&seq1.lines[0]);
+ }
}
for (i = 0; i < seq1.count - 1; ++i)
@@ -555,7 +621,7 @@
}
else
seq1.count = 0;
-
+
for (i = 0; i < seq2.count - 1; ++i)
freeline (&seq2.lines[i]);
if (!eof2)
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- join uses too much RAM sometimes,
Andy Jewell <=