[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
join can fail on large files
From: |
Andrew D Jewell |
Subject: |
join can fail on large files |
Date: |
Thu, 14 Feb 2002 13:23:13 -0500 |
The way join works is this :
For a given match, read all the matching lines from both files into
memory, and then print out the join results.
The problem :
If you're joining really big files (hundreds of gigabytes), the
matching lines from one file or the other can exceed your process
limit (no more than 2 gig, usually)
The solution :
Read the two lists in parallel, until one is done, and then use the
other, one line at a time, from disk.
The result :
No performance hit that I can find.
Peak memory requirement vastly reduced.
Some joins work that previously crashed.
The patch (from join.c in 2.0.20) :
--- src/join.c-orig Wed Feb 13 09:35:13 2002
+++ src/join.c Thu Feb 14 10:05:00 2002
@@ -494,6 +494,7 @@
struct seq seq1, seq2;
struct line line;
int diff, i, j, eof1, eof2;
+ int end1, end2;
/* Read the first line of each file. */
initseq (&seq1);
@@ -523,35 +524,85 @@
continue;
}
- /* Keep reading lines from file1 as long as they continue to
- match the current line from file2. */
+ /* Read lines from file1 and file2 until one of them stops
matching the other */
eof1 = 0;
- do
- if (!getseq (fp1, &seq1))
- {
- eof1 = 1;
- ++seq1.count;
- break;
- }
- while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
- /* Keep reading lines from file2 as long as they continue to
- match the current line from file1. */
eof2 = 0;
- do
- if (!getseq (fp2, &seq2))
- {
- eof2 = 1;
- ++seq2.count;
- break;
- }
- while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+ end1 = 0;
+ end2 = 0;
+
+ while (1)
+ {
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ end1 = 1;
+ ++seq1.count;
+ break;
+ }
+
+ if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+ {
+ end1 = 1;
+ break;
+ }
- if (print_pairables)
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ end2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+ {
+ end2 = 1;
+ break;
+ }
+ }
+
+ if (end1)
+ {
+ for (i = 0; i < seq2.count; ++i)
+ {
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+ freeline (&seq2.lines[i]);
+ }
+ while (1) {
+ seq2.count = 0;
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+ freeline (&seq2.lines[0]);
+ }
+ }
+ else /* end2 */
{
- for (i = 0; i < seq1.count - 1; ++i)
+ for (i = 0; i < seq1.count; ++i)
+ {
+ for (j = 0; j < seq2.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+ freeline (&seq1.lines[i]);
+ }
+ while (1) {
+ seq1.count = 0;
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ ++seq1.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
for (j = 0; j < seq2.count - 1; ++j)
- prjoin (&seq1.lines[i], &seq2.lines[j]);
+ if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+ freeline (&seq1.lines[0]);
+ }
}
for (i = 0; i < seq1.count - 1; ++i)
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- join can fail on large files,
Andrew D Jewell <=