bug-textutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] MAJOR speed increase for comm/join/nl/tail/uniq


From: Padraig Brady
Subject: [PATCH] MAJOR speed increase for comm/join/nl/tail/uniq
Date: Mon, 24 Sep 2001 20:53:47 +0100
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.4) Gecko/20010913

I noticed that textutils/lib/linebuffer.c uses a very inefficient algorithm.
Essentially it calls getc() for every character in the stream!!
I changed this to use fgets() instead. The speedup will be greater the
longer the lines are, but for "standard" text files it gives a 220%
speedup on my RH7.1 system. Note the potential speedups are
much greater on other systems. For e.g. I've experienced (but not
tested yet) getc() on windoze taking a huge amount of time relative
to the linux implementation due to it's locking mechanisms. You
can do a quick check by: time cat "loads of text files" | uniq
I also added in an optimized special case version when you know
there won't by NULLs in the input stream.

Does any other GNU software use linebuffer?

to apply the patch:
cd textutils; patch -p1 < textutils-linebuffer-speedup.diff

cheers,
Padraig.
diff -aru -x *.o textutils-2.0.15/lib/linebuffer.c 
textutils-2.0.15_pb/lib/linebuffer.c
--- textutils-2.0.15/lib/linebuffer.c   Mon Aug  7 16:48:18 2000
+++ textutils-2.0.15_pb/lib/linebuffer.c        Mon Sep 24 20:48:14 2001
@@ -15,13 +15,15 @@
    along with this program; if not, write to the Free Software Foundation,
    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
 
-/* Written by Richard Stallman. */
+/* Written by Richard Stallman and Padraig Brady */
 
 #ifdef HAVE_CONFIG_H
 # include <config.h>
 #endif
 
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include <sys/types.h>
 #include "linebuffer.h"
 
@@ -29,57 +31,123 @@
 char *xrealloc ();
 void free ();
 
-/* Initialize linebuffer LINEBUFFER for use. */
+/* Should really allow this to be passed to initbuffer */
+#define INITIAL_LINE_LENGTH 200 
 
+/* Initialize linebuffer LINEBUFFER for use. */
 void
 initbuffer (struct linebuffer *linebuffer)
 {
-  linebuffer->length = 0;
-  linebuffer->size = 200;
-  linebuffer->buffer = (char *) xmalloc (linebuffer->size);
+    linebuffer->length = 0;
+    linebuffer->size = INITIAL_LINE_LENGTH;
+    linebuffer->buffer = (char *) xmalloc (linebuffer->size);
 }
 
 /* Read an arbitrarily long line of text from STREAM into LINEBUFFER.
    Keep the newline; append a newline if it's the last line of a file
-   that ends in a non-newline character.  Do not null terminate.
-   Return LINEBUFFER, except at end of file return 0.  */
-
+   that ends in a non-newline character. Do not null terminate.
+   Therefore the stream can contain NULLs and the length (including 
+   the newline) is returned in linebuffer->size.
+   Return LINEBUFFER, except at end of file or error return NULL.*/
 struct linebuffer *
 readline (struct linebuffer *linebuffer, FILE *stream)
 {
-  int c;
-  char *buffer = linebuffer->buffer;
-  char *p = linebuffer->buffer;
-  char *end = buffer + linebuffer->size; /* Sentinel. */
-
-  if (feof (stream) || ferror (stream))
-    return 0;
-
-  do
-    {
-      c = getc (stream);
-      if (c == EOF)
-       {
-         if (p == buffer)
-           return 0;
-         if (p[-1] == '\n')
-           break;
-         c = '\n';
-       }
-      if (p == end)
-       {
-         linebuffer->size *= 2;
-         buffer = (char *) xrealloc (buffer, linebuffer->size);
-         p = p - linebuffer->buffer + buffer;
-         linebuffer->buffer = buffer;
-         end = buffer + linebuffer->size;
-       }
-      *p++ = c;
+    size_t fgetSize;
+    char* fgetPos;
+    char* newline;
+    char* fgetsRet;
+    char* buf;
+
+    fgetPos = buf = linebuffer->buffer;
+    fgetSize = linebuffer->size;
+
+    if (feof(stream) || ferror(stream))
+        return NULL;
+
+    while(1) {
+        memset(fgetPos, '\1', fgetSize); /* Can be anything except 0 */
+        if ((fgetsRet=fgets(fgetPos, (int) fgetSize, stream))) {
+            newline=memchr(fgetPos, '\n', fgetSize);
+            if (!newline) { /* No \n => buffer possibly full */
+                if (buf[linebuffer->size-1] == '\0') { /*buffer full*/
+                    if (!feof(stream)) {
+                        fgetSize = linebuffer->size + 1;
+                        linebuffer->size *= 2;
+                           buf = (char *) xrealloc (buf, linebuffer->size);
+                        linebuffer->buffer = buf;
+                        fgetPos = buf + fgetSize -2;
+                        continue;
+                    } else {
+                        /* small chance incomplete line fit buffer exactly */
+                        newline = &buf[linebuffer->size-1];
+                        *newline = '\n';
+                    }
+                } else { /* incomplete line received */
+                    /* why isn't there a memrchr() ? */
+                    newline=&buf[linebuffer->size-1];
+                    while (fgetSize > 1) {
+                        if (*newline=='\0')
+                            break;
+                        newline--;fgetSize--;
+                    }
+                    *newline = '\n';
+                }
+            }
+        } else {
+            return NULL;
+        }
+        break;
     }
-  while (c != '\n');
+    linebuffer->length = newline - buf + 1;
+    return linebuffer;
+}
 
-  linebuffer->length = p - buffer;
-  return linebuffer;
+/* Read a long line of text from STREAM into LINEBUFFER.
+   Remove any newline & null terminate (so can't be \0's in stream),
+   Return LINEBUFFER, except at end of file or error return NULL.
+   Note the length is not returned in linebuffer->length but can
+   be easily determined using strlen(linebuffer->buffer) if required. */
+struct linebuffer *
+readlineNoNulls (struct linebuffer *linebuffer, FILE *stream)
+{
+    size_t fgetSize;
+    char* fgetPos;
+    char* newline;
+    char* fgetsRet;
+    char* buf;
+
+    fgetPos = buf = linebuffer->buffer;
+    fgetSize = linebuffer->size;
+    buf[0]='\0';
+
+    if (feof(stream) || ferror(stream))
+        return NULL;
+
+    while(1) {
+        buf[linebuffer->size-2]='\0'; //flag for buffer full
+        if ((fgetsRet=fgets(fgetPos, (int) fgetSize, stream))) {
+            newline=strchr(fgetPos, '\n');
+            if (newline)
+                *newline='\0';
+            else {
+                if (buf[linebuffer->size-2] != '\0') { //buffer possibly full
+                    if (!feof(stream)) {
+                        fgetSize = linebuffer->size + 1;
+                        linebuffer->size *= 2;
+                           buf = (char *) xrealloc (buf, linebuffer->size);
+                        linebuffer->buffer = buf;
+                        fgetPos = buf + fgetSize -2;
+                        continue;
+                    }
+                    //else small chance occurred that data matched buff size 
exactly
+                }
+            }
+        } else {
+            return NULL;
+        }
+        break;
+    }
+    return linebuffer;
 }
 
 /* Free linebuffer LINEBUFFER and its data, all allocated with malloc. */
diff -aru -x *.o textutils-2.0.15/lib/linebuffer.h 
textutils-2.0.15_pb/lib/linebuffer.h
--- textutils-2.0.15/lib/linebuffer.h   Mon Aug  7 16:48:18 2000
+++ textutils-2.0.15_pb/lib/linebuffer.h        Mon Sep 24 20:48:14 2001
@@ -40,10 +40,20 @@
 
 /* Read an arbitrarily long line of text from STREAM into LINEBUFFER.
    Keep the newline; append a newline if it's the last line of a file
-   that ends in a non-newline character.  Do not null terminate.
-   Return LINEBUFFER, except at end of file return 0.  */
+   that ends in a non-newline character. Do not null terminate.
+   Therefore the stream can contain NULLs and the length (including 
+   the newline) is returned in linebuffer->size.
+   Return LINEBUFFER, except at end of file or error return NULL.*/
 struct linebuffer *readline PARAMS ((struct linebuffer *linebuffer,
                                     FILE *stream));
+
+/* Read a long line of text from STREAM into LINEBUFFER.
+   Remove any newline & null terminate (so can't be \0's in stream),
+   Return LINEBUFFER, except at end of file or error return NULL.
+   Note the length is not returned in linebuffer->length but can
+   be easily determined using strlen(linebuffer->buffer) if required. */
+struct linebuffer *readlineNoNulls PARAMS ((struct linebuffer *linebuffer,
+                     FILE *stream));
 
 /* Free linebuffer LINEBUFFER and its data, all allocated with malloc. */
 void freebuffer PARAMS ((struct linebuffer *));

reply via email to

[Prev in Thread] Current Thread [Next in Thread]