coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[coreutils] [PATCH] sort: -R now uses less memory on long lines with int


From: Paul Eggert
Subject: [coreutils] [PATCH] sort: -R now uses less memory on long lines with internal NULs
Date: Wed, 04 Aug 2010 16:12:56 -0700
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.11) Gecko/20100713 Thunderbird/3.0.6

* lib/Makefile.am (libcoreutils_a_SOURCES): Remove xmemxfrm.c,
xmemxfrm.h.
* lib/memxfrm.c, lib/memxfrm.h, lib/xmemxfrm.c, lib/xmemxfrm.h: Remove.
* m4/memxfrm.m4: Likewise.
* m4/prereq.m4 (gl_PREREQ): Remove gl_MEMXFRM.
* po/POTFILES.in: Remove lib/xmemxfrm.c.
* src/sort.c: Don't include xmemxfrm.h.
(cmp_hashes): Remove.
(xstrxfrm): New function.
(compare_random): If a line contains NULs, don't create a big
buffer that contains the strxfrm output of each string in the line.
Instead, accumulate checksums and differences as we go, so that
at any one time we have to store at most the output of a single
strxfrm call when processing the line.  This removes the need for
an memxfrm function.
---
 lib/Makefile.am |    3 +-
 lib/memxfrm.c   |  105 ----------------------------------
 lib/memxfrm.h   |    2 -
 lib/xmemxfrm.c  |   62 --------------------
 lib/xmemxfrm.h  |    2 -
 m4/memxfrm.m4   |   15 -----
 m4/prereq.m4    |    3 +-
 po/POTFILES.in  |    1 -
 src/sort.c      |  167 ++++++++++++++++++++++++++++++++++++++-----------------
 9 files changed, 118 insertions(+), 242 deletions(-)
 delete mode 100644 lib/memxfrm.c
 delete mode 100644 lib/memxfrm.h
 delete mode 100644 lib/xmemxfrm.c
 delete mode 100644 lib/xmemxfrm.h
 delete mode 100644 m4/memxfrm.m4

diff --git a/lib/Makefile.am b/lib/Makefile.am
index c89ef75..b4a591b 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -20,8 +20,7 @@ include gnulib.mk
 AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)
 
 libcoreutils_a_SOURCES += \
-  buffer-lcm.c buffer-lcm.h \
-  xmemxfrm.c xmemxfrm.h
+  buffer-lcm.c buffer-lcm.h
 
 libcoreutils_a_LIBADD += $(LIBOBJS)
 libcoreutils_a_DEPENDENCIES += $(LIBOBJS)
diff --git a/lib/memxfrm.c b/lib/memxfrm.c
deleted file mode 100644
index 12a1ae9..0000000
--- a/lib/memxfrm.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Locale-specific memory transformation
-
-   Copyright (C) 2006, 2009-2010 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert <address@hidden>.  */
-
-#include <config.h>
-
-#include "memxfrm.h"
-
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-
-/* Store into DEST (of size DESTSIZE) the text in SRC (of size SRCSIZE)
-   transformed so that the result of memcmp on two transformed texts
-   (with ties going to the longer text) is the same as the result of
-   memcoll on the two texts before their transformation.  Perhaps
-   temporarily modify the byte after SRC, but restore its original
-   contents before returning.
-
-   Return the size of the resulting text, or an indeterminate value if
-   there is an error.  Set errno to an error number if there is an
-   error, and to zero otherwise.  DEST contains an indeterminate value
-   if there is an error or if the resulting size is greater than
-   DESTSIZE.  */
-
-size_t
-memxfrm (char *restrict dest, size_t destsize,
-         char *restrict src, size_t srcsize)
-{
-#if HAVE_STRXFRM
-
-  size_t di = 0;
-  size_t si = 0;
-  size_t result = 0;
-
-  char ch = src[srcsize];
-  src[srcsize] = '\0';
-
-  while (si < srcsize)
-    {
-      size_t slen = strlen (src + si);
-
-      size_t result0 = result;
-      errno = 0;
-      result += strxfrm (dest + di, src + si, destsize - di) + 1;
-      if (errno != 0)
-        break;
-      if (result <= result0)
-        {
-          errno = ERANGE;
-          break;
-        }
-
-      if (result == destsize + 1 && si + slen == srcsize)
-        {
-          /* The destination is exactly the right size, but strxfrm wants
-             room for a trailing null.  Work around the problem with a
-             temporary buffer.  */
-          size_t bufsize = destsize - di + 1;
-          char stackbuf[4000];
-          char *buf = stackbuf;
-          if (sizeof stackbuf < bufsize)
-            {
-              buf = malloc (bufsize);
-              if (! buf)
-                break;
-            }
-          strxfrm (buf, src + si, bufsize);
-          memcpy (dest + di, buf, destsize - di);
-          if (sizeof stackbuf < bufsize)
-            free (buf);
-          errno = 0;
-        }
-
-      di = (result < destsize ? result : destsize);
-      si += slen + 1;
-    }
-
-  src[srcsize] = ch;
-  return result - (si != srcsize);
-
-#else
-
-  if (srcsize < destsize)
-    memcpy (dest, src, srcsize);
-  errno = 0;
-  return srcsize;
-
-#endif
-}
diff --git a/lib/memxfrm.h b/lib/memxfrm.h
deleted file mode 100644
index 605421d..0000000
--- a/lib/memxfrm.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <stddef.h>
-size_t memxfrm (char *restrict, size_t, char *restrict, size_t);
diff --git a/lib/xmemxfrm.c b/lib/xmemxfrm.c
deleted file mode 100644
index 9cdda98..0000000
--- a/lib/xmemxfrm.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Locale-specific memory transformation
-
-   Copyright (C) 2006, 2009-2010 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert <address@hidden>.  */
-
-#include <config.h>
-
-#include "xmemxfrm.h"
-
-#include <errno.h>
-#include <stdlib.h>
-
-#include "gettext.h"
-#define _(msgid) gettext (msgid)
-
-#include "error.h"
-#include "exitfail.h"
-#include "memxfrm.h"
-#include "quotearg.h"
-
-/* Store into DEST (of size DESTSIZE) the text in SRC (of size
-   SRCSIZE) transformed so that the result of memcmp on two
-   transformed texts (with ties going to the longer text) is the same
-   as the result of memcoll on the two texts before their
-   transformation.  Perhaps temporarily modify the byte after SRC, but
-   restore its original contents before returning.
-
-   Return the size of the resulting text.  DEST contains an
-   indeterminate value if the resulting size is greater than DESTSIZE.
-   Report an error and exit if there is an error.  */
-
-size_t
-xmemxfrm (char *restrict dest, size_t destsize,
-          char *restrict src, size_t srcsize)
-{
-  size_t translated_size = memxfrm (dest, destsize, src, srcsize);
-
-  if (errno)
-    {
-      error (0, errno, _("string transformation failed"));
-      error (0, 0, _("set LC_ALL='C' to work around the problem"));
-      error (exit_failure, 0,
-             _("the untransformed string was %s"),
-             quotearg_n_style_mem (0, locale_quoting_style, src, srcsize));
-    }
-
-  return translated_size;
-}
diff --git a/lib/xmemxfrm.h b/lib/xmemxfrm.h
deleted file mode 100644
index 7c4b8ad..0000000
--- a/lib/xmemxfrm.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <stddef.h>
-size_t xmemxfrm (char *restrict, size_t, char *restrict, size_t);
diff --git a/m4/memxfrm.m4 b/m4/memxfrm.m4
deleted file mode 100644
index 47a17d3..0000000
--- a/m4/memxfrm.m4
+++ /dev/null
@@ -1,15 +0,0 @@
-dnl Copyright (C) 2006, 2009-2010 Free Software Foundation, Inc.
-dnl This file is free software; the Free Software Foundation
-dnl gives unlimited permission to copy and/or distribute it,
-dnl with or without modifications, as long as this notice is preserved.
-
-AC_DEFUN([gl_MEMXFRM],
-[
-  AC_LIBSOURCES([memxfrm.c, memxfrm.h])
-  AC_LIBOBJ([memxfrm])
-
-  AC_REQUIRE([AC_C_RESTRICT])
-
-  dnl Prerequisites of lib/memcoll.c.
-  AC_CHECK_FUNCS_ONCE([strxfrm])
-])
diff --git a/m4/prereq.m4 b/m4/prereq.m4
index 24478cb..8caea38 100644
--- a/m4/prereq.m4
+++ b/m4/prereq.m4
@@ -1,4 +1,4 @@
-#serial 76
+#serial 77
 
 dnl We use gl_ for non Autoconf macros.
 m4_pattern_forbid([^gl_[ABCDEFGHIJKLMNOPQRSTUVXYZ]])dnl
@@ -40,7 +40,6 @@ AC_DEFUN([gl_PREREQ],
   AC_REQUIRE([gl_FD_REOPEN])
   AC_REQUIRE([gl_FUNC_XATTR])
   AC_REQUIRE([gl_FUNC_XFTS])
-  AC_REQUIRE([gl_MEMXFRM])
   AC_REQUIRE([gl_STRINTCMP])
   AC_REQUIRE([gl_STRNUMCMP])
 ])
diff --git a/po/POTFILES.in b/po/POTFILES.in
index c862877..be20f4c 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -29,7 +29,6 @@ lib/version-etc.c
 lib/xalloc-die.c
 lib/xfreopen.c
 lib/xmemcoll.c
-lib/xmemxfrm.c
 lib/xprintf.c
 lib/xstrtol-error.c
 
diff --git a/src/sort.c b/src/sort.c
index 1df711d..5438263 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -48,7 +48,6 @@
 #include "stdlib--.h"
 #include "strnumcmp.h"
 #include "xmemcoll.h"
-#include "xmemxfrm.h"
 #include "xnanosleep.h"
 #include "xstrtol.h"
 
@@ -2001,34 +2000,24 @@ random_md5_state_init (char const *random_source)
   md5_process_bytes (buf, sizeof buf, &random_md5_state);
 }
 
-/* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
-   with length LENGTHB.  Return negative if less, zero if equal,
-   positive if greater.  */
+/* This is like strxfrm, except it reports any error and exits.  */
 
-static int
-cmp_hashes (char const *texta, size_t lena,
-            char const *textb, size_t lenb)
+static size_t
+xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
 {
-  /* Try random hashes until a pair of hashes disagree.  But if the
-     first pair of random hashes agree, check whether the keys are
-     identical and if so report no difference.  */
-  int diff;
-  uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
-  struct md5_ctx s[2];
-  s[0] = s[1] = random_md5_state;
-  md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
-  md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
-  diff = memcmp (dig[0], dig[1], sizeof dig[0]);
-  if (! diff)
+  errno = 0;
+  size_t translated_size = strxfrm (dest, src, destsize);
+
+  if (errno)
     {
-      /* Break ties with memcmp.  This is good enough given the
-         astronomically low probability of an MD5 collision.  */
-      diff = memcmp (texta, textb, MIN (lena, lenb));
-      if (! diff)
-        diff = lena < lenb ? -1 : lena != lenb;
+      error (0, errno, _("string transformation failed"));
+      error (0, 0, _("set LC_ALL='C' to work around the problem"));
+      error (SORT_FAILURE, 0,
+             _("the untransformed string was %s"),
+             quotearg_n_style (0, locale_quoting_style, src));
     }
 
-  return diff;
+  return translated_size;
 }
 
 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
@@ -2038,41 +2027,117 @@ static int
 compare_random (char *restrict texta, size_t lena,
                 char *restrict textb, size_t lenb)
 {
-  int diff;
+  /* XFRM_DIFF records the equivalent of memcmp on the transformed
+     data.  This is used to break ties if there is an checksum
+     collision, and this is good enough given the astronomically low
+     probability of a collision.  */
+  int xfrm_diff = 0;
+
+  char stackbuf[4000];
+  char *buf = stackbuf;
+  size_t bufsize = sizeof stackbuf;
+  uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
+  struct md5_ctx s[2];
+  s[0] = s[1] = random_md5_state;
 
-  if (! hard_LC_COLLATE)
-    diff = cmp_hashes (texta, lena, textb, lenb);
-  else
+  if (hard_LC_COLLATE)
     {
-      /* Transform the text into the basis of comparison, so that byte
-         strings that would otherwise considered to be equal are
-         considered equal here even if their bytes differ.  */
-
-      char *buf = NULL;
-      char stackbuf[4000];
-      size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
-      bool a_fits = tlena <= sizeof stackbuf;
-      size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
-                               (a_fits ? sizeof stackbuf - tlena : 0),
-                               textb, lenb);
-
-      if (a_fits && tlena + tlenb <= sizeof stackbuf)
-        buf = stackbuf;
-      else
+      /* Null-terminate the keys so that strxfrm knows where to stop.  */
+      char *lima = texta + lena; char enda = *lima; *lima = '\0';
+      char *limb = textb + lenb; char endb = *limb; *limb = '\0';
+
+      while (true)
         {
-          /* Adding 1 to the buffer size lets xmemxfrm run a bit
-             faster by avoiding the need for an extra buffer copy.  */
-          buf = xmalloc (tlena + tlenb + 1);
-          xmemxfrm (buf, tlena + 1, texta, lena);
-          xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
+          /* Transform the text into the basis of comparison, so that byte
+             strings that would otherwise considered to be equal are
+             considered equal here even if their bytes differ.
+
+             Each time through this loop, transform one
+             null-terminated string's worth from TEXTA or from TEXTB
+             or both.  That way, there's no need to store the
+             transformation of the whole line, if it contains many
+             null-terminated strings.  */
+
+          /* Store the transformed data into a big-enough buffer.  */
+
+          size_t sizea =
+            (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
+          bool a_fits = sizea <= bufsize;
+          size_t sizeb =
+            (textb < limb
+             ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
+                          (a_fits ? bufsize - sizea : 0))
+                + 1)
+             : 0);
+
+          if (! (a_fits && sizea + sizeb <= bufsize))
+            {
+              bufsize = sizea + sizeb;
+              if (bufsize < SIZE_MAX / 3)
+                bufsize = bufsize * 3 / 2;
+              buf = (buf == stackbuf
+                     ? xmalloc (bufsize)
+                     : xrealloc (buf, bufsize));
+              if (texta < lima)
+                strxfrm (buf, texta, sizea);
+              if (textb < limb)
+                strxfrm (buf + sizea, textb, sizeb);
+            }
+
+          /* Advance past NULs to the next part of each input string,
+             exiting the loop if both strings are exhausted.  When
+             exiting the loop, prepare to finish off the tiebreaker
+             comparison properly.  */
+          if (texta < lima)
+            texta += strlen (texta) + 1;
+          if (textb < limb)
+            textb += strlen (textb) + 1;
+          if (! (texta < lima || textb < limb))
+            {
+              lena = sizea; texta = buf;
+              lenb = sizeb; textb = buf + sizea;
+              break;
+            }
+
+          /* Accumulate the transformed data in the corresponding
+             checksums.  */
+          md5_process_bytes (buf, sizea, &s[0]);
+          md5_process_bytes (buf + sizea, sizeb, &s[1]);
+
+          /* Update the tiebreaker comparison of the transformed data.  */
+          if (! xfrm_diff)
+            {
+              xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
+              if (! xfrm_diff)
+                xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
+            }
         }
 
-      diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
+      *lima = enda;
+      *limb = endb;
+    }
+
+  /* Compute and compare the checksums.  */
+  md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
+  md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
+  int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
+
+  /* Fall back on the tiebreaker if the checksums collide.  */
+  if (! diff)
+    {
+      if (! xfrm_diff)
+        {
+          xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
+          if (! xfrm_diff)
+            xfrm_diff = (lena > lenb) - (lena < lenb);
+        }
 
-      if (buf != stackbuf)
-        free (buf);
+      diff = xfrm_diff;
     }
 
+  if (buf != stackbuf)
+    free (buf);
+
   return diff;
 }
 
-- 
1.7.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]