bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/3] Tune single-byte code involving tolower


From: Paul Eggert
Subject: [PATCH 1/3] Tune single-byte code involving tolower
Date: Sat, 26 Aug 2023 23:34:20 -0700

* lib/mbmemcasecmp.c (mbmemcasecmp):
* lib/mbscasecmp.c (mbscasecmp):
* lib/mbscasestr.c (mbscasestr):
* lib/mbsncasecmp.c (mbsncasecmp):
* lib/mbspcasecmp.c (mbspcasecmp):
Avoid some unnecessary calls to tolower.  For example, if the two
single-byte characters are equal before downcasing there is no
need to call tolower on either character.
---
 ChangeLog          | 12 ++++++++++++
 lib/mbmemcasecmp.c | 46 +++++++++++++++++++++----------------------
 lib/mbscasecmp.c   | 49 +++++++++++++++++++++-------------------------
 lib/mbscasestr.c   | 13 +++++++-----
 lib/mbsncasecmp.c  | 45 +++++++++++++++++++++---------------------
 lib/mbspcasecmp.c  | 47 +++++++++++++++++++++-----------------------
 6 files changed, 109 insertions(+), 103 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ff09eed9ba..de52b57755 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2023-08-26  Paul Eggert  <eggert@cs.ucla.edu>
+
+       Tune single-byte code involving tolower
+       * lib/mbmemcasecmp.c (mbmemcasecmp):
+       * lib/mbscasecmp.c (mbscasecmp):
+       * lib/mbscasestr.c (mbscasestr):
+       * lib/mbsncasecmp.c (mbsncasecmp):
+       * lib/mbspcasecmp.c (mbspcasecmp):
+       Avoid some unnecessary calls to tolower.  For example, if the two
+       single-byte characters are equal before downcasing there is no
+       need to call tolower on either character.
+
 2023-08-26  Bruno Haible  <bruno@clisp.org>
 
        c32width tests: Avoid failure on FreeBSD 12.
diff --git a/lib/mbmemcasecmp.c b/lib/mbmemcasecmp.c
index f4a397033e..2bf4effa47 100644
--- a/lib/mbmemcasecmp.c
+++ b/lib/mbmemcasecmp.c
@@ -33,19 +33,18 @@ mbmemcasecmp (const char *s1, size_t n1, const char *s2, size_t n2)
   if (s1 == s2)
     return _GL_CMP (n1, n2);
 
+  const char *iter1 = s1;
+  const char *iter2 = s2;
+
   if (MB_CUR_MAX > 1)
     {
       const char *s1_end = s1 + n1;
       mbif_state_t state1;
-      const char *iter1;
       mbif_init (state1);
-      iter1 = s1;
 
       const char *s2_end = s2 + n2;
       mbif_state_t state2;
-      const char *iter2;
       mbif_init (state2);
-      iter2 = s2;
 
       while (mbif_avail (state1, iter1, s1_end)
              && mbif_avail (state2, iter2, s2_end))
@@ -70,32 +69,33 @@ mbmemcasecmp (const char *s1, size_t n1, const char *s2, size_t n2)
     }
   else
     {
-      const unsigned char *s1_end = (const unsigned char *) (s1 + n1);
-      const unsigned char *s2_end = (const unsigned char *) (s2 + n2);
-      const unsigned char *p1 = (const unsigned char *) s1;
-      const unsigned char *p2 = (const unsigned char *) s2;
+      const char *s1_end = s1 + n1;
+      const char *s2_end = s2 + n2;
 
-      while (p1 < s1_end && p2 < s2_end)
+      while (iter1 < s1_end && iter2 < s2_end)
         {
-          unsigned char c1 = tolower (*p1);
-          unsigned char c2 = tolower (*p2);
-          if (c1 != c2)
+          unsigned char c1 = *iter1++;
+          unsigned char c2 = *iter2++;
+          /* On machines where 'char' and 'int' are types of the same size,
+             the difference of two 'unsigned char' values - including
+             the sign bit - doesn't fit in an 'int'.  */
+          int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+          if (cmp != 0)
             {
-              if (UCHAR_MAX <= INT_MAX)
-                return c1 - c2;
-              else
-                /* On machines where 'char' and 'int' are types of the same
-                   size, the difference of two 'unsigned char' values
-                   - including the sign bit - doesn't fit in an 'int'.  */
-                return _GL_CMP (c1, c2);
+              c1 = tolower (c1);
+              if (c1 != c2)
+                {
+                  c2 = tolower (c2);
+                  cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+                  if (cmp != 0)
+                    return cmp;
+                }
             }
-          ++p1;
-          ++p2;
         }
-      if (p1 < s1_end)
+      if (iter1 < s1_end)
         /* s2 terminated before s1.  */
         return 1;
-      if (p2 < s2_end)
+      if (iter2 < s2_end)
         /* s1 terminated before s2.  */
         return -1;
       return 0;
diff --git a/lib/mbscasecmp.c b/lib/mbscasecmp.c
index 3a20cb7f3f..80dc18529d 100644
--- a/lib/mbscasecmp.c
+++ b/lib/mbscasecmp.c
@@ -37,20 +37,19 @@ mbscasecmp (const char *s1, const char *s2)
   if (s1 == s2)
     return 0;
 
+  const char *iter1 = s1;
+  const char *iter2 = s2;
+
   /* Be careful not to look at the entire extent of s1 or s2 until needed.
      This is useful because when two strings differ, the difference is
      most often already in the very few first characters.  */
   if (MB_CUR_MAX > 1)
     {
       mbuif_state_t state1;
-      const char *iter1;
       mbuif_init (state1);
-      iter1 = s1;
 
       mbuif_state_t state2;
-      const char *iter2;
       mbuif_init (state2);
-      iter2 = s2;
 
       while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
         {
@@ -73,30 +72,26 @@ mbscasecmp (const char *s1, const char *s2)
       return 0;
     }
   else
-    {
-      const unsigned char *p1 = (const unsigned char *) s1;
-      const unsigned char *p2 = (const unsigned char *) s2;
-      unsigned char c1, c2;
-
-      do
-        {
-          c1 = tolower (*p1);
-          c2 = tolower (*p2);
-
-          if (c1 == '\0')
-            break;
-
-          ++p1;
-          ++p2;
-        }
-      while (c1 == c2);
-
-      if (UCHAR_MAX <= INT_MAX)
-        return c1 - c2;
-      else
+    for (;;)
+      {
+        unsigned char c1 = *iter1++;
+        unsigned char c2 = *iter2++;
         /* On machines where 'char' and 'int' are types of the same size, the
            difference of two 'unsigned char' values - including the sign bit -
            doesn't fit in an 'int'.  */
-        return _GL_CMP (c1, c2);
-    }
+        int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+        if (cmp != 0)
+          {
+            c1 = tolower (c1);
+            if (c1 == c2)
+              cmp = 0;
+            else
+              {
+                c2 = tolower (c2);
+                cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+              }
+          }
+        if (cmp | !c1)
+          return cmp;
+      }
 }
diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c
index 6946fff21c..d2b439f1e1 100644
--- a/lib/mbscasestr.c
+++ b/lib/mbscasestr.c
@@ -337,8 +337,9 @@ mbscasestr (const char *haystack, const char *needle)
           const char *needle_last_ccount = needle; /* = needle + last_ccount */
 
           /* Speed up the following searches of needle by caching its first
-             character.  */
-          unsigned char b = tolower ((unsigned char) *needle);
+             character and lowercase counterpart.  */
+          unsigned char B = *needle;
+          unsigned char b = tolower (B);
 
           needle++;
           for (;; haystack++)
@@ -381,7 +382,8 @@ mbscasestr (const char *haystack, const char *needle)
 
               outer_loop_count++;
               comparison_count++;
-              if (tolower ((unsigned char) *haystack) == b)
+              unsigned char H = *haystack;
+              if (H == B || H == b || tolower (H) == b)
                 /* The first character matches.  */
                 {
                   const char *rhaystack = haystack + 1;
@@ -396,8 +398,9 @@ mbscasestr (const char *haystack, const char *needle)
                         /* No match.  */
                         return NULL;
                       comparison_count++;
-                      if (tolower ((unsigned char) *rhaystack)
-                          != tolower ((unsigned char) *rneedle))
+                      if (! (*rhaystack == *rneedle
+                             || (tolower ((unsigned char) *rhaystack)
+                                 == tolower ((unsigned char) *rneedle))))
                         /* Nothing in this round.  */
                         break;
                     }
diff --git a/lib/mbsncasecmp.c b/lib/mbsncasecmp.c
index 0b7027d91d..8ee1df7d4d 100644
--- a/lib/mbsncasecmp.c
+++ b/lib/mbsncasecmp.c
@@ -39,20 +39,19 @@ mbsncasecmp (const char *s1, const char *s2, size_t n)
   if (s1 == s2 || n == 0)
     return 0;
 
+  const char *iter1 = s1;
+  const char *iter2 = s2;
+
   /* Be careful not to look at the entire extent of s1 or s2 until needed.
      This is useful because when two strings differ, the difference is
      most often already in the very few first characters.  */
   if (MB_CUR_MAX > 1)
     {
       mbuif_state_t state1;
-      const char *iter1;
       mbuif_init (state1);
-      iter1 = s1;
 
       mbuif_state_t state2;
-      const char *iter2;
       mbuif_init (state2);
-      iter2 = s2;
 
       while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
         {
@@ -78,26 +77,26 @@ mbsncasecmp (const char *s1, const char *s2, size_t n)
       return 0;
     }
   else
-    {
-      const unsigned char *p1 = (const unsigned char *) s1;
-      const unsigned char *p2 = (const unsigned char *) s2;
-      unsigned char c1, c2;
-
-      for (; ; p1++, p2++)
-        {
-          c1 = tolower (*p1);
-          c2 = tolower (*p2);
-
-          if (--n == 0 || c1 == '\0' || c1 != c2)
-            break;
-        }
-
-      if (UCHAR_MAX <= INT_MAX)
-        return c1 - c2;
-      else
+    for (;;)
+      {
+        unsigned char c1 = *iter1++;
+        unsigned char c2 = *iter2++;
         /* On machines where 'char' and 'int' are types of the same size, the
            difference of two 'unsigned char' values - including the sign bit -
            doesn't fit in an 'int'.  */
-        return _GL_CMP (c1, c2);
-    }
+        int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+        if (cmp != 0)
+          {
+            c1 = tolower (c1);
+            if (c1 == c2)
+              cmp = 0;
+            else
+              {
+                c2 = tolower (c2);
+                cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+              }
+          }
+        if (cmp | !c1 | !--n)
+          return cmp;
+      }
 }
diff --git a/lib/mbspcasecmp.c b/lib/mbspcasecmp.c
index daec2ffda3..090d12531b 100644
--- a/lib/mbspcasecmp.c
+++ b/lib/mbspcasecmp.c
@@ -39,20 +39,19 @@ mbspcasecmp (const char *string, const char *prefix)
   if (string == prefix)
     return (char *) (string + strlen (string));
 
+  const char *iter1 = string;
+  const char *iter2 = prefix;
+
   /* Be careful not to look at the entire extent of STRING or PREFIX until
      needed.  This is useful because when two strings differ, the difference is
      most often already in the very few first characters.  */
   if (MB_CUR_MAX > 1)
     {
       mbuif_state_t state1;
-      const char *iter1;
       mbuif_init (state1);
-      iter1 = string;
 
       mbuif_state_t state2;
-      const char *iter2;
       mbuif_init (state2);
-      iter2 = prefix;
 
       while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
         {
@@ -74,25 +73,23 @@ mbspcasecmp (const char *string, const char *prefix)
         return NULL;
     }
   else
-    {
-      const unsigned char *p1 = (const unsigned char *) string;
-      const unsigned char *p2 = (const unsigned char *) prefix;
-      unsigned char c1, c2;
-
-      for (; ; p1++, p2++)
-        {
-          c1 = tolower (*p1);
-          c2 = tolower (*p2);
-
-          if (c2 == '\0' || c1 != c2)
-            break;
-        }
-
-      if (c2 == '\0')
-        /* PREFIX equals STRING or is terminated before STRING.  */
-        return (char *) p1;
-      else
-        /* STRING terminated before PREFIX.  */
-        return NULL;
-    }
+    for (;; iter1++, iter2++)
+      {
+        unsigned char c2 = *iter2;
+
+        if (c2 == '\0')
+          /* PREFIX equals STRING or is terminated before STRING.  */
+          return (char *) iter1;
+
+        unsigned char c1 = *iter1;
+
+        if (c1 != c2)
+          {
+            c1 = tolower (c1);
+            if (c1 != c2 && c1 != tolower (c2))
+              /* STRING and PREFIX disagree,
+                 or STRING terminated before PREFIX.  */
+              return NULL;
+          }
+      }
 }
-- 
2.41.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]