emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master cfda663282: Speed up string-to-unibyte


From: Mattias Engdegård
Subject: master cfda663282: Speed up string-to-unibyte
Date: Sun, 10 Jul 2022 12:21:28 -0400 (EDT)

branch: master
commit cfda663282b788972c344e6733a8aa60a3e0f545
Author: Mattias Engdegård <mattiase@acm.org>
Commit: Mattias Engdegård <mattiase@acm.org>

    Speed up string-to-unibyte
    
    * src/character.h (str_to_unibyte):
    * src/character.c (str_to_unibyte): Remove.
    * src/fns.c (Fstring_to_unibyte): Ditch the call to str_to_unibyte and
    the unnecessary heap allocation.  Write new, faster code.
    * test/src/fns-tests.el (fns--string-to-unibyte): New test.
---
 src/character.c       | 25 -------------------------
 src/character.h       |  2 --
 src/fns.c             | 25 +++++++++++++++----------
 test/src/fns-tests.el | 15 +++++++++++++++
 4 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/src/character.c b/src/character.c
index c1a1b55389..d12df23f8e 100644
--- a/src/character.c
+++ b/src/character.c
@@ -734,31 +734,6 @@ str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
   return (to - str);
 }
 
-/* Convert eight-bit chars in SRC (in multibyte form) to the
-   corresponding byte and store in DST.  CHARS is the number of
-   characters in SRC.  The value is the number of bytes stored in DST.
-   Usually, the value is the same as CHARS, but is less than it if SRC
-   contains a non-ASCII, non-eight-bit character.  */
-
-ptrdiff_t
-str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
-{
-  ptrdiff_t i;
-
-  for (i = 0; i < chars; i++)
-    {
-      int c = string_char_advance (&src);
-
-      if (CHAR_BYTE8_P (c))
-       c = CHAR_TO_BYTE8 (c);
-      else if (! ASCII_CHAR_P (c))
-       return i;
-      *dst++ = c;
-    }
-  return i;
-}
-
-
 static ptrdiff_t
 string_count_byte8 (Lisp_Object string)
 {
diff --git a/src/character.h b/src/character.h
index 6ee6bcab20..2ca935ba04 100644
--- a/src/character.h
+++ b/src/character.h
@@ -569,8 +569,6 @@ extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
                                   ptrdiff_t *);
 extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
 extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
-extern ptrdiff_t str_to_unibyte (const unsigned char *, unsigned char *,
-                                 ptrdiff_t);
 extern ptrdiff_t strwidth (const char *, ptrdiff_t);
 extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
                                 ptrdiff_t *, ptrdiff_t *);
diff --git a/src/fns.c b/src/fns.c
index 49d76a0e7c..61ed01eee4 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1413,19 +1413,24 @@ an error is signaled.  */)
   (Lisp_Object string)
 {
   CHECK_STRING (string);
+  if (!STRING_MULTIBYTE (string))
+    return string;
 
-  if (STRING_MULTIBYTE (string))
+  ptrdiff_t chars = SCHARS (string);
+  Lisp_Object ret = make_uninit_string (chars);
+  unsigned char *src = SDATA (string);
+  unsigned char *dst = SDATA (ret);
+  for (ptrdiff_t i = 0; i < chars; i++)
     {
-      ptrdiff_t chars = SCHARS (string);
-      unsigned char *str = xmalloc (chars);
-      ptrdiff_t converted = str_to_unibyte (SDATA (string), str, chars);
-
-      if (converted < chars)
-       error ("Can't convert the %"pD"dth character to unibyte", converted);
-      string = make_unibyte_string ((char *) str, chars);
-      xfree (str);
+      unsigned char b = *src++;
+      if (b <= 0x7f)
+       *dst++ = b;                                      /* ASCII */
+      else if (CHAR_BYTE8_HEAD_P (b))
+       *dst++ = 0x80 | (b & 1) << 6 | (*src++ & 0x3f);  /* raw byte */
+      else
+       error ("Cannot convert character at index %"pD"d to unibyte", i);
     }
-  return string;
+  return ret;
 }
 
 
diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el
index ba56019d4c..0119e31df1 100644
--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -1344,4 +1344,19 @@
     (should (equal (plist-member plist (copy-sequence "a") #'equal)
                    '("a" "c")))))
 
+(ert-deftest fns--string-to-unibyte ()
+  (dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff"))
+    (ert-info ((prin1-to-string str) :prefix "str: ")
+      (should-not (multibyte-string-p str))
+      (let* ((u (string-to-unibyte str))   ; should be identity
+             (m (string-to-multibyte u))   ; lossless conversion
+             (uu (string-to-unibyte m)))   ; also lossless
+        (should-not (multibyte-string-p u))
+        (should (multibyte-string-p m))
+        (should-not (multibyte-string-p uu))
+        (should (equal str u))
+        (should (equal str uu)))))
+  (should-error (string-to-unibyte "å"))
+  (should-error (string-to-unibyte "ABC∀BC")))
+
 ;;; fns-tests.el ends here



reply via email to

[Prev in Thread] Current Thread [Next in Thread]