emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master 69b68099ec: Simplify and speed up string-to-multibyte


From: Mattias Engdegård
Subject: master 69b68099ec: Simplify and speed up string-to-multibyte
Date: Mon, 11 Jul 2022 05:08:18 -0400 (EDT)

branch: master
commit 69b68099ecfb053ac77e0a954ab7467c440321ff
Author: Mattias Engdegård <mattiase@acm.org>
Commit: Mattias Engdegård <mattiase@acm.org>

    Simplify and speed up string-to-multibyte
    
    * src/character.h (str_to_multibyte):
    * src/character.c (str_to_multibyte): Change signature and simplify;
    the conversion is no longer done in-place.
    * src/fns.c (string_to_multibyte): Drop temporary buffer and memcpy;
    adapt to new str_to_multibyte signature.
    * src/print.c (print_string): Drop memcpy; adapt call to str_to_multibyte.
    * test/src/fns-tests.el (fns--string-to-unibyte): Rename to...
    (fns--string-to-unibyte-multibyte): ... this and strengthen, so that
    the test covers string-to-multibyte reasonably well.
---
 src/character.c       | 43 +++++++++++++++++--------------------------
 src/character.h       |  3 ++-
 src/fns.c             | 23 +++++++----------------
 src/print.c           |  3 +--
 test/src/fns-tests.el | 14 ++++++++++----
 5 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/src/character.c b/src/character.c
index d12df23f8e..841e46c091 100644
--- a/src/character.c
+++ b/src/character.c
@@ -666,35 +666,26 @@ count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 }
 
 
-/* Convert unibyte text at STR of BYTES bytes to a multibyte text
-   that contains the same single-byte characters.  It actually
-   converts all 8-bit characters to multibyte forms.  It is assured
-   that we can use LEN bytes at STR as a work area and that is
-   enough.  */
-
-ptrdiff_t
-str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
+/* Convert unibyte text at SRC of NCHARS bytes to a multibyte text
+   at DST of NBYTES bytes, that contains the same single-byte characters.  */
+void
+str_to_multibyte (unsigned char *dst, const unsigned char *src,
+                 ptrdiff_t nchars, ptrdiff_t nbytes)
 {
-  unsigned char *p = str, *endp = str + bytes;
-  unsigned char *to;
-
-  while (p < endp && *p < 0x80) p++;
-  if (p == endp)
-    return bytes;
-  to = p;
-  bytes = endp - p;
-  endp = str + len;
-  memmove (endp - bytes, p, bytes);
-  p = endp - bytes;
-  while (p < endp)
+  const unsigned char *s = src + nchars;
+  unsigned char *d = dst + nbytes;
+  for (ptrdiff_t i = 0; i < nchars; i++)
     {
-      int c = *p++;
-
-      if (c >= 0x80)
-       c = BYTE8_TO_CHAR (c);
-      to += CHAR_STRING (c, to);
+      unsigned char c = *--s;
+      if (c <= 0x7f)
+       *--d = c;
+      else
+       {
+         *--d = 0x80 + (c & 0x3f);
+         *--d = 0xc0 + ((c >> 6) & 1);
+       }
     }
-  return (to - str);
+  eassert (d == dst && s == src);
 }
 
 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
diff --git a/src/character.h b/src/character.h
index 2ca935ba04..36e2b06ee1 100644
--- a/src/character.h
+++ b/src/character.h
@@ -567,7 +567,8 @@ extern int translate_char (Lisp_Object, int c);
 extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t);
 extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
                                   ptrdiff_t *);
-extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
+extern void str_to_multibyte (unsigned char *dst, const unsigned char *src,
+                             ptrdiff_t nchars, ptrdiff_t nbytes);
 extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
 extern ptrdiff_t strwidth (const char *, ptrdiff_t);
 extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
diff --git a/src/fns.c b/src/fns.c
index 61ed01eee4..7d8f957ef9 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1237,33 +1237,24 @@ string_make_multibyte (Lisp_Object string)
 
 
 /* Convert STRING (if unibyte) to a multibyte string without changing
-   the number of characters.  Characters 0200 through 0237 are
-   converted to eight-bit characters. */
+   the number of characters.  Characters 0x80..0xff are interpreted as
+   raw bytes. */
 
 Lisp_Object
 string_to_multibyte (Lisp_Object string)
 {
-  unsigned char *buf;
-  ptrdiff_t nbytes;
-  Lisp_Object ret;
-  USE_SAFE_ALLOCA;
-
   if (STRING_MULTIBYTE (string))
     return string;
 
-  nbytes = count_size_as_multibyte (SDATA (string), SBYTES (string));
+  ptrdiff_t nchars = SCHARS (string);
+  ptrdiff_t nbytes = count_size_as_multibyte (SDATA (string), nchars);
   /* If all the chars are ASCII, they won't need any more bytes once
      converted.  */
-  if (nbytes == SBYTES (string))
+  if (nbytes == nchars)
     return make_multibyte_string (SSDATA (string), nbytes, nbytes);
 
-  buf = SAFE_ALLOCA (nbytes);
-  memcpy (buf, SDATA (string), SBYTES (string));
-  str_to_multibyte (buf, nbytes, SBYTES (string));
-
-  ret = make_multibyte_string ((char *) buf, SCHARS (string), nbytes);
-  SAFE_FREE ();
-
+  Lisp_Object ret = make_uninit_multibyte_string (nchars, nbytes);
+  str_to_multibyte (SDATA (ret), SDATA (string), nchars, nbytes);
   return ret;
 }
 
diff --git a/src/print.c b/src/print.c
index 4d7e42df1e..9a31e386f5 100644
--- a/src/print.c
+++ b/src/print.c
@@ -467,8 +467,7 @@ print_string (Lisp_Object string, Lisp_Object printcharfun)
          if (chars < bytes)
            {
              newstr = make_uninit_multibyte_string (chars, bytes);
-             memcpy (SDATA (newstr), SDATA (string), chars);
-             str_to_multibyte (SDATA (newstr), bytes, chars);
+             str_to_multibyte (SDATA (newstr), SDATA (string), chars, bytes);
              string = newstr;
            }
        }
diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el
index 0119e31df1..20074ca0d2 100644
--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -1344,18 +1344,24 @@
     (should (equal (plist-member plist (copy-sequence "a") #'equal)
                    '("a" "c")))))
 
-(ert-deftest fns--string-to-unibyte ()
-  (dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff"))
+(ert-deftest fns--string-to-unibyte-multibyte ()
+  (dolist (str (list "" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz" "\x80\xdd\xff"
+                     (apply #'unibyte-string (number-sequence 0 255))))
     (ert-info ((prin1-to-string str) :prefix "str: ")
       (should-not (multibyte-string-p str))
       (let* ((u (string-to-unibyte str))   ; should be identity
              (m (string-to-multibyte u))   ; lossless conversion
-             (uu (string-to-unibyte m)))   ; also lossless
+             (mm (string-to-multibyte m))  ; should be identity
+             (uu (string-to-unibyte m))    ; also lossless
+             (ml (mapcar (lambda (c) (if (<= c #x7f) c (+ c #x3fff00))) u)))
         (should-not (multibyte-string-p u))
         (should (multibyte-string-p m))
+        (should (multibyte-string-p mm))
         (should-not (multibyte-string-p uu))
         (should (equal str u))
-        (should (equal str uu)))))
+        (should (equal m mm))
+        (should (equal str uu))
+        (should (equal (append m nil) ml)))))
   (should-error (string-to-unibyte "å"))
   (should-error (string-to-unibyte "ABC∀BC")))
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]