master c88a3be: Fix string-to-multibyte overlong sequence bug

emacs-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
master c88a3be: Fix string-to-multibyte overlong sequence bug

From:	Paul Eggert
Subject:	master c88a3be: Fix string-to-multibyte overlong sequence bug
Date:	Tue, 21 Apr 2020 01:31:17 -0400 (EDT)
branch: master
commit c88a3be8087ad0165415aa87c01f868a7433cb21
Author: Paul Eggert <address@hidden>
Commit: Paul Eggert <address@hidden>

    Fix string-to-multibyte overlong sequence bug
    
    * src/character.h (MULTIBYTE_LENGTH, MULTIBYTE_LENGTH_NO_CHECK):
    Remove, replacing with ...
    (multibyte_length): ... this new function.  All callers changed.
    The new function rejects overlong multibyte forms.
    * test/src/buffer-tests.el (buffer-multibyte-overlong-sequences):
    New test.
---
 src/buffer.c             |  3 +-
 src/character.c          | 42 +++++++++++----------
 src/character.h          | 96 +++++++++++++++++++++++++++---------------------
 src/coding.c             | 14 ++++---
 test/src/buffer-tests.el | 14 +++++++
 5 files changed, 101 insertions(+), 68 deletions(-)

diff --git a/src/buffer.c b/src/buffer.c
index 5398414..53b3bd9 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -2634,8 +2634,7 @@ current buffer is cleared.  */)
          if (ASCII_CHAR_P (*p))
            p++, pos++;
          else if (EQ (flag, Qt)
-                  && ! CHAR_BYTE8_HEAD_P (*p)
-                  && (bytes = MULTIBYTE_LENGTH (p, pend)) > 0)
+                  && 0 < (bytes = multibyte_length (p, pend, true, false)))
            p += bytes, pos += bytes;
          else
            {
diff --git a/src/character.c b/src/character.c
index 303c83c..da09e77 100644
--- a/src/character.c
+++ b/src/character.c
@@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 
   while (ptr < endp)
     {
-      int len = MULTIBYTE_LENGTH (ptr, endp);
+      int len = multibyte_length (ptr, endp, true, true);
 
       if (len == 0)
        emacs_abort ();
@@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
                        ptrdiff_t *nchars, ptrdiff_t *nbytes)
 {
   const unsigned char *endp = str + len;
-  int n;
   ptrdiff_t chars = 0, bytes = 0;
 
   if (len >= MAX_MULTIBYTE_LENGTH)
@@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
       while (str < adjusted_endp)
        {
-         if (! CHAR_BYTE8_HEAD_P (*str)
-             && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
+         int n = multibyte_length (str, NULL, false, false);
+         if (0 < n)
            str += n, bytes += n;
          else
            str++, bytes += 2;
@@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
     }
   while (str < endp)
     {
-      if (! CHAR_BYTE8_HEAD_P (*str)
-         && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
+      int n = multibyte_length (str, endp, true, false);
+      if (0 < n)
        str += n, bytes += n;
       else
        str++, bytes += 2;
@@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
   unsigned char *p = str, *endp = str + nbytes;
   unsigned char *to;
   ptrdiff_t chars = 0;
-  int n;
 
   if (nbytes >= MAX_MULTIBYTE_LENGTH)
     {
       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
-      while (p < adjusted_endp
-            && ! CHAR_BYTE8_HEAD_P (*p)
-            && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
-       p += n, chars++;
+      while (p < adjusted_endp)
+       {
+         int n = multibyte_length (p, NULL, false, false);
+         if (n <= 0)
+           break;
+         p += n, chars++;
+       }
+    }
+  while (true)
+    {
+      int n = multibyte_length (p, endp, true, false);
+      if (n <= 0)
+       break;
+      p += n, chars++;
     }
-  while (p < endp
-        && ! CHAR_BYTE8_HEAD_P (*p)
-        && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
-    p += n, chars++;
   if (nchars)
     *nchars = chars;
   if (p == endp)
@@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
       while (p < adjusted_endp)
        {
-         if (! CHAR_BYTE8_HEAD_P (*p)
-             && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
+         int n = multibyte_length (p, NULL, false, false);
+         if (0 < n)
            {
              while (n--)
                *to++ = *p++;
@@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
     }
   while (p < endp)
     {
-      if (! CHAR_BYTE8_HEAD_P (*p)
-         && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
+      int n = multibyte_length (p, endp, true, false);
+      if (0 < n)
        {
          while (n--)
            *to++ = *p++;
diff --git a/src/character.h b/src/character.h
index 81320de..4887473 100644
--- a/src/character.h
+++ b/src/character.h
@@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN
 /* character code      1st byte   byte sequence
    --------------      --------   -------------
         0-7F           00..7F     0xxxxxxx
-       80-7FF          C2..DF     110xxxxx 10xxxxxx
-      800-FFFF         E0..EF     1110xxxx 10xxxxxx 10xxxxxx
-    10000-1FFFFF       F0..F7     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   200000-3FFF7F       F8         11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
+       80-7FF          C2..DF     110yyyyx 10xxxxxx
+      800-FFFF         E0..EF     1110yyyy 10yxxxxx 10xxxxxx
+    10000-1FFFFF       F0..F7     11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+   200000-3FFF7F       F8         11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
    3FFF80-3FFFFF       C0..C1     1100000x 10xxxxxx (for eight-bit-char)
    400000-...          invalid
 
    invalid 1st byte    80..BF     10xxxxxx
-                       F9..FF     11111xxx (xxx != 000)
+                       F9..FF     11111yyy
+
+   In each bit pattern, 'x' and 'y' each represent a single bit of the
+   character code payload, and at least one 'y' must be a 1 bit.
+   In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
 */
 
 /* Maximum character code ((1 << CHARACTERBITS) - 1).  */
@@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte)
 }
 
 /* How many bytes a character that starts with BYTE occupies in a
-   multibyte form.  Unlike MULTIBYTE_LENGTH below, this function does not
+   multibyte form.  Unlike multibyte_length, this function does not
    validate the multibyte form, but looks only at its first byte.  */
 INLINE int
 BYTES_BY_CHAR_HEAD (int byte)
@@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte)
 }
 
 
-/* The byte length of multibyte form at unibyte string P ending at
-   PEND.  If the string doesn't point to a valid multibyte form,
-   return 0.  Unlike BYTES_BY_CHAR_HEAD, this macro validates the
-   multibyte form.  */
+/* The byte length of the multibyte form at the unibyte string P,
+   ending at PEND if CHECK, and without a length check if !CHECK.
+   If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
+   If the string doesn't point to a valid multibyte form, return 0.
+   Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form.  */
 
 INLINE int
-MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend)
-{
-  return (! (p < pend) ? 0
-         : ! (p[0] & 0x80) ? 1
-         : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0
-         : (p[0] & 0xE0) == 0xC0 ? 2
-         : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0
-         : (p[0] & 0xF0) == 0xE0 ? 3
-         : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0
-         : (p[0] & 0xF8) == 0xF0 ? 4
-         : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0
-         : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
-         : 0);
-}
-
-
-/* Like MULTIBYTE_LENGTH, but don't check the ending address.  The
-   multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD.  */
+multibyte_length (unsigned char const *p, unsigned char const *pend,
+                 bool check, bool allow_8bit)
+{
+  if (!check || p < pend)
+    {
+      unsigned char c = p[0];
+      if (c < 0x80)
+       return 1;
+      if (!check || p + 1 < pend)
+       {
+         /* The 'unsigned int' avoids int overflow in the 5-byte case.  */
+         unsigned int d = p[1];
+
+         if (TRAILING_CODE_P (d))
+           {
+             if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF)
+               return 2;
+             if ((!check || p + 2 < pend)
+                 && TRAILING_CODE_P (p[2]))
+               {
+                 if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20)))
+                   return 3;
+                 if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3]))
+                   {
+                     if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30)))
+                       return 4;
+                     if (c == 0xF8 && (!check || p + 4 < pend)
+                         && TRAILING_CODE_P (p[4]))
+                       {
+                         unsigned int w = ((d << 24) + (p[2] << 16)
+                                           + (p[3] << 8) + p[4]);
+                         if (0x88808080 <= w && w <= 0x8FBFBDBF)
+                           return 5;
+                       }
+                   }
+               }
+           }
+       }
+    }
 
-INLINE int
-MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p)
-{
-  return (!(p[0] & 0x80) ? 1
-         : (p[1] & 0xC0) != 0x80 ? 0
-         : (p[0] & 0xE0) == 0xC0 ? 2
-         : (p[2] & 0xC0) != 0x80 ? 0
-         : (p[0] & 0xF0) == 0xE0 ? 3
-         : (p[3] & 0xC0) != 0x80 ? 0
-         : (p[0] & 0xF8) == 0xF0 ? 4
-         : (p[4] & 0xC0) != 0x80 ? 0
-         : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
-         : 0);
+  return 0;
 }
 
 
diff --git a/src/coding.c b/src/coding.c
index 716b0d9..34f36d5 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
 
       if (! multibytep)
        {
-         int bytes;
-
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
-         else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
-           c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
-           c = BYTE8_TO_CHAR (*src), src++, pos++;
+           {
+             int bytes = multibyte_length (src, src_end, true, true);
+             if (0 < bytes)
+               c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
+             else
+               c = BYTE8_TO_CHAR (*src), src++, pos++;
+           }
        }
       else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
@@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
          for (i = 1; i < to_nchars; i++)
            *buf++ = XFIXNUM (AREF (trans, i));
          for (i = 1; i < from_nchars; i++, pos++)
-           src += MULTIBYTE_LENGTH_NO_CHECK (src);
+           src += multibyte_length (src, NULL, false, true);
        }
     }
 
diff --git a/test/src/buffer-tests.el b/test/src/buffer-tests.el
index 1c35669..6e87cb9 100644
--- a/test/src/buffer-tests.el
+++ b/test/src/buffer-tests.el
@@ -1313,4 +1313,18 @@ with parameters from the *Messages* buffer modification."
         (ovshould nonempty-eob-end 4 5)
         (ovshould empty-eob        5 5)))))
 
+(ert-deftest buffer-multibyte-overlong-sequences ()
+  (dolist (uni '("\xE0\x80\x80"
+                 "\xF0\x80\x80\x80"
+                 "\xF8\x8F\xBF\xBF\x80"))
+    (let ((multi (string-to-multibyte uni)))
+      (should
+       (string-equal
+        multi
+        (with-temp-buffer
+          (set-buffer-multibyte nil)
+          (insert uni)
+          (set-buffer-multibyte t)
+          (buffer-string)))))))
+
 ;;; buffer-tests.el ends here
[Prev in Thread]
Current Thread
[Next in Thread]
master c88a3be: Fix string-to-multibyte overlong sequence bug, Paul Eggert <=
Prev by Date: master 856d937: * lisp/hi-lock.el (hi-lock--regexps-at-point): Use proper-list-p, not consp.
Next by Date: master f212daf: Fix recently introduced error in `tramp-sh-handle-vc-registered'
Previous by thread: master 856d937: * lisp/hi-lock.el (hi-lock--regexps-at-point): Use proper-list-p, not consp.
Next by thread: master f212daf: Fix recently introduced error in `tramp-sh-handle-vc-registered'
Index(es):
- Date
- Thread