bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: From wchar_t to char32_t


From: Bruno Haible
Subject: Re: From wchar_t to char32_t
Date: Tue, 04 Jul 2023 21:07:59 +0200

I wrote:
>   Level 2: Behave correctly, except that a 2-Unicode-character sequence
>            may be split although it shouldn't.
>            This is what code that uses mbrtoc32() does, when it has the
>            lines
>                 if (bytes == (size_t) -3)
>                   bytes = 0;
>            Without these lines, the string pointer could be decremented
>            by 3, thus accessing invalid memory or running into an endless
>            loop.
>            This level is also what
>              printf (".*s", nbytes, string);
>            does: it truncates strings at a position where they should not
>            be truncated. So, it's not terribly uncommon.
> 
>   Level 3: Behave correctly. Don't split a 2-Unicode-character sequence.
>            This is what code that uses mbrtoc32() does, when it has the
>            lines
>                 if (bytes == (size_t) -3)
>                   bytes = 0;
>            and uses !mbsinit (&state) in the loop termination condition.

The patch below brings all mbrtoc32 uses in gnulib to level 3.

There is nothing to do
  - in mbswidth.c and quotearg.c, because mbsinit (&state) is already part
    of the loop termination condition,
  - in mbuiter.h, because the termination condition is a NUL character.


2023-07-04  Bruno Haible  <bruno@clisp.org>

        mbiter, mbfile, mbmemcasecoll: Improve handling of mbrtoc32 result.
        * lib/mbiter.h (mbi_avail): If cur.ptr has reached the limit but
        in_shift is true, call mbiter_multi_next.
        (mbiter_multi_next): Set in_shift to false after an incomplete multibyte
        character.
        * lib/mbfile.h (mbfile_multi_getc): Pass the input bytes incrementally
        into mbrtoc32. When mbf->state is not in the initial state, call
        mbrtoc32 again.
        * lib/mbmemcasecoll.c (apply_c32tolower): When the state is not in the
        initial state, call mbrtoc32 again.

(diff -w patches below)
diff --git a/lib/mbiter.h b/lib/mbiter.h
index e4963010f3..8bd83d7262 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -155,8 +155,10 @@ mbiter_multi_next (struct mbiter_multi *iter)
           /* An incomplete multibyte character at the end.  */
           iter->cur.bytes = iter->limit - iter->cur.ptr;
           iter->cur.wc_valid = false;
-          /* Whether to set iter->in_shift = false and reset iter->state
-             or not is not important; the string end is reached anyway.  */
+          /* Cause the next mbi_avail invocation to return false.  */
+          iter->in_shift = false;
+          /* Whether to reset iter->state or not is not important; the
+             string end is reached anyway.  */
         }
       else
         {
@@ -208,7 +210,8 @@ typedef struct mbiter_multi mbi_iterator_t;
    (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
    (iter).next_done = false)
 #define mbi_avail(iter) \
-  ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))
+  (((iter).cur.ptr < (iter).limit || (iter).in_shift) \
+   && (mbiter_multi_next (&(iter)), true))
 #define mbi_advance(iter) \
   ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)
 
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 7670dabfbb..ee1dd5a4b9 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -77,6 +77,7 @@ struct mbfile_multi {
 MBFILE_INLINE void
 mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
 {
+  unsigned int new_bufcount;
   size_t bytes;
 
   /* If EOF has already been seen, don't use getc.  This matters if
@@ -92,8 +93,14 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi 
*mbf)
       return;
     }
 
-  /* Before using mbrtowc, we need at least one byte.  */
-  if (mbf->bufcount == 0)
+  new_bufcount = mbf->bufcount;
+
+  /* If mbf->state is not in the initial state, some more 32-bit wide character
+     may be hiding in the state.  We need to call mbrtoc32 again.  */
+  if (mbsinit (&mbf->state))
+    {
+      /* Before using mbrtoc32, we need at least one byte.  */
+      if (new_bufcount == 0)
         {
           int c = getc (mbf->fp);
           if (c == EOF)
@@ -102,11 +109,11 @@ mbfile_multi_getc (struct mbchar *mbc, struct 
mbfile_multi *mbf)
               goto eof;
             }
           mbf->buf[0] = (unsigned char) c;
-      mbf->bufcount++;
+          new_bufcount++;
         }
 
-  /* Handle most ASCII characters quickly, without calling mbrtowc().  */
-  if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
+      /* Handle most ASCII characters quickly, without calling mbrtoc32().  */
+      if (new_bufcount == 1 && is_basic (mbf->buf[0]))
         {
           /* These characters are part of the POSIX portable character set.
              For most of them, namely those in the ISO C basic character set,
@@ -121,25 +128,16 @@ mbfile_multi_getc (struct mbchar *mbc, struct 
mbfile_multi *mbf)
           mbf->bufcount = 0;
           return;
         }
+    }
 
-  /* Use mbrtowc on an increasing number of bytes.  Read only as many bytes
+  /* Use mbrtoc32 on an increasing number of bytes.  Read only as many bytes
      from mbf->fp as needed.  This is needed to give reasonable interactive
      behaviour when mbf->fp is connected to an interactive tty.  */
   for (;;)
     {
-      /* We don't know whether the 'mbrtowc' function updates the state when
-         it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
-         not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour.  We
-         don't have an autoconf test for this, yet.
-         The new behaviour would allow us to feed the bytes one by one into
-         mbrtowc.  But the old behaviour forces us to feed all bytes since
-         the end of the last character into mbrtowc.  Since we want to retry
-         with more bytes when mbrtowc returns -2, we must backup the state
-         before calling mbrtowc, because implementations with the new
-         behaviour will clobber it.  */
-      mbstate_t backup_state = mbf->state;
-
-      bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+      /* Feed the bytes one by one into mbrtoc32.  */
+      bytes = mbrtoc32 (&mbc->wc, &mbf->buf[mbf->bufcount], new_bufcount - 
mbf->bufcount, &mbf->state);
+      mbf->bufcount = new_bufcount;
 
       if (bytes == (size_t) -1)
         {
@@ -154,7 +152,6 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi 
*mbf)
       else if (bytes == (size_t) -2)
         {
           /* An incomplete multibyte character.  */
-          mbf->state = backup_state;
           if (mbf->bufcount == MBCHAR_BUF_SIZE)
             {
               /* An overlong incomplete multibyte sequence was encountered.  */
@@ -165,18 +162,18 @@ mbfile_multi_getc (struct mbchar *mbc, struct 
mbfile_multi *mbf)
             }
           else
             {
-              /* Read one more byte and retry mbrtowc.  */
+              /* Read one more byte and retry mbrtoc32.  */
               int c = getc (mbf->fp);
               if (c == EOF)
                 {
                   /* An incomplete multibyte character at the end.  */
                   mbf->eof_seen = true;
-                  bytes = mbf->bufcount;
+                  bytes = new_bufcount;
                   mbc->wc_valid = false;
                   break;
                 }
-              mbf->buf[mbf->bufcount] = (unsigned char) c;
-              mbf->bufcount++;
+              mbf->buf[new_bufcount] = (unsigned char) c;
+              new_bufcount++;
             }
         }
       else
diff --git a/lib/mbmemcasecoll.c b/lib/mbmemcasecoll.c
index de7390f6d1..f79f262dc0 100644
--- a/lib/mbmemcasecoll.c
+++ b/lib/mbmemcasecoll.c
@@ -51,15 +51,36 @@ apply_c32tolower (const char *inbuf, size_t inbufsize,
   remaining = inbufsize;
   while (remaining > 0)
     {
-      char32_t wc1;
-      size_t n1;
       mbstate_t state;
 
       memset (&state, '\0', sizeof (mbstate_t));
+      do
+        {
+          char32_t wc1;
+          size_t n1;
+
           n1 = mbrtoc32 (&wc1, inbuf, remaining, &state);
-      if (n1 == (size_t)(-2))
+
+          if (n1 == (size_t)(-1))
+            {
+              /* Invalid multibyte character on input.
+                 Copy one byte without modification.  */
+              *outbuf++ = *inbuf++;
+              remaining -= 1;
+              break;
+            }
+          else if (n1 == (size_t)(-2))
+            {
+              /* Incomplete multibyte sequence on input.
+                 Pass it through unmodified.  */
+              while (remaining > 0)
+                {
+                  *outbuf++ = *inbuf++;
+                  remaining -= 1;
+                }
               break;
-      if (n1 != (size_t)(-1))
+            }
+          else
             {
               wint_t wc2;
 
@@ -71,39 +92,28 @@ apply_c32tolower (const char *inbuf, size_t inbufsize,
               wc2 = c32tolower (wc1);
               if (wc2 != wc1)
                 {
+                  mbstate_t state2;
                   size_t n2;
 
-              memset (&state, '\0', sizeof (mbstate_t));
-              n2 = c32rtomb (outbuf, wc2, &state);
+                  memset (&state2, '\0', sizeof (mbstate_t));
+                  n2 = c32rtomb (outbuf, wc2, &state2);
                   if (n2 != (size_t)(-1))
                     {
                       /* Store the translated multibyte character.  */
-                  inbuf += n1;
-                  remaining -= n1;
                       outbuf += n2;
-                  continue;
+                      goto done_storing;
                     }
                 }
 
               /* Nothing to translate. */
               memcpy (outbuf, inbuf, n1);
+              outbuf += n1;
+             done_storing:
               inbuf += n1;
               remaining -= n1;
-          outbuf += n1;
-          continue;
             }
-
-      /* Invalid multibyte character on input.
-         Copy one byte without modification.  */
-      *outbuf++ = *inbuf++;
-      remaining -= 1;
         }
-  /* Incomplete multibyte sequence on input.
-     Pass it through unmodified.  */
-  while (remaining > 0)
-    {
-      *outbuf++ = *inbuf++;
-      remaining -= 1;
+      while (! mbsinit (&state));
     }
 
   /* Verify the output buffer was large enough.  */






reply via email to

[Prev in Thread] Current Thread [Next in Thread]