[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: From wchar_t to char32_t
From: |
Bruno Haible |
Subject: |
Re: From wchar_t to char32_t |
Date: |
Tue, 04 Jul 2023 21:07:59 +0200 |
I wrote:
> Level 2: Behave correctly, except that a 2-Unicode-character sequence
> may be split although it shouldn't.
> This is what code that uses mbrtoc32() does, when it has the
> lines
> if (bytes == (size_t) -3)
> bytes = 0;
> Without these lines, the string pointer could be decremented
> by 3, thus accessing invalid memory or running into an endless
> loop.
> This level is also what
> printf (".*s", nbytes, string);
> does: it truncates strings at a position where they should not
> be truncated. So, it's not terribly uncommon.
>
> Level 3: Behave correctly. Don't split a 2-Unicode-character sequence.
> This is what code that uses mbrtoc32() does, when it has the
> lines
> if (bytes == (size_t) -3)
> bytes = 0;
> and uses !mbsinit (&state) in the loop termination condition.
The patch below brings all mbrtoc32 uses in gnulib to level 3.
There is nothing to do
- in mbswidth.c and quotearg.c, because mbsinit (&state) is already part
of the loop termination condition,
- in mbuiter.h, because the termination condition is a NUL character.
2023-07-04 Bruno Haible <bruno@clisp.org>
mbiter, mbfile, mbmemcasecoll: Improve handling of mbrtoc32 result.
* lib/mbiter.h (mbi_avail): If cur.ptr has reached the limit but
in_shift is true, call mbiter_multi_next.
(mbiter_multi_next): Set in_shift to false after an incomplete multibyte
character.
* lib/mbfile.h (mbfile_multi_getc): Pass the input bytes incrementally
into mbrtoc32. When mbf->state is not in the initial state, call
mbrtoc32 again.
* lib/mbmemcasecoll.c (apply_c32tolower): When the state is not in the
initial state, call mbrtoc32 again.
(diff -w patches below)
diff --git a/lib/mbiter.h b/lib/mbiter.h
index e4963010f3..8bd83d7262 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -155,8 +155,10 @@ mbiter_multi_next (struct mbiter_multi *iter)
/* An incomplete multibyte character at the end. */
iter->cur.bytes = iter->limit - iter->cur.ptr;
iter->cur.wc_valid = false;
- /* Whether to set iter->in_shift = false and reset iter->state
- or not is not important; the string end is reached anyway. */
+ /* Cause the next mbi_avail invocation to return false. */
+ iter->in_shift = false;
+ /* Whether to reset iter->state or not is not important; the
+ string end is reached anyway. */
}
else
{
@@ -208,7 +210,8 @@ typedef struct mbiter_multi mbi_iterator_t;
(iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
(iter).next_done = false)
#define mbi_avail(iter) \
- ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))
+ (((iter).cur.ptr < (iter).limit || (iter).in_shift) \
+ && (mbiter_multi_next (&(iter)), true))
#define mbi_advance(iter) \
((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 7670dabfbb..ee1dd5a4b9 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -77,6 +77,7 @@ struct mbfile_multi {
MBFILE_INLINE void
mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
{
+ unsigned int new_bufcount;
size_t bytes;
/* If EOF has already been seen, don't use getc. This matters if
@@ -92,8 +93,14 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi
*mbf)
return;
}
- /* Before using mbrtowc, we need at least one byte. */
- if (mbf->bufcount == 0)
+ new_bufcount = mbf->bufcount;
+
+ /* If mbf->state is not in the initial state, some more 32-bit wide character
+ may be hiding in the state. We need to call mbrtoc32 again. */
+ if (mbsinit (&mbf->state))
+ {
+ /* Before using mbrtoc32, we need at least one byte. */
+ if (new_bufcount == 0)
{
int c = getc (mbf->fp);
if (c == EOF)
@@ -102,11 +109,11 @@ mbfile_multi_getc (struct mbchar *mbc, struct
mbfile_multi *mbf)
goto eof;
}
mbf->buf[0] = (unsigned char) c;
- mbf->bufcount++;
+ new_bufcount++;
}
- /* Handle most ASCII characters quickly, without calling mbrtowc(). */
- if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
+ /* Handle most ASCII characters quickly, without calling mbrtoc32(). */
+ if (new_bufcount == 1 && is_basic (mbf->buf[0]))
{
/* These characters are part of the POSIX portable character set.
For most of them, namely those in the ISO C basic character set,
@@ -121,25 +128,16 @@ mbfile_multi_getc (struct mbchar *mbc, struct
mbfile_multi *mbf)
mbf->bufcount = 0;
return;
}
+ }
- /* Use mbrtowc on an increasing number of bytes. Read only as many bytes
+ /* Use mbrtoc32 on an increasing number of bytes. Read only as many bytes
from mbf->fp as needed. This is needed to give reasonable interactive
behaviour when mbf->fp is connected to an interactive tty. */
for (;;)
{
- /* We don't know whether the 'mbrtowc' function updates the state when
- it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
- not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We
- don't have an autoconf test for this, yet.
- The new behaviour would allow us to feed the bytes one by one into
- mbrtowc. But the old behaviour forces us to feed all bytes since
- the end of the last character into mbrtowc. Since we want to retry
- with more bytes when mbrtowc returns -2, we must backup the state
- before calling mbrtowc, because implementations with the new
- behaviour will clobber it. */
- mbstate_t backup_state = mbf->state;
-
- bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+ /* Feed the bytes one by one into mbrtoc32. */
+ bytes = mbrtoc32 (&mbc->wc, &mbf->buf[mbf->bufcount], new_bufcount -
mbf->bufcount, &mbf->state);
+ mbf->bufcount = new_bufcount;
if (bytes == (size_t) -1)
{
@@ -154,7 +152,6 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi
*mbf)
else if (bytes == (size_t) -2)
{
/* An incomplete multibyte character. */
- mbf->state = backup_state;
if (mbf->bufcount == MBCHAR_BUF_SIZE)
{
/* An overlong incomplete multibyte sequence was encountered. */
@@ -165,18 +162,18 @@ mbfile_multi_getc (struct mbchar *mbc, struct
mbfile_multi *mbf)
}
else
{
- /* Read one more byte and retry mbrtowc. */
+ /* Read one more byte and retry mbrtoc32. */
int c = getc (mbf->fp);
if (c == EOF)
{
/* An incomplete multibyte character at the end. */
mbf->eof_seen = true;
- bytes = mbf->bufcount;
+ bytes = new_bufcount;
mbc->wc_valid = false;
break;
}
- mbf->buf[mbf->bufcount] = (unsigned char) c;
- mbf->bufcount++;
+ mbf->buf[new_bufcount] = (unsigned char) c;
+ new_bufcount++;
}
}
else
diff --git a/lib/mbmemcasecoll.c b/lib/mbmemcasecoll.c
index de7390f6d1..f79f262dc0 100644
--- a/lib/mbmemcasecoll.c
+++ b/lib/mbmemcasecoll.c
@@ -51,15 +51,36 @@ apply_c32tolower (const char *inbuf, size_t inbufsize,
remaining = inbufsize;
while (remaining > 0)
{
- char32_t wc1;
- size_t n1;
mbstate_t state;
memset (&state, '\0', sizeof (mbstate_t));
+ do
+ {
+ char32_t wc1;
+ size_t n1;
+
n1 = mbrtoc32 (&wc1, inbuf, remaining, &state);
- if (n1 == (size_t)(-2))
+
+ if (n1 == (size_t)(-1))
+ {
+ /* Invalid multibyte character on input.
+ Copy one byte without modification. */
+ *outbuf++ = *inbuf++;
+ remaining -= 1;
+ break;
+ }
+ else if (n1 == (size_t)(-2))
+ {
+ /* Incomplete multibyte sequence on input.
+ Pass it through unmodified. */
+ while (remaining > 0)
+ {
+ *outbuf++ = *inbuf++;
+ remaining -= 1;
+ }
break;
- if (n1 != (size_t)(-1))
+ }
+ else
{
wint_t wc2;
@@ -71,39 +92,28 @@ apply_c32tolower (const char *inbuf, size_t inbufsize,
wc2 = c32tolower (wc1);
if (wc2 != wc1)
{
+ mbstate_t state2;
size_t n2;
- memset (&state, '\0', sizeof (mbstate_t));
- n2 = c32rtomb (outbuf, wc2, &state);
+ memset (&state2, '\0', sizeof (mbstate_t));
+ n2 = c32rtomb (outbuf, wc2, &state2);
if (n2 != (size_t)(-1))
{
/* Store the translated multibyte character. */
- inbuf += n1;
- remaining -= n1;
outbuf += n2;
- continue;
+ goto done_storing;
}
}
/* Nothing to translate. */
memcpy (outbuf, inbuf, n1);
+ outbuf += n1;
+ done_storing:
inbuf += n1;
remaining -= n1;
- outbuf += n1;
- continue;
}
-
- /* Invalid multibyte character on input.
- Copy one byte without modification. */
- *outbuf++ = *inbuf++;
- remaining -= 1;
}
- /* Incomplete multibyte sequence on input.
- Pass it through unmodified. */
- while (remaining > 0)
- {
- *outbuf++ = *inbuf++;
- remaining -= 1;
+ while (! mbsinit (&state));
}
/* Verify the output buffer was large enough. */
- Re: From wchar_t to char32_t, (continued)
- Re: From wchar_t to char32_t, Bruno Haible, 2023/07/11
- Re: From wchar_t to char32_t, Paul Eggert, 2023/07/11
- Re: From wchar_t to char32_t, Bruno Haible, 2023/07/13
- Re: From wchar_t to char32_t, Paul Eggert, 2023/07/13
- Re: From wchar_t to char32_t, new module mbszero, Bruno Haible, 2023/07/16
- Re: From wchar_t to char32_t, new module mbszero, Paul Eggert, 2023/07/16
- Re: From wchar_t to char32_t, new module mbszero, Bruno Haible, 2023/07/17
- Re: From wchar_t to char32_t, new module mbszero, Paul Eggert, 2023/07/18
- Re: From wchar_t to char32_t, new module mbszero, Bruno Haible, 2023/07/19
- Re: From wchar_t to char32_t, new module mbszero, Bruno Haible, 2023/07/17
- Re: From wchar_t to char32_t,
Bruno Haible <=
- Re: From wchar_t to char32_t, Paul Eggert, 2023/07/11
- Re: From wchar_t to char32_t, Bruno Haible, 2023/07/11
- Re: From wchar_t to char32_t, Paul Eggert, 2023/07/11
Re: From wchar_t to char32_t, Bruno Haible, 2023/07/02
Re: From wchar_t to char32_t, Bruno Haible, 2023/07/02
Re: From wchar_t to char32_t, Bruno Haible, 2023/07/12
Re: From wchar_t to char32_t, Bruno Haible, 2023/07/13