[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
unistr/u8-*: Make Unicode decoder more Unicode Standard compliant
From: Bruno Haible
Subject: unistr/u8-*: Make Unicode decoder more Unicode Standard compliant
Date: Tue, 25 Jul 2023 22:35:57 +0200
This patch makes gnulib's and libunistring's UTF-8 decoder (mainly
u8_mbtouc) more compliant with the Unicode Standard, specifically with
section 3.9 of https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf.
2023-07-25 Bruno Haible <bruno@clisp.org>
unistr/u8-*: Make Unicode decoder more Unicode Standard compliant.
Based on a remark by Paul Eggert in
<https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00120.html>.
* tests/unistr/test-u8-mbtouc.c (test_safe_function): Change expected
results for "non-shortest form" or out-of-range byte sequences. Add new
test cases of incomplete well-formed byte sequences.
* tests/unistr/test-u8-mbsnlen.c (main): Likewise.
* lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Reject a first byte in the
range 0xF5..0xF7 as invalid. Distinguish incomplete from invalid byte
sequences correctly. For the former, return only the number of bytes in
the maximal well-formed subpart.
* lib/unistr/u8-mbtouc.c (u8_mbtouc): Likewise.
* lib/unistr/u8-check.c (u8_check): Reject a first byte in the range
0xF5..0xF7 as invalid.
* lib/unistr/u8-mblen.c (u8_mblen): Likewise.
* lib/unistr/u8-mbtoucr.c (u8_mbtoucr): Likewise.
* lib/unistr/u8-strmbtouc.c (u8_strmbtouc): Likewise.
* lib/unistr/u8-strmblen.c (u8_strmblen): Likewise.
* lib/unistr/u8-prev.c (u8_prev): Likewise.
diff --git a/lib/unistr/u8-check.c b/lib/unistr/u8-check.c
index 2f03cd9af0..53217006ea 100644
--- a/lib/unistr/u8-check.c
+++ b/lib/unistr/u8-check.c
@@ -57,13 +57,13 @@ u8_check (const uint8_t *s, size_t n)
continue;
}
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if (s + 4 <= s_end
&& (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
&& (s[3] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
s += 4;
continue;
diff --git a/lib/unistr/u8-mblen.c b/lib/unistr/u8-mblen.c
index a5f88dedef..d989afc437 100644
--- a/lib/unistr/u8-mblen.c
+++ b/lib/unistr/u8-mblen.c
@@ -47,13 +47,13 @@ u8_mblen (const uint8_t *s, size_t n)
&& (c != 0xed || s[1] < 0xa0))
return 3;
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if (n >= 4
&& (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
&& (s[3] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
return 4;
}
}
diff --git a/lib/unistr/u8-mbtouc-aux.c b/lib/unistr/u8-mbtouc-aux.c
index a6b7edcfb9..15568c3bc8 100644
--- a/lib/unistr/u8-mbtouc-aux.c
+++ b/lib/unistr/u8-mbtouc-aux.c
@@ -52,20 +52,15 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
{
if (n >= 3)
{
- if ((s[1] ^ 0x80) < 0x40)
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
{
if ((s[2] ^ 0x80) < 0x40)
{
- if ((c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
- {
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
- }
- /* invalid multibyte character */
- *puc = 0xfffd;
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
return 3;
}
/* invalid multibyte character */
@@ -73,38 +68,50 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
return 2;
}
/* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
}
else
{
- /* incomplete multibyte character */
*puc = 0xfffd;
- if (n == 1 || (s[1] ^ 0x80) >= 0x40)
- return 1;
+ if (n == 1)
+ {
+ /* incomplete multibyte character */
+ return 1;
+ }
else
- return 2;
+ {
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+ {
+ /* incomplete multibyte character */
+ return 2;
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 1;
+ }
+ }
}
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if (n >= 4)
{
- if ((s[1] ^ 0x80) < 0x40)
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xf1 || s[1] >= 0x90)
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if ((s[2] ^ 0x80) < 0x40)
{
if ((s[3] ^ 0x80) < 0x40)
{
- if ((c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
- }
- /* invalid multibyte character */
- *puc = 0xfffd;
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
return 4;
}
/* invalid multibyte character */
@@ -116,17 +123,48 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
return 2;
}
/* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
}
else
{
- /* incomplete multibyte character */
*puc = 0xfffd;
- if (n == 1 || (s[1] ^ 0x80) >= 0x40)
- return 1;
- else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
- return 2;
+ if (n == 1)
+ {
+ /* incomplete multibyte character */
+ return 1;
+ }
else
- return 3;
+ {
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xf1 || s[1] >= 0x90)
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
+ {
+ if (n == 2)
+ {
+ /* incomplete multibyte character */
+ return 2;
+ }
+ else
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ /* incomplete multibyte character */
+ return 3;
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 2;
+ }
+ }
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 1;
+ }
+ }
}
}
}
diff --git a/lib/unistr/u8-mbtouc.c b/lib/unistr/u8-mbtouc.c
index e30e5203c1..920ad6f558 100644
--- a/lib/unistr/u8-mbtouc.c
+++ b/lib/unistr/u8-mbtouc.c
@@ -62,20 +62,15 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
{
if (n >= 3)
{
- if ((s[1] ^ 0x80) < 0x40)
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
{
if ((s[2] ^ 0x80) < 0x40)
{
- if ((c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
- {
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
- }
- /* invalid multibyte character */
- *puc = 0xfffd;
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
return 3;
}
/* invalid multibyte character */
@@ -83,38 +78,50 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
return 2;
}
/* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
}
else
{
- /* incomplete multibyte character */
*puc = 0xfffd;
- if (n == 1 || (s[1] ^ 0x80) >= 0x40)
- return 1;
+ if (n == 1)
+ {
+ /* incomplete multibyte character */
+ return 1;
+ }
else
- return 2;
+ {
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+ {
+ /* incomplete multibyte character */
+ return 2;
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 1;
+ }
+ }
}
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if (n >= 4)
{
- if ((s[1] ^ 0x80) < 0x40)
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xf1 || s[1] >= 0x90)
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if ((s[2] ^ 0x80) < 0x40)
{
if ((s[3] ^ 0x80) < 0x40)
{
- if ((c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
- }
- /* invalid multibyte character */
- *puc = 0xfffd;
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
return 4;
}
/* invalid multibyte character */
@@ -126,17 +133,48 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
return 2;
}
/* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
}
else
{
- /* incomplete multibyte character */
*puc = 0xfffd;
- if (n == 1 || (s[1] ^ 0x80) >= 0x40)
- return 1;
- else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
- return 2;
+ if (n == 1)
+ {
+ /* incomplete multibyte character */
+ return 1;
+ }
else
- return 3;
+ {
+ if ((s[1] ^ 0x80) < 0x40
+ && (c >= 0xf1 || s[1] >= 0x90)
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
+ {
+ if (n == 2)
+ {
+ /* incomplete multibyte character */
+ return 2;
+ }
+ else
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ /* incomplete multibyte character */
+ return 3;
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 2;
+ }
+ }
+ }
+ else
+ {
+ /* invalid multibyte character */
+ return 1;
+ }
+ }
}
}
}
diff --git a/lib/unistr/u8-mbtoucr.c b/lib/unistr/u8-mbtoucr.c
index d09051128f..296062d233 100644
--- a/lib/unistr/u8-mbtoucr.c
+++ b/lib/unistr/u8-mbtoucr.c
@@ -86,13 +86,13 @@ u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n)
return -2;
}
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if (n >= 2)
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if (n >= 3)
{
diff --git a/lib/unistr/u8-prev.c b/lib/unistr/u8-prev.c
index 1012486b36..ad8a347c19 100644
--- a/lib/unistr/u8-prev.c
+++ b/lib/unistr/u8-prev.c
@@ -63,9 +63,9 @@ u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start)
{
uint8_t c_4 = s[-4];
- if (c_4 >= 0xf0 && c_4 < 0xf8
+ if (c_4 >= 0xf0 && c_4 <= 0xf4
&& (c_4 >= 0xf1 || c_3 >= 0x90)
- && (c_4 < 0xf4 || (c_4 == 0xf4 && c_3 < 0x90)))
+ && (c_4 < 0xf4 || (/* c_4 == 0xf4 && */ c_3 < 0x90)))
{
*puc = ((unsigned int) (c_4 & 0x07) << 18)
| ((unsigned int) (c_3 ^ 0x80) << 12)
diff --git a/lib/unistr/u8-strmblen.c b/lib/unistr/u8-strmblen.c
index 558771341a..a34a01fc14 100644
--- a/lib/unistr/u8-strmblen.c
+++ b/lib/unistr/u8-strmblen.c
@@ -51,12 +51,12 @@ u8_strmblen (const uint8_t *s)
&& (c != 0xed || s[1] < 0xa0))
return 3;
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
&& (s[3] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
return 4;
}
}
diff --git a/lib/unistr/u8-strmbtouc.c b/lib/unistr/u8-strmbtouc.c
index a47fbbb84f..259d3c2f37 100644
--- a/lib/unistr/u8-strmbtouc.c
+++ b/lib/unistr/u8-strmbtouc.c
@@ -63,12 +63,12 @@ u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
return 3;
}
}
- else if (c < 0xf8)
+ else if (c <= 0xf4)
{
if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
&& (s[3] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+ && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
*puc = ((unsigned int) (c & 0x07) << 18)
| ((unsigned int) (s[1] ^ 0x80) << 12)
diff --git a/tests/unistr/test-u8-mbsnlen.c b/tests/unistr/test-u8-mbsnlen.c
index c0b9b6e3f1..67b80d02a7 100644
--- a/tests/unistr/test-u8-mbsnlen.c
+++ b/tests/unistr/test-u8-mbsnlen.c
@@ -61,9 +61,18 @@ main ()
that a "malformed sequence" is interpreted in the same way as
"a character that is outside the adopted subset".
Reference:
+ ISO 10646-1 amendment 2
+ <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
Markus Kuhn: UTF-8 decoder capability and stress test
<https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
<https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+ These old specifications (from ca. 2000) were a bit ambiguous, and the
+ definition of UTF-8 has changed a bit as well. The newer specification
+ we obey is the Unicode Standard, version 15.
+ Reference:
+ Unicode Standard 15.0.0, section 3.9
+ <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>
+ pages 124..129, especially table 3-7.
*/
/* 3.1. Test that each unexpected continuation byte is signalled as a
malformed sequence of its own. */
@@ -97,9 +106,14 @@ main ()
}
/* 3.3.2. 3-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' };
ASSERT (u8_mbsnlen (input, 4) == 3);
}
+ {
+ /* Outdated example: 0xE0 0x80 is an ill-formed sequence. */
+ static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ ASSERT (u8_mbsnlen (input, 4) == 4);
+ }
/* 3.3.7. 3-byte sequence with last byte missing. */
{
static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
@@ -107,14 +121,24 @@ main ()
}
/* 3.3.3. 4-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' };
ASSERT (u8_mbsnlen (input, 5) == 3);
}
+ {
+ /* Outdated example: 0xF0 0x80 is an ill-formed sequence. */
+ static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ ASSERT (u8_mbsnlen (input, 5) == 5);
+ }
/* 3.3.8. 4-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' };
ASSERT (u8_mbsnlen (input, 5) == 3);
}
+ {
+ /* Outdated example: 0xF7 is an invalid first byte. */
+ static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ ASSERT (u8_mbsnlen (input, 5) == 5);
+ }
return 0;
}
diff --git a/tests/unistr/test-u8-mbtouc.c b/tests/unistr/test-u8-mbtouc.c
index 35c70c2193..a695ba1c70 100644
--- a/tests/unistr/test-u8-mbtouc.c
+++ b/tests/unistr/test-u8-mbtouc.c
@@ -34,9 +34,18 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
that a "malformed sequence" is interpreted in the same way as
"a character that is outside the adopted subset".
Reference:
+ ISO 10646-1 amendment 2
+ <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
Markus Kuhn: UTF-8 decoder capability and stress test
<https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
<https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+ These old specifications (from ca. 2000) were a bit ambiguous, and the
+ definition of UTF-8 has changed a bit as well. The newer specification
+ we obey is the Unicode Standard, version 15.
+ Reference:
+ Unicode Standard 15.0.0, section 3.9
+ <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>
+ pages 124..129, especially table 3-7.
*/
/* 3.1. Test that each unexpected continuation byte is signalled as a
malformed sequence of its own. */
@@ -118,7 +127,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
}
/* 3.3.2. 3-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' };
uc = 0xBADFACE;
ret = my_u8_mbtouc (&uc, input, 4);
ASSERT (ret == 1);
@@ -132,6 +141,26 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
ASSERT (ret == 1);
ASSERT (uc == 0x0022);
}
+ {
+ /* Outdated example: 0xE0 0x80 is an ill-formed sequence. */
+ static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
/* 3.3.7. 3-byte sequence with last byte missing. */
{
static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
@@ -150,7 +179,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
}
/* 3.3.3. 4-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' };
uc = 0xBADFACE;
ret = my_u8_mbtouc (&uc, input, 5);
ASSERT (ret == 1);
@@ -164,9 +193,33 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
ASSERT (ret == 1);
ASSERT (uc == 0x0022);
}
+ {
+ /* Outdated example: 0xF0 0x80 is an ill-formed sequence. */
+ static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 5);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 4, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
/* 3.3.8. 4-byte sequence with last byte missing. */
{
- static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' };
uc = 0xBADFACE;
ret = my_u8_mbtouc (&uc, input, 5);
ASSERT (ret == 1);
@@ -180,6 +233,30 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
ASSERT (ret == 1);
ASSERT (uc == 0x0022);
}
+ {
+ /* Outdated example: 0xF7 is an invalid first byte. */
+ static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 5);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 4, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
}
int
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- unistr/u8-*: Make Unicode decoder more Unicode Standard compliant,
Bruno Haible <=