[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Reducing iconv-induced memory usage
From: |
Ludovic Courtès |
Subject: |
Re: Reducing iconv-induced memory usage |
Date: |
Wed, 27 Apr 2011 00:41:05 +0200 |
User-agent: |
Gnus/5.110015 (No Gnus v0.15) Emacs/23.3 (gnu/linux) |
Hi!
So, here’s the patch.
It also makes UTF-8 input ~30% faster according to ports.bm (which
doesn’t benchmark output):
* before:
("ports.bm: peek-char: latin-1 port" 700000 user 0.36)
("ports.bm: peek-char: utf-8 port, ascii character" 700000 user 0.35)
("ports.bm: peek-char: utf-8 port, Korean character" 700000 user 0.61)
("ports.bm: read-char: latin-1 port" 10000000 user 3.32)
("ports.bm: read-char: utf-8 port, ascii character" 10000000 user 3.33)
("ports.bm: read-char: utf-8 port, Korean character" 10000000 user 3.39)
("ports.bm: char-ready?: latin-1 port" 10000000 user 2.95)
("ports.bm: char-ready?: utf-8 port, ascii character" 10000000 user 2.96)
("ports.bm: char-ready?: utf-8 port, Korean character" 10000000 user 3.01)
("ports.bm: rdelim: read-line" 1000 user 3.1)
* after:
("ports.bm: peek-char: latin-1 port" 700000 user 0.31)
("ports.bm: peek-char: utf-8 port, ascii character" 700000 user 0.24)
("ports.bm: peek-char: utf-8 port, Korean character" 700000 user 0.3)
("ports.bm: read-char: latin-1 port" 10000000 user 2.73)
("ports.bm: read-char: utf-8 port, ascii character" 10000000 user 3.38)
("ports.bm: read-char: utf-8 port, Korean character" 10000000 user 3.37)
("ports.bm: char-ready?: latin-1 port" 10000000 user 2.42)
("ports.bm: char-ready?: utf-8 port, ascii character" 10000000 user 2.41)
("ports.bm: char-ready?: utf-8 port, Korean character" 10000000 user 2.43)
("ports.bm: rdelim: read-line" 1000 user 1.91)
Comments? OK to apply?
Thanks,
Ludo’.
diff --git a/libguile/ports.c b/libguile/ports.c
index 6e0ae6c..d728356 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
switch (c)
{
case '\a':
+ case EOF:
break;
case '\b':
SCM_DECCOL (port);
@@ -1115,23 +1116,113 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t
size)
return codepoint;
}
-/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
- with the byte representation of the codepoint in PORT's encoding, and
- set *LEN to the length in bytes of that representation. Return 0 on
- success and an errno value on error. */
+/* Read a UTF-8 sequence from PORT. On success, return 0 and set
+ *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+ representation, and set *LEN to the length in bytes. Return
+ `EILSEQ' on error. */
static int
-get_codepoint (SCM port, scm_t_wchar *codepoint,
- char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+ scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ int byte;
+
+ *len = 0;
+
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF)
+ {
+ *codepoint = EOF;
+ return 0;
+ }
+
+ buf[0] = (scm_t_uint8) byte;
+ *len = 1;
+
+ if (buf[0] <= 0x7f)
+ *codepoint = buf[0];
+ else if ((buf[0] & 0xe0) == 0xc0)
+ {
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
+ | (buf[1] & 0x3f);
+ }
+ else if ((buf[0] & 0xf0) == 0xe0)
+ {
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+ | (buf[2] & 0x3f);
+ }
+ else
+ {
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF || ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ buf[3] = (scm_t_uint8) byte;
+ *len = 4;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+ | ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+ | (buf[3] & 0x3f);
+ }
+
+ return 0;
+
+ invalid_seq:
+ /* Return the faulty byte. */
+ scm_unget_byte (byte, port);
+
+ return EILSEQ;
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+ input conversion descriptor. */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{
+ scm_t_port *pt;
int err, byte_read;
size_t bytes_consumed, output_size;
char *output;
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
- if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
- /* Initialize the conversion descriptors. */
- scm_i_set_port_encoding_x (port, pt->encoding);
+ pt = SCM_PTAB_ENTRY (port);
for (output_size = 0, output = (char *) utf8_buf,
bytes_consumed = 0, err = 0;
@@ -1174,10 +1265,44 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
output_size = sizeof (utf8_buf) - output_left;
}
- if (SCM_UNLIKELY (err != 0))
+
+ if (SCM_LIKELY (err == 0))
+ {
+ /* Convert the UTF8_BUF sequence to a Unicode code point. */
+ *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+ *len = bytes_consumed;
+ }
+
+ return err;
+}
+
+/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
+ with the byte representation of the codepoint in PORT's encoding, and
+ set *LEN to the length in bytes of that representation. Return 0 on
+ success and an errno value on error. */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ int err;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+ if (pt->input_cd == (iconv_t) -1)
+ /* Initialize the conversion descriptors, if needed. */
+ scm_i_set_port_encoding_x (port, pt->encoding);
+
+ if (pt->input_cd == (iconv_t) -1)
+ err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
+ else
+ err = get_iconv_codepoint (port, codepoint, buf, len);
+
+ if (SCM_LIKELY (err == 0))
+ update_port_lf (*codepoint, port);
+ else
{
- /* Reset the `iconv' state. */
- iconv (pt->input_cd, NULL, NULL, NULL, NULL);
+ if (pt->input_cd != (iconv_t) -1)
+ /* Reset the `iconv' state. */
+ iconv (pt->input_cd, NULL, NULL, NULL, NULL);
if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
{
@@ -1189,14 +1314,6 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
input encoding errors.) */
}
- else
- /* Convert the UTF8_BUF sequence to a Unicode code point. */
- *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-
- if (SCM_LIKELY (err == 0))
- update_port_lf (*codepoint, port);
-
- *len = bytes_consumed;
return err;
}
@@ -2027,28 +2144,35 @@ scm_i_set_port_encoding_x (SCM port, const char
*encoding)
if (encoding == NULL)
encoding = "ISO-8859-1";
- pt->encoding = scm_gc_strdup (encoding, "port");
+ if (pt->encoding != encoding)
+ pt->encoding = scm_gc_strdup (encoding, "port");
- if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+ /* If ENCODING is UTF-8, then no conversion descriptor is opened
+ because we do I/O ourselves. This saves 100+ KiB for each
+ descriptor. */
+ if (strcmp (encoding, "UTF-8"))
{
- /* Open an input iconv conversion descriptor, from ENCODING
- to UTF-8. We choose UTF-8, not UTF-32, because iconv
- implementations can typically convert from anything to
- UTF-8, but not to UTF-32 (see
-
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
- new_input_cd = iconv_open ("UTF-8", encoding);
- if (new_input_cd == (iconv_t) -1)
- goto invalid_encoding;
- }
+ if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+ {
+ /* Open an input iconv conversion descriptor, from ENCODING
+ to UTF-8. We choose UTF-8, not UTF-32, because iconv
+ implementations can typically convert from anything to
+ UTF-8, but not to UTF-32 (see
+
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
+ new_input_cd = iconv_open ("UTF-8", encoding);
+ if (new_input_cd == (iconv_t) -1)
+ goto invalid_encoding;
+ }
- if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
- {
- new_output_cd = iconv_open (encoding, "UTF-8");
- if (new_output_cd == (iconv_t) -1)
+ if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
{
- if (new_input_cd != (iconv_t) -1)
- iconv_close (new_input_cd);
- goto invalid_encoding;
+ new_output_cd = iconv_open (encoding, "UTF-8");
+ if (new_output_cd == (iconv_t) -1)
+ {
+ if (new_input_cd != (iconv_t) -1)
+ iconv_close (new_input_cd);
+ goto invalid_encoding;
+ }
}
}
diff --git a/libguile/print.c b/libguile/print.c
index 1399566..d18c054 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -821,33 +821,58 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
return len;
}
-/* Display the LEN codepoints in STR to PORT according to STRATEGY;
- return the number of codepoints successfully displayed. If NARROW_P,
- then STR is interpreted as a sequence of `char', denoting a Latin-1
- string; otherwise it's interpreted as a sequence of
- `scm_t_wchar'. */
-static size_t
-display_string (const void *str, int narrow_p,
- size_t len, SCM port,
- scm_t_string_failed_conversion_handler strategy)
-
-{
#define STR_REF(s, x) \
(narrow_p \
? (scm_t_wchar) ((unsigned char *) (s))[x] \
: ((scm_t_wchar *) (s))[x])
+/* Write STR to PORT as UTF-8. STR is a LEN-codepoint string; it is
+ narrow if NARROW_P is true, wide otherwise. Return LEN. */
+static size_t
+display_string_as_utf8 (const void *str, int narrow_p, size_t len,
+ SCM port)
+{
+ size_t printed = 0;
+
+ while (len > printed)
+ {
+ size_t utf8_len, i;
+ char *input, utf8_buf[256];
+
+ /* Convert STR to UTF-8. */
+ for (i = printed, utf8_len = 0, input = utf8_buf;
+ i < len && utf8_len + 4 < sizeof (utf8_buf);
+ i++)
+ {
+ utf8_len += codepoint_to_utf8 (STR_REF (str, i),
+ (scm_t_uint8 *) input);
+ input = utf8_buf + utf8_len;
+ }
+
+ /* INPUT was successfully converted, entirely; print the
+ result. */
+ scm_lfwrite (utf8_buf, utf8_len, port);
+ printed += i - printed;
+ }
+
+ assert (printed == len);
+
+ return len;
+}
+
+/* Convert STR through PORT's output conversion descriptor and write the
+ output to PORT. Return the number of codepoints written. */
+static size_t
+display_string_using_iconv (const void *str, int narrow_p, size_t len,
+ SCM port,
+ scm_t_string_failed_conversion_handler strategy)
+{
size_t printed;
scm_t_port *pt;
pt = SCM_PTAB_ENTRY (port);
- if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
- /* Initialize the conversion descriptors. */
- scm_i_set_port_encoding_x (port, pt->encoding);
-
printed = 0;
-
while (len > printed)
{
size_t done, utf8_len, input_left, output_left, i;
@@ -880,7 +905,7 @@ display_string (const void *str, int narrow_p,
if (SCM_UNLIKELY (done == (size_t) -1))
{
- int errno_save = errno;
+ int errno_save = errno;
/* Reset the `iconv' state. */
iconv (pt->output_cd, NULL, NULL, NULL, NULL);
@@ -928,7 +953,34 @@ display_string (const void *str, int narrow_p,
}
return printed;
+}
+
#undef STR_REF
+
+/* Display the LEN codepoints in STR to PORT according to STRATEGY;
+ return the number of codepoints successfully displayed. If NARROW_P,
+ then STR is interpreted as a sequence of `char', denoting a Latin-1
+ string; otherwise it's interpreted as a sequence of
+ `scm_t_wchar'. */
+static size_t
+display_string (const void *str, int narrow_p,
+ size_t len, SCM port,
+ scm_t_string_failed_conversion_handler strategy)
+
+{
+ scm_t_port *pt;
+
+ pt = SCM_PTAB_ENTRY (port);
+
+ if (pt->output_cd == (iconv_t) -1)
+ /* Initialize the conversion descriptors, if needed. */
+ scm_i_set_port_encoding_x (port, pt->encoding);
+
+ if (pt->output_cd == (iconv_t) -1)
+ return display_string_as_utf8 (str, narrow_p, len, port);
+ else
+ return display_string_using_cd (str, narrow_p, len,
+ port, strategy);
}
/* Attempt to display CH to PORT according to STRATEGY. Return non-zero
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 9d3000c..d5b1b60 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -391,7 +391,8 @@
(with-fluids ((%default-port-encoding e))
(call-with-output-string
(lambda (p)
- (display (port-encoding p) p)))))
+ (and (string=? e (port-encoding p))
+ (display (port-encoding p) p))))))
encodings)
encodings)))
@@ -462,6 +463,15 @@
(= (port-line p) 0)
(= (port-column p) 0))))
+ (pass-if "peek-char [utf-16]"
+ (let ((p (with-fluids ((%default-port-encoding "UTF-16BE"))
+ (open-input-string "안녕하세요"))))
+ (and (char=? (peek-char p) #\안)
+ (char=? (peek-char p) #\안)
+ (char=? (peek-char p) #\안)
+ (= (port-line p) 0)
+ (= (port-column p) 0))))
+
(pass-if "read-char, wrong encoding, error"
(let ((p (open-bytevector-input-port #vu8(255 65 66 67))))
(catch 'decoding-error