>From 5af2c2b651c8f997279fdd7799d0b353205fe236 Mon Sep 17 00:00:00 2001 From: Michael Gran Date: Sun, 2 Aug 2009 16:26:07 -0700 Subject: [PATCH] Add Unicode strings and symbols This adds full Unicode strings as a datatype, and it adds some minimal functionality. The terminal and port encoding is assumed to be ISO-8859-1. Non-ISO-8859-1 characters are written or input as string character escapes. The string character escapes now have 3 forms: \xXX \uXXXX and \UXXXXXX, for unprintable characters that have 2, 4 or 6 hex digits. The process for writing to strings has been modified. There is now a function scm_i_string_start_writing that does the copy-on-write conversion if necessary. To compile strings that may be wide, the VM storage of strings and string-likes has changed. Most string-using functions have not yet been updated and may break when used with wide strings. * module/language/assembly/compile-bytecode.scm (write-bytecode): use variable width string bytecode format * module/language/assembly.scm (byte-length): use variable width bytecode format * libguile/vm-i-loader.c (load-string, load-symbol): (load-keyword, define): use variable-width bytecode format * libguile/vm-engine.h (FETCH_WIDTH): new macro * libguile/strings.h: new declarations * libguile/strings.c (make_wide_stringbuf): new function (widen_stringbuf): new function (scm_i_make_wide_string): new function (scm_i_is_narrow_string): new function (scm_i_string_wide_chars): new function (scm_i_string_start_writing): new function (scm_i_string_ref): new function (scm_i_string_set_x): new function (scm_i_is_narrow_symbol): new function (scm_i_symbol_wide_chars, scm_i_symbol_ref): new function (scm_string_width): new function (unistring_escapes_to_guile_escapes): new function (scm_to_stringn): new function (scm_i_stringbuf_free): modify for wide strings (scm_i_substring_copy): modify for wide strings (scm_i_string_chars, scm_string_append): modify for wide strings (scm_i_make_symbol, scm_to_locale_stringn): modify for wide strings (scm_string_dump, scm_symbol_dump, scm_to_locale_stringbuf): (scm_string, scm_i_deprecated_string_chars): modify for wide strings (scm_from_locale_string, scm_from_locale_stringn): add null test * libguile/srfi-13.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing (scm_string_for_each): modify for wide strings * libguile/socket.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing * libguile/rw.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing * libguile/read.c (scm_read_string): allow reading of wide strings * libguile/print.h: add declaration for scm_charprint * libguile/print.c (iprin1): print wide strings and add new string escapes (scm_charprint): new function * libguile/ports.h: new declarations for scm_lfwrite_substr and scm_lfwrite_str * libguile/ports.c (update_port_lf): new function (scm_lfwrite): use update_port_lf (scm_lfwrite_substr): new function (scm_lfwrite_str): new function --- libguile/ports.c | 90 +++- libguile/ports.h | 3 + libguile/print.c | 157 +++++-- libguile/print.h | 1 + libguile/read.c | 233 ++++++---- libguile/rw.c | 2 + libguile/socket.c | 3 + libguile/srfi-13.c | 19 +- libguile/strings.c | 639 +++++++++++++++++++++---- libguile/strings.h | 59 ++- libguile/vm-engine.h | 1 + libguile/vm-i-loader.c | 87 +++- module/language/assembly.scm | 12 +- module/language/assembly/compile-bytecode.scm | 26 +- 14 files changed, 1031 insertions(+), 301 deletions(-) diff --git a/libguile/ports.c b/libguile/ports.c index 627fd3f..2c1a389 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -969,7 +969,35 @@ scm_fill_input (SCM port) * This function differs from scm_c_write; it updates port line and * column. */ -void +static void +update_port_lf (scm_t_wchar c, SCM port) +{ + if (c == '\a') + { + } + else if (c == '\b') + { + SCM_DECCOL (port); + } + else if (c == '\n') + { + SCM_INCLINE (port); + } + else if (c == '\r') + { + SCM_ZEROCOL (port); + } + else if (c == '\t') + { + SCM_TABCOL (port); + } + else + { + SCM_INCCOL (port); + } +} + +void scm_lfwrite (const char *ptr, size_t size, SCM port) { scm_t_port *pt = SCM_PTAB_ENTRY (port); @@ -980,30 +1008,54 @@ scm_lfwrite (const char *ptr, size_t size, SCM port) ptob->write (port, ptr, size); - for (; size; ptr++, size--) { - if (*ptr == '\a') { - } - else if (*ptr == '\b') { - SCM_DECCOL(port); - } - else if (*ptr == '\n') { - SCM_INCLINE(port); - } - else if (*ptr == '\r') { - SCM_ZEROCOL(port); - } - else if (*ptr == '\t') { - SCM_TABCOL(port); - } - else { - SCM_INCCOL(port); + for (; size; ptr++, size--) + update_port_lf ((scm_t_wchar) (unsigned char) *ptr, port); + + if (pt->rw_random) + pt->rw_active = SCM_PORT_WRITE; +} + +/* Write a scheme string STR to PORT from START inclusive to END + exclusive. */ +void +scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port) +{ + size_t i, size = scm_i_string_length (str); + scm_t_port *pt = SCM_PTAB_ENTRY (port); + scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)]; + scm_t_wchar p; + char *buf; + size_t len; + + if (pt->rw_active == SCM_PORT_READ) + scm_end_input (port); + + if (end == -1) + end = size; + size = end - start; + + buf = scm_to_stringn (scm_c_substring (str, start, end), &len, + NULL, iconveh_escape_sequence); + ptob->write (port, buf, len); + free (buf); + + for (i = 0; i < size; i++) + { + p = scm_i_string_ref (str, i + start); + update_port_lf (p, port); } - } if (pt->rw_random) pt->rw_active = SCM_PORT_WRITE; } +/* Write a scheme string STR to PORT. */ +void +scm_lfwrite_str (SCM str, SCM port) +{ + scm_lfwrite_substr (str, 0, -1, port); +} + /* scm_c_read * * Used by an application to read arbitrary number of bytes from an diff --git a/libguile/ports.h b/libguile/ports.h index 8a21b09..d427fec 100644 --- a/libguile/ports.h +++ b/libguile/ports.h @@ -269,6 +269,9 @@ SCM_API SCM scm_read_char (SCM port); SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size); SCM_API void scm_c_write (SCM port, const void *buffer, size_t size); SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port); +SCM_INTERNAL void scm_lfwrite_str (SCM str, SCM port); +SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end, + SCM port); SCM_API void scm_flush (SCM port); SCM_API void scm_end_input (SCM port); SCM_API int scm_fill_input (SCM port); diff --git a/libguile/print.c b/libguile/print.c index f43856b..6f31fcf 100644 --- a/libguile/print.c +++ b/libguile/print.c @@ -559,55 +559,113 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) break; } break; - case scm_tc7_string: - if (SCM_WRITINGP (pstate)) - { - size_t i, j, len; - const char *data; + case scm_tc7_string: + if (SCM_WRITINGP (pstate)) + { + size_t i, j, len; + static char const hex[] = "0123456789abcdef"; + char buf[8]; - scm_putc ('"', port); - len = scm_i_string_length (exp); - data = scm_i_string_chars (exp); - for (i = 0, j = 0; i < len; ++i) - { - unsigned char ch = data[i]; - if ((ch < 32 && ch != '\n') || (127 <= ch && ch < 148)) - { - static char const hex[]="0123456789abcdef"; - char buf[4]; - - scm_lfwrite (data+j, i-j, port); - buf[0] = '\\'; - buf[1] = 'x'; - buf[2] = hex [ch / 16]; - buf[3] = hex [ch % 16]; - scm_lfwrite (buf, 4, port); - data = scm_i_string_chars (exp); - j = i+1; - } - else if (ch == '"' || ch == '\\') - { - scm_lfwrite (data+j, i-j, port); - scm_putc ('\\', port); - data = scm_i_string_chars (exp); - j = i; - } - } - scm_lfwrite (data+j, i-j, port); - scm_putc ('"', port); - scm_remember_upto_here_1 (exp); - } - else - scm_lfwrite (scm_i_string_chars (exp), scm_i_string_length (exp), - port); - scm_remember_upto_here_1 (exp); - break; + + scm_putc ('"', port); + len = scm_i_string_length (exp); + for (i = 0; i < len; ++i) + { + scm_t_wchar ch = scm_i_string_ref (exp, i); + int printed = 0; + + if (ch == ' ' || ch == '\n') + { + scm_putc (ch, port); + printed = 1; + } + else if (ch == '"' || ch == '\\') + { + scm_putc ('\\', port); + scm_charprint (ch, port); + printed = 1; + } + else + if (uc_is_general_category_withtable + (ch, + UC_CATEGORY_MASK_L | UC_CATEGORY_MASK_M | + UC_CATEGORY_MASK_N | UC_CATEGORY_MASK_P | + UC_CATEGORY_MASK_S)) + { + /* Print the character since it is a graphic + character. */ + scm_t_wchar *wbuf; + SCM wstr = scm_i_make_wide_string (1, &wbuf); + char *buf; + size_t len; + + wbuf[0] = ch; + + buf = u32_conv_to_encoding ("ISO-8859-1", + iconveh_error, + (scm_t_uint32 *) wbuf, + 1, NULL, NULL, &len); + if (buf != NULL) + { + /* Character is graphic and representable in + this encoding. Print it. */ + scm_lfwrite_str (wstr, port); + free (buf); + printed = 1; + } + } + + if (!printed) + { + /* Character is graphic but unrepresentable in + this port's encoding or is not graphic. */ + if (ch <= 0xFF) + { + buf[0] = '\\'; + buf[1] = 'x'; + buf[2] = hex[ch / 16]; + buf[3] = hex[ch % 16]; + scm_lfwrite (buf, 4, port); + } + else if (ch <= 0xFFFF) + { + buf[0] = '\\'; + buf[1] = 'u'; + buf[2] = hex[(ch & 0xF000) >> 12]; + buf[3] = hex[(ch & 0xF00) >> 8]; + buf[4] = hex[(ch & 0xF0) >> 4]; + buf[5] = hex[(ch & 0xF)]; + scm_lfwrite (buf, 6, port); + j = i + 1; + } + else if (ch > 0xFFFF) + { + buf[0] = '\\'; + buf[1] = 'U'; + buf[2] = hex[(ch & 0xF00000) >> 20]; + buf[3] = hex[(ch & 0xF0000) >> 16]; + buf[4] = hex[(ch & 0xF000) >> 12]; + buf[5] = hex[(ch & 0xF00) >> 8]; + buf[6] = hex[(ch & 0xF0) >> 4]; + buf[7] = hex[(ch & 0xF)]; + scm_lfwrite (buf, 8, port); + j = i + 1; + } + } + } + scm_putc ('"', port); + scm_remember_upto_here_1 (exp); + } + else + scm_lfwrite (scm_i_string_chars (exp), scm_i_string_length (exp), + port); + scm_remember_upto_here_1 (exp); + break; case scm_tc7_symbol: if (scm_i_symbol_is_interned (exp)) { scm_print_symbol_name (scm_i_symbol_chars (exp), - scm_i_symbol_length (exp), - port); + scm_i_symbol_length (exp), port); scm_remember_upto_here_1 (exp); } else @@ -763,6 +821,17 @@ scm_prin1 (SCM exp, SCM port, int writingp) } } +/* Print a character. + */ +void +scm_charprint (scm_t_uint32 ch, SCM port) +{ + scm_t_wchar *wbuf; + SCM wstr = scm_i_make_wide_string (1, &wbuf); + + wbuf[0] = ch; + scm_lfwrite_str (wstr, port); +} /* Print an integer. */ diff --git a/libguile/print.h b/libguile/print.h index d817a6f..1df2952 100644 --- a/libguile/print.h +++ b/libguile/print.h @@ -77,6 +77,7 @@ SCM_API SCM scm_print_options (SCM setting); SCM_API SCM scm_make_print_state (void); SCM_API void scm_free_print_state (SCM print_state); SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state); +SCM_API void scm_charprint (scm_t_uint32 c, SCM port); SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port); SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port); SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port); diff --git a/libguile/read.c b/libguile/read.c index 2140fed..577a73e 100644 --- a/libguile/read.c +++ b/libguile/read.c @@ -387,110 +387,167 @@ scm_read_string (int chr, SCM port) object (the string returned). */ SCM str = SCM_BOOL_F; - char c_str[READER_STRING_BUFFER_SIZE]; unsigned c_str_len = 0; - int c; + scm_t_wchar c; + str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL); while ('"' != (c = scm_getc (port))) { if (c == EOF) - str_eof: scm_i_input_error (FUNC_NAME, port, - "end of file in string constant", - SCM_EOL); - - if (c_str_len + 1 >= sizeof (c_str)) - { - /* Flush the C buffer onto a Scheme string. */ - SCM addy; + { + str_eof: + scm_i_input_error (FUNC_NAME, port, + "end of file in string constant", SCM_EOL); + } - if (str == SCM_BOOL_F) - str = scm_c_make_string (0, SCM_MAKE_CHAR ('X')); + if (c_str_len + 1 >= scm_i_string_length (str)) + { + SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL); - addy = scm_from_locale_stringn (c_str, c_str_len); - str = scm_string_append_shared (scm_list_2 (str, addy)); - - c_str_len = 0; - } + str = scm_string_append (scm_list_2 (str, addy)); + } if (c == '\\') - switch (c = scm_getc (port)) - { - case EOF: - goto str_eof; - case '"': - case '\\': - break; + { + switch (c = scm_getc (port)) + { + case EOF: + goto str_eof; + case '"': + case '\\': + break; #if SCM_ENABLE_ELISP - case '(': - case ')': - if (SCM_ESCAPED_PARENS_P) - break; - goto bad_escaped; + case '(': + case ')': + if (SCM_ESCAPED_PARENS_P) + break; + goto bad_escaped; #endif - case '\n': - continue; - case '0': - c = '\0'; - break; - case 'f': - c = '\f'; - break; - case 'n': - c = '\n'; - break; - case 'r': - c = '\r'; - break; - case 't': - c = '\t'; - break; - case 'a': - c = '\007'; - break; - case 'v': - c = '\v'; - break; - case 'x': - { - int a, b; - a = scm_getc (port); - if (a == EOF) goto str_eof; - b = scm_getc (port); - if (b == EOF) goto str_eof; - if ('0' <= a && a <= '9') a -= '0'; - else if ('A' <= a && a <= 'F') a = a - 'A' + 10; - else if ('a' <= a && a <= 'f') a = a - 'a' + 10; - else goto bad_escaped; - if ('0' <= b && b <= '9') b -= '0'; - else if ('A' <= b && b <= 'F') b = b - 'A' + 10; - else if ('a' <= b && b <= 'f') b = b - 'a' + 10; - else goto bad_escaped; - c = a * 16 + b; - break; - } - default: - bad_escaped: - scm_i_input_error (FUNC_NAME, port, - "illegal character in escape sequence: ~S", - scm_list_1 (SCM_MAKE_CHAR (c))); - } - c_str[c_str_len++] = c; + case '\n': + continue; + case '0': + c = '\0'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'a': + c = '\007'; + break; + case 'v': + c = '\v'; + break; + case 'x': + { + scm_t_wchar a, b; + a = scm_getc (port); + if (a == EOF) + goto str_eof; + b = scm_getc (port); + if (b == EOF) + goto str_eof; + if ('0' <= a && a <= '9') + a -= '0'; + else if ('A' <= a && a <= 'F') + a = a - 'A' + 10; + else if ('a' <= a && a <= 'f') + a = a - 'a' + 10; + else + { + c = a; + goto bad_escaped; + } + if ('0' <= b && b <= '9') + b -= '0'; + else if ('A' <= b && b <= 'F') + b = b - 'A' + 10; + else if ('a' <= b && b <= 'f') + b = b - 'a' + 10; + else + { + c = b; + goto bad_escaped; + } + c = a * 16 + b; + break; + } + case 'u': + { + scm_t_wchar a; + int i; + c = 0; + for (i = 0; i < 4; i++) + { + a = scm_getc (port); + if (a == EOF) + goto str_eof; + if ('0' <= a && a <= '9') + a -= '0'; + else if ('A' <= a && a <= 'F') + a = a - 'A' + 10; + else if ('a' <= a && a <= 'f') + a = a - 'a' + 10; + else + { + c = a; + goto bad_escaped; + } + c = c * 16 + a; + } + break; + } + case 'U': + { + scm_t_wchar a; + int i; + c = 0; + for (i = 0; i < 6; i++) + { + a = scm_getc (port); + if (a == EOF) + goto str_eof; + if ('0' <= a && a <= '9') + a -= '0'; + else if ('A' <= a && a <= 'F') + a = a - 'A' + 10; + else if ('a' <= a && a <= 'f') + a = a - 'a' + 10; + else + { + c = a; + goto bad_escaped; + } + c = c * 16 + a; + } + break; + } + default: + bad_escaped: + scm_i_input_error (FUNC_NAME, port, + "illegal character in escape sequence: ~S", + scm_list_1 (SCM_MAKE_CHAR (c))); + } + } + str = scm_i_string_start_writing (str); + scm_i_string_set_x (str, c_str_len++, c); + scm_i_string_stop_writing (); } if (c_str_len > 0) { - SCM addy; - - addy = scm_from_locale_stringn (c_str, c_str_len); - if (str == SCM_BOOL_F) - str = addy; - else - str = scm_string_append_shared (scm_list_2 (str, addy)); + return scm_i_substring_copy (str, 0, c_str_len); } - else - str = (str == SCM_BOOL_F) ? scm_nullstr : str; - - return str; + + return scm_nullstr; } #undef FUNC_NAME diff --git a/libguile/rw.c b/libguile/rw.c index cb62b79..a9b4a32 100644 --- a/libguile/rw.c +++ b/libguile/rw.c @@ -131,6 +131,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0, don't touch the file descriptor. otherwise the "return immediately if something is available" rule may be violated. */ + str = scm_i_string_start_writing (str); dest = scm_i_string_writable_chars (str) + offset; chars_read = scm_take_from_input_buffers (port, dest, read_len); scm_i_string_stop_writing (); @@ -140,6 +141,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0, if (chars_read == 0 && read_len > 0) /* don't confuse read_len == 0 with EOF. */ { + str = scm_i_string_start_writing (str); dest = scm_i_string_writable_chars (str) + offset; SCM_SYSCALL (chars_read = read (fdes, dest, read_len)); scm_i_string_stop_writing (); diff --git a/libguile/socket.c b/libguile/socket.c index 553a1a1..2e02e90 100644 --- a/libguile/socket.c +++ b/libguile/socket.c @@ -1438,6 +1438,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0, fd = SCM_FPORT_FDES (sock); len = scm_i_string_length (buf); + buf = scm_i_string_start_writing (buf); dest = scm_i_string_writable_chars (buf); SCM_SYSCALL (rv = recv (fd, dest, len, flg)); scm_i_string_stop_writing (); @@ -1482,6 +1483,7 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0, fd = SCM_FPORT_FDES (sock); len = scm_i_string_length (message); + message = scm_i_string_start_writing (message); src = scm_i_string_writable_chars (message); SCM_SYSCALL (rv = send (fd, src, len, flg)); scm_i_string_stop_writing (); @@ -1550,6 +1552,7 @@ SCM_DEFINE (scm_recvfrom, "recvfrom!", 2, 3, 0, /* recvfrom will not necessarily return an address. usually nothing is returned for stream sockets. */ + str = scm_i_string_start_writing (str); buf = scm_i_string_writable_chars (str); ((struct sockaddr *) &addr)->sa_family = AF_UNSPEC; SCM_SYSCALL (rv = recvfrom (fd, buf + offset, diff --git a/libguile/srfi-13.c b/libguile/srfi-13.c index f3863d3..a66ede8 100644 --- a/libguile/srfi-13.c +++ b/libguile/srfi-13.c @@ -549,6 +549,7 @@ SCM_DEFINE (scm_string_copy_x, "string-copy!", 3, 2, 0, len = cend - cstart; SCM_ASSERT_RANGE (3, s, len <= scm_i_string_length (target) - ctstart); + target = scm_i_string_start_writing (target); ctarget = scm_i_string_writable_chars (target); memmove (ctarget + ctstart, cstr + cstart, len); scm_i_string_stop_writing (); @@ -985,6 +986,7 @@ SCM_DEFINE (scm_substring_fill_x, "string-fill!", 2, 2, 0, 4, end, cend); SCM_VALIDATE_CHAR_COPY (2, chr, c); + str = scm_i_string_start_writing (str); cstr = scm_i_string_writable_chars (str); for (k = cstart; k < cend; k++) cstr[k] = c; @@ -2376,6 +2378,7 @@ string_upcase_x (SCM v, size_t start, size_t end) size_t k; char *dst; + v = scm_i_string_start_writing (v); dst = scm_i_string_writable_chars (v); for (k = start; k < end; ++k) dst[k] = scm_c_upcase (dst[k]); @@ -2442,6 +2445,7 @@ string_downcase_x (SCM v, size_t start, size_t end) size_t k; char *dst; + v = scm_i_string_start_writing (v); dst = scm_i_string_writable_chars (v); for (k = start; k < end; ++k) dst[k] = scm_c_downcase (dst[k]); @@ -2511,6 +2515,7 @@ string_titlecase_x (SCM str, size_t start, size_t end) size_t i; int in_word = 0; + str = scm_i_string_start_writing (str); sz = (unsigned char *) scm_i_string_writable_chars (str); for(i = start; i < end; i++) { @@ -2635,6 +2640,7 @@ SCM_DEFINE (scm_string_reverse, "string-reverse", 1, 2, 0, 2, start, cstart, 3, end, cend); result = scm_string_copy (str); + result = scm_i_string_start_writing (result); ctarget = scm_i_string_writable_chars (result); string_reverse_x (ctarget, cstart, cend); scm_i_string_stop_writing (); @@ -2658,6 +2664,7 @@ SCM_DEFINE (scm_string_reverse_x, "string-reverse!", 1, 2, 0, 2, start, cstart, 3, end, cend); + str = scm_i_string_start_writing (str); cstr = scm_i_string_writable_chars (str); string_reverse_x (cstr, cstart, cend); scm_i_string_stop_writing (); @@ -3018,19 +3025,16 @@ SCM_DEFINE (scm_string_for_each, "string-for-each", 2, 2, 0, "return value is not specified.") #define FUNC_NAME s_scm_string_for_each { - const char *cstr; size_t cstart, cend; scm_t_trampoline_1 proc_tramp = scm_trampoline_1 (proc); SCM_ASSERT (proc_tramp, proc, SCM_ARG1, FUNC_NAME); - MY_VALIDATE_SUBSTRING_SPEC_COPY (2, s, cstr, - 3, start, cstart, - 4, end, cend); + MY_VALIDATE_SUBSTRING_SPEC (2, s, + 3, start, cstart, + 4, end, cend); while (cstart < cend) { - unsigned int c = (unsigned char) cstr[cstart]; - proc_tramp (proc, SCM_MAKE_CHAR (c)); - cstr = scm_i_string_chars (s); + proc_tramp (proc, SCM_MAKE_CHAR (scm_i_string_ref (s, cstart))); cstart++; } @@ -3162,6 +3166,7 @@ SCM_DEFINE (scm_string_xcopy_x, "string-xcopy!", 4, 3, 0, SCM_ASSERT_RANGE (1, tstart, ctstart + (csto - csfrom) <= scm_i_string_length (target)); + target = scm_i_string_start_writing (target); p = scm_i_string_writable_chars (target) + ctstart; cs = scm_i_string_chars (s); while (csfrom < csto) diff --git a/libguile/strings.c b/libguile/strings.c index 4e21f3e..e2fc664 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -24,6 +24,8 @@ #include #include +#include +#include #include "libguile/_scm.h" #include "libguile/chars.h" @@ -69,10 +71,12 @@ #define STRINGBUF_F_SHARED 0x100 #define STRINGBUF_F_INLINE 0x200 +#define STRINGBUF_F_WIDE 0x400 #define STRINGBUF_TAG scm_tc7_stringbuf #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED) #define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE) +#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE) #define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf)) #define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf)) @@ -82,6 +86,7 @@ #define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \ ? STRINGBUF_INLINE_CHARS (buf) \ : STRINGBUF_OUTLINE_CHARS (buf)) +#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf)) #define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \ ? STRINGBUF_INLINE_LENGTH (buf) \ : STRINGBUF_OUTLINE_LENGTH (buf)) @@ -126,6 +131,23 @@ make_stringbuf (size_t len) } } +static SCM +make_wide_stringbuf (size_t len) +{ + scm_t_wchar *mem; +#if SCM_DEBUG + if (len < 1000) + lenhist[len]++; + else + lenhist[1000]++; +#endif + + mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string"); + mem[len] = 0; + return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem, + (scm_t_bits) len, (scm_t_bits) 0); +} + /* Return a new stringbuf whose underlying storage consists of the LEN+1 octets pointed to by STR (the last octet is zero). */ SCM @@ -147,8 +169,58 @@ void scm_i_stringbuf_free (SCM buf) { if (!STRINGBUF_INLINE (buf)) - scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), - STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string"); + { + if (!STRINGBUF_WIDE (buf)) + scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), + STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string"); + else + scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), + sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf) + + 1), "string"); + } + +} + +static void +widen_stringbuf (SCM buf) +{ + size_t i, len; + scm_t_wchar *mem; + + if (STRINGBUF_WIDE (buf)) + return; + + if (STRINGBUF_INLINE (buf)) + { + len = STRINGBUF_INLINE_LENGTH (buf); + + mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string"); + for (i = 0; i < len; i++) + mem[i] = + (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i]; + mem[len] = 0; + + SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE); + SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE); + SCM_SET_CELL_WORD_1 (buf, mem); + SCM_SET_CELL_WORD_2 (buf, len); + } + else + { + len = STRINGBUF_OUTLINE_LENGTH (buf); + + mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string"); + for (i = 0; i < len; i++) + mem[i] = + (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i]; + mem[len] = 0; + + scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string"); + + SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE); + SCM_SET_CELL_WORD_1 (buf, mem); + SCM_SET_CELL_WORD_2 (buf, len); + } } scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER; @@ -195,6 +267,18 @@ scm_i_make_string (size_t len, char **charsp) return res; } +SCM +scm_i_make_wide_string (size_t len, scm_t_wchar ** charsp) +{ + SCM buf = make_wide_stringbuf (len); + SCM res; + if (charsp) + *charsp = STRINGBUF_WIDE_CHARS (buf); + res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf), + (scm_t_bits) 0, (scm_t_bits) len); + return res; +} + static void validate_substring_args (SCM str, size_t start, size_t end) { @@ -253,12 +337,24 @@ scm_i_substring_copy (SCM str, size_t start, size_t end) SCM buf, my_buf; size_t str_start; get_str_buf_start (&str, &buf, &str_start); - my_buf = make_stringbuf (len); - memcpy (STRINGBUF_CHARS (my_buf), - STRINGBUF_CHARS (buf) + str_start + start, len); + if (scm_i_is_narrow_string (str)) + { + my_buf = make_stringbuf (len); + memcpy (STRINGBUF_CHARS (my_buf), + STRINGBUF_CHARS (buf) + str_start + start, len); + } + else + { + my_buf = make_wide_stringbuf (len); + u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf), + (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + str_start + + start, len); + /* Even though this string is wide, the substring may be narrow. + Consider adding code to narrow string. */ + } scm_remember_upto_here_1 (buf); - return scm_double_cell (STRING_TAG, SCM_UNPACK(my_buf), - (scm_t_bits)0, (scm_t_bits) len); + return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf), + (scm_t_bits) 0, (scm_t_bits) len); } SCM @@ -330,17 +426,42 @@ scm_i_string_length (SCM str) return STRING_LENGTH (str); } +int +scm_i_is_narrow_string (SCM str) +{ + return !STRINGBUF_WIDE (STRING_STRINGBUF (str)); +} + const char * scm_i_string_chars (SCM str) { SCM buf; size_t start; get_str_buf_start (&str, &buf, &start); - return STRINGBUF_CHARS (buf) + start; + if (scm_i_is_narrow_string (str)) + return STRINGBUF_CHARS (buf) + start; + else + scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s", + scm_list_1 (str)); + return NULL; } -char * -scm_i_string_writable_chars (SCM orig_str) +const scm_t_wchar * +scm_i_string_wide_chars (SCM str) +{ + SCM buf; + size_t start; + + get_str_buf_start (&str, &buf, &start); + if (!scm_i_is_narrow_string (str)) + return STRINGBUF_WIDE_CHARS (buf) + start; + else + scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s", + scm_list_1 (str)); +} + +SCM +scm_i_string_start_writing (SCM orig_str) { SCM buf, str = orig_str; size_t start; @@ -352,18 +473,26 @@ scm_i_string_writable_chars (SCM orig_str) scm_i_pthread_mutex_lock (&stringbuf_write_mutex); if (STRINGBUF_SHARED (buf)) { - /* Clone stringbuf. For this, we put all threads to sleep. - */ - + /* Clone the stringbuf. */ size_t len = STRING_LENGTH (str); SCM new_buf; scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - new_buf = make_stringbuf (len); - memcpy (STRINGBUF_CHARS (new_buf), - STRINGBUF_CHARS (buf) + STRING_START (str), len); - + if (scm_i_is_narrow_string (str)) + { + new_buf = make_stringbuf (len); + memcpy (STRINGBUF_CHARS (new_buf), + STRINGBUF_CHARS (buf) + STRING_START (str), len); + + } + else + { + new_buf = make_wide_stringbuf (len); + u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), + (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + + STRING_START (str), len); + } scm_i_thread_put_to_sleep (); SET_STRING_STRINGBUF (str, new_buf); start -= STRING_START (str); @@ -374,8 +503,36 @@ scm_i_string_writable_chars (SCM orig_str) scm_i_pthread_mutex_lock (&stringbuf_write_mutex); } + return orig_str; +} + +char * +scm_i_string_writable_chars (SCM str) +{ + SCM buf; + size_t start; - return STRINGBUF_CHARS (buf) + start; + get_str_buf_start (&str, &buf, &start); + if (scm_i_is_narrow_string (str)) + return STRINGBUF_CHARS (buf) + start; + else + scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s", + scm_list_1 (str)); + return NULL; +} + +static scm_t_wchar * +scm_i_string_writable_wide_chars (SCM str) +{ + SCM buf; + size_t start; + + get_str_buf_start (&str, &buf, &start); + if (!scm_i_is_narrow_string (str)) + return STRINGBUF_WIDE_CHARS (buf) + start; + else + scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s", + scm_list_1 (str)); } void @@ -384,6 +541,34 @@ scm_i_string_stop_writing (void) scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); } +/* Return the Xth character is C. */ +scm_t_wchar +scm_i_string_ref (SCM str, size_t x) +{ + if (scm_i_is_narrow_string (str)) + return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]); + else + return scm_i_string_wide_chars (str)[x]; +} + +void +scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr) +{ + if (chr > 0xFF && scm_i_is_narrow_string (str)) + widen_stringbuf (STRING_STRINGBUF (str)); + + if (scm_i_is_narrow_string (str)) + { + char *dst = scm_i_string_writable_chars (str); + dst[p] = (char) (unsigned char) chr; + } + else + { + scm_t_wchar *dst = scm_i_string_writable_wide_chars (str); + dst[p] = chr; + } +} + /* Symbols. Basic symbol creation and accessing is done here, the rest is in @@ -418,10 +603,21 @@ scm_i_make_symbol (SCM name, scm_t_bits flags, else { /* make new buf. */ - SCM new_buf = make_stringbuf (length); - memcpy (STRINGBUF_CHARS (new_buf), - STRINGBUF_CHARS (buf) + start, length); - buf = new_buf; + if (scm_i_is_narrow_string (name)) + { + SCM new_buf = make_stringbuf (length); + memcpy (STRINGBUF_CHARS (new_buf), + STRINGBUF_CHARS (buf) + start, length); + buf = new_buf; + } + else + { + SCM new_buf = make_wide_stringbuf (length); + u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), + (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start, + length); + buf = new_buf; + } } return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf), (scm_t_bits) hash, SCM_UNPACK (props)); @@ -466,11 +662,39 @@ scm_c_symbol_length (SCM sym) } #undef FUNC_NAME +int +scm_i_is_narrow_symbol (SCM sym) +{ + SCM buf; + + buf = SYMBOL_STRINGBUF (sym); + return !STRINGBUF_WIDE (buf); +} + const char * scm_i_symbol_chars (SCM sym) { - SCM buf = SYMBOL_STRINGBUF (sym); - return STRINGBUF_CHARS (buf); + SCM buf; + + buf = SYMBOL_STRINGBUF (sym); + if (!STRINGBUF_WIDE (buf)) + return STRINGBUF_CHARS (buf); + else + scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S", + scm_list_1 (sym)); +} + +const scm_t_wchar * +scm_i_symbol_wide_chars (SCM sym) +{ + SCM buf; + + buf = SYMBOL_STRINGBUF (sym); + if (STRINGBUF_WIDE (buf)) + return STRINGBUF_WIDE_CHARS (buf); + else + scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S", + scm_list_1 (sym)); } SCM @@ -496,6 +720,15 @@ scm_i_symbol_substring (SCM sym, size_t start, size_t end) (scm_t_bits)start, (scm_t_bits) end - start); } +scm_t_wchar +scm_i_symbol_ref (SCM sym, size_t x) +{ + if (scm_i_is_narrow_symbol (sym)) + return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]); + else + return scm_i_symbol_wide_chars (sym)[x]; +} + /* Debugging */ @@ -505,15 +738,17 @@ SCM scm_sys_string_dump (SCM); SCM scm_sys_symbol_dump (SCM); SCM scm_sys_stringbuf_hist (void); -SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, - (SCM str), - "") +SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), "") #define FUNC_NAME s_scm_sys_string_dump { SCM_VALIDATE_STRING (1, str); fprintf (stderr, "%p:\n", str); fprintf (stderr, " start: %u\n", STRING_START (str)); fprintf (stderr, " len: %u\n", STRING_LENGTH (str)); + if (scm_i_is_narrow_string (str)) + fprintf (stderr, " format: narrow\n"); + else + fprintf (stderr, " format: wide\n"); if (IS_SH_STRING (str)) { fprintf (stderr, " string: %p\n", SH_STRING_STRING (str)); @@ -524,36 +759,54 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, { SCM buf = STRING_STRINGBUF (str); fprintf (stderr, " buf: %p\n", buf); - fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf)); + if (scm_i_is_narrow_string (str)) + fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf)); + else + fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf)); fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf)); - fprintf (stderr, " flags: %x\n", (SCM_CELL_WORD_0 (buf) & 0x300)); + if (STRINGBUF_SHARED (buf)) + fprintf (stderr, " shared: true\n"); + else + fprintf (stderr, " shared: false\n"); + if (STRINGBUF_INLINE (buf)) + fprintf (stderr, " inline: true\n"); + else + fprintf (stderr, " inline: false\n"); + } return SCM_UNSPECIFIED; } #undef FUNC_NAME -SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, - (SCM sym), - "") +SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), "") #define FUNC_NAME s_scm_sys_symbol_dump { SCM_VALIDATE_SYMBOL (1, sym); fprintf (stderr, "%p:\n", sym); fprintf (stderr, " hash: %lu\n", scm_i_symbol_hash (sym)); + if (scm_i_is_narrow_symbol (sym)) + fprintf (stderr, " format: narrow\n"); + else + fprintf (stderr, " format: wide\n"); { SCM buf = SYMBOL_STRINGBUF (sym); fprintf (stderr, " buf: %p\n", buf); - fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf)); + if (scm_i_is_narrow_symbol (sym)) + fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf)); + else + fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf)); fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf)); - fprintf (stderr, " shared: %u\n", STRINGBUF_SHARED (buf)); + if (STRINGBUF_SHARED (buf)) + fprintf (stderr, " shared: true\n"); + else + fprintf (stderr, " shared: false\n"); + } return SCM_UNSPECIFIED; } #undef FUNC_NAME -SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, - (void), - "") +SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "") #define FUNC_NAME s_scm_sys_stringbuf_hist { int i; @@ -589,29 +842,46 @@ SCM_DEFINE (scm_string, "string", 0, 0, 1, #define FUNC_NAME s_scm_string { SCM result; + SCM rest; size_t len; - char *data; - - { - long i = scm_ilength (chrs); + size_t p = 0; + long i; - SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME); - len = i; - } + /* Verify that this is a list of chars. */ + i = scm_ilength (chrs); + len = (size_t) i; + rest = chrs; - result = scm_i_make_string (len, &data); - while (len > 0 && scm_is_pair (chrs)) + SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME); + while (len > 0 && scm_is_pair (rest)) { - SCM elt = SCM_CAR (chrs); - + SCM elt = SCM_CAR (rest); SCM_VALIDATE_CHAR (SCM_ARGn, elt); - *data++ = SCM_CHAR (elt); - chrs = SCM_CDR (chrs); + rest = SCM_CDR (rest); + len--; + scm_remember_upto_here_1 (elt); + } + + /* Construct a string containing this list of chars. */ + len = (size_t) i; + rest = chrs; + + result = scm_i_make_string (len, NULL); + result = scm_i_string_start_writing (result); + while (len > 0 && scm_is_pair (rest)) + { + SCM elt = SCM_CAR (rest); + scm_i_string_set_x (result, p, SCM_CHAR (elt)); + p++; + rest = SCM_CDR (rest); len--; + scm_remember_upto_here_1 (elt); } + scm_i_string_stop_writing (); + if (len > 0) scm_misc_error (NULL, "list changed while constructing string", SCM_EOL); - if (!scm_is_null (chrs)) + if (!scm_is_null (rest)) scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list"); return result; @@ -634,13 +904,16 @@ SCM scm_c_make_string (size_t len, SCM chr) #define FUNC_NAME NULL { - char *dst; - SCM res = scm_i_make_string (len, &dst); + size_t p; + SCM res = scm_i_make_string (len, NULL); if (!SCM_UNBNDP (chr)) { SCM_VALIDATE_CHAR (0, chr); - memset (dst, SCM_CHAR (chr), len); + res = scm_i_string_start_writing (res); + for (p = 0; p < len; p++) + scm_i_string_set_x (res, p, SCM_CHAR (chr)); + scm_i_string_stop_writing (); } return res; @@ -657,6 +930,20 @@ SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0, } #undef FUNC_NAME +SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0, + (SCM string), + "Return the bytes used to represent a character in @var{string}." + "This will return 1 or 4.") +#define FUNC_NAME s_scm_string_width +{ + SCM_VALIDATE_STRING (1, string); + if (!scm_i_is_narrow_string (string)) + return scm_from_int (4); + + return scm_from_int (1); +} +#undef FUNC_NAME + size_t scm_c_string_length (SCM string) { @@ -667,8 +954,8 @@ scm_c_string_length (SCM string) SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0, (SCM str, SCM k), - "Return character @var{k} of @var{str} using zero-origin\n" - "indexing. @var{k} must be a valid index of @var{str}.") + "Return character @var{k} of @var{str} using zero-origin\n" + "indexing. @var{k} must be a valid index of @var{str}.") #define FUNC_NAME s_scm_string_ref { size_t len; @@ -682,7 +969,10 @@ SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0, else scm_out_of_range (NULL, k); - return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]); + if (scm_i_is_narrow_string (str)) + return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]); + else + return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]); } #undef FUNC_NAME @@ -691,14 +981,18 @@ scm_c_string_ref (SCM str, size_t p) { if (p >= scm_i_string_length (str)) scm_out_of_range (NULL, scm_from_size_t (p)); - return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]); + if (scm_i_is_narrow_string (str)) + return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]); + else + return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]); + } SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0, (SCM str, SCM k, SCM chr), - "Store @var{chr} in element @var{k} of @var{str} and return\n" - "an unspecified value. @var{k} must be a valid index of\n" - "@var{str}.") + "Store @var{chr} in element @var{k} of @var{str} and return\n" + "an unspecified value. @var{k} must be a valid index of\n" + "@var{str}.") #define FUNC_NAME s_scm_string_set_x { size_t len; @@ -713,11 +1007,10 @@ SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0, scm_out_of_range (NULL, k); SCM_VALIDATE_CHAR (3, chr); - { - char *dst = scm_i_string_writable_chars (str); - dst[idx] = SCM_CHAR (chr); - scm_i_string_stop_writing (); - } + str = scm_i_string_start_writing (str); + scm_i_string_set_x (str, idx, SCM_CHAR (chr)); + scm_i_string_stop_writing (); + return SCM_UNSPECIFIED; } #undef FUNC_NAME @@ -727,11 +1020,9 @@ scm_c_string_set_x (SCM str, size_t p, SCM chr) { if (p >= scm_i_string_length (str)) scm_out_of_range (NULL, scm_from_size_t (p)); - { - char *dst = scm_i_string_writable_chars (str); - dst[p] = SCM_CHAR (chr); - scm_i_string_stop_writing (); - } + str = scm_i_string_start_writing (str); + scm_i_string_set_x (str, p, SCM_CHAR (chr)); + scm_i_string_stop_writing (); } SCM_DEFINE (scm_substring, "substring", 2, 1, 0, @@ -832,31 +1123,55 @@ SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0, SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1, (SCM args), - "Return a newly allocated string whose characters form the\n" + "Return a newly allocated string whose characters form the\n" "concatenation of the given strings, @var{args}.") #define FUNC_NAME s_scm_string_append { SCM res; - size_t i = 0; + size_t len = 0; + int wide = 0; SCM l, s; char *data; + scm_t_wchar *wdata; + int i; SCM_VALIDATE_REST_ARGUMENT (args); - for (l = args; !scm_is_null (l); l = SCM_CDR (l)) + for (l = args; !scm_is_null (l); l = SCM_CDR (l)) { s = SCM_CAR (l); SCM_VALIDATE_STRING (SCM_ARGn, s); - i += scm_i_string_length (s); + len += scm_i_string_length (s); + if (!scm_i_is_narrow_string (s)) + wide = 1; } - res = scm_i_make_string (i, &data); - for (l = args; !scm_is_null (l); l = SCM_CDR (l)) + if (!wide) + res = scm_i_make_string (len, &data); + else + res = scm_i_make_wide_string (len, &wdata); + + for (l = args; !scm_is_null (l); l = SCM_CDR (l)) { size_t len; s = SCM_CAR (l); SCM_VALIDATE_STRING (SCM_ARGn, s); len = scm_i_string_length (s); - memcpy (data, scm_i_string_chars (s), len); - data += len; + if (!wide) + { + memcpy (data, scm_i_string_chars (s), len); + data += len; + } + else + { + if (scm_i_is_narrow_string (s)) + { + for (i = 0; i < scm_i_string_length (s); i++) + wdata[i] = (unsigned char) scm_i_string_chars (s)[i]; + } + else + u32_cpy ((scm_t_uint32 *) wdata, + (scm_t_uint32 *) scm_i_string_wide_chars (s), len); + wdata += len; + } scm_remember_upto_here_1 (s); } return res; @@ -875,8 +1190,11 @@ scm_from_locale_stringn (const char *str, size_t len) SCM res; char *dst; - if (len == (size_t)-1) + if (len == (size_t) -1) len = strlen (str); + if (len == 0) + return scm_nullstr; + res = scm_i_make_string (len, &dst); memcpy (dst, str, len); return res; @@ -885,6 +1203,9 @@ scm_from_locale_stringn (const char *str, size_t len) SCM scm_from_locale_string (const char *str) { + if (str == NULL) + return scm_nullstr; + return scm_from_locale_stringn (str, -1); } @@ -893,21 +1214,20 @@ scm_take_locale_stringn (char *str, size_t len) { SCM buf, res; - if (len == (size_t)-1) + if (len == (size_t) -1) len = strlen (str); else { /* Ensure STR is null terminated. A realloc for 1 extra byte should often be satisfied from the alignment padding after the block, with no actual data movement. */ - str = scm_realloc (str, len+1); + str = scm_realloc (str, len + 1); str[len] = '\0'; } buf = scm_i_take_stringbufn (str, len); res = scm_double_cell (STRING_TAG, - SCM_UNPACK (buf), - (scm_t_bits) 0, (scm_t_bits) len); + SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len); return res; } @@ -917,33 +1237,140 @@ scm_take_locale_string (char *str) return scm_take_locale_stringn (str, -1); } +static void +unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp) +{ + char *before, *after; + size_t i, j; + + before = *bufp; + after = *bufp; + i = 0; + j = 0; + while (i < *lenp) + { + if ((i <= *lenp - 6) + && before[i] == '\\' + && before[i + 1] == 'u' + && before[i + 2] == '0' && before[i + 3] == '0') + { + /* Convert \u00NN to \xNN */ + after[j] = '\\'; + after[j + 1] = 'x'; + after[j + 2] = tolower (before[i + 4]); + after[j + 3] = tolower (before[i + 5]); + i += 6; + j += 4; + } + else if ((i <= *lenp - 10) + && before[i] == '\\' + && before[i + 1] == 'U' + && before[i + 2] == '0' && before[i + 3] == '0') + { + /* Convert \U00NNNNNN to \UNNNNNN */ + after[j] = '\\'; + after[j + 1] = 'U'; + after[j + 2] = tolower (before[i + 4]); + after[j + 3] = tolower (before[i + 5]); + after[j + 4] = tolower (before[i + 6]); + after[j + 5] = tolower (before[i + 7]); + after[j + 6] = tolower (before[i + 8]); + after[j + 7] = tolower (before[i + 9]); + i += 10; + j += 8; + } + else + { + after[j] = before[i]; + i++; + j++; + } + } + *lenp = j; + after = scm_realloc (after, j); +} + char * -scm_to_locale_stringn (SCM str, size_t *lenp) +scm_to_locale_stringn (SCM str, size_t * lenp) { - char *res; - size_t len; + const char *enc; + + /* In the future, enc will hold the port's encoding. */ + enc = NULL; + + return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence); +} + +char * +scm_to_stringn (SCM str, size_t * lenp, const char *encoding, + enum iconv_ilseq_handler handler) +{ + static const char iso[11] = "ISO-8859-1"; + char *buf; + size_t ilen, len, i; if (!scm_is_string (str)) scm_wrong_type_arg_msg (NULL, 0, str, "string"); - len = scm_i_string_length (str); - res = scm_malloc (len + ((lenp==NULL)? 1 : 0)); - memcpy (res, scm_i_string_chars (str), len); + ilen = scm_i_string_length (str); + + if (ilen == 0) + { + buf = scm_malloc (1); + buf[0] = '\0'; + if (lenp) + *lenp = 0; + return buf; + } + if (lenp == NULL) + for (i = 0; i < ilen; i++) + if (scm_i_string_ref (str, i) == '\0') + scm_misc_error (NULL, + "string contains #\\nul character: ~S", + scm_list_1 (str)); + + if (scm_i_is_narrow_string (str)) { - res[len] = '\0'; - if (strlen (res) != len) - { - free (res); - scm_misc_error (NULL, - "string contains #\\nul character: ~S", - scm_list_1 (str)); - } + if (lenp) + { + buf = scm_malloc (ilen); + memcpy (buf, scm_i_string_chars (str), ilen); + *lenp = ilen; + return buf; + } + else + { + buf = scm_malloc (ilen + 1); + memcpy (buf, scm_i_string_chars (str), ilen); + buf[ilen] = '\0'; + return buf; + } } - else + + + buf = NULL; + len = 0; + buf = u32_conv_to_encoding (iso, + handler, + (scm_t_uint32 *) scm_i_string_wide_chars (str), + ilen, NULL, NULL, &len); + if (buf == NULL) + scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"", + scm_list_2 (scm_from_locale_string (iso), str)); + + if (handler == iconveh_escape_sequence) + unistring_escapes_to_guile_escapes (&buf, &len); + + if (lenp) *lenp = len; + else + { + buf = scm_realloc (buf, len + 1); + buf[len] = '\0'; + } scm_remember_upto_here_1 (str); - return res; + return buf; } char * @@ -956,18 +1383,21 @@ size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len) { size_t len; - + char *result = NULL; if (!scm_is_string (str)) scm_wrong_type_arg_msg (NULL, 0, str, "string"); - len = scm_i_string_length (str); - memcpy (buf, scm_i_string_chars (str), (len > max_len)? max_len : len); + result = scm_to_locale_stringn (str, &len); + + memcpy (buf, result, (len > max_len) ? max_len : len); + free (result); + scm_remember_upto_here_1 (str); return len; } /* converts C scm_array of strings to SCM scm_list of strings. */ /* If argc < 0, a null terminated scm_array is assumed. */ -SCM +SCM scm_makfromstrs (int argc, char **argv) { int i = argc; @@ -1081,6 +1511,7 @@ scm_i_deprecated_string_chars (SCM str) /* The following is still wrong, of course... */ + str = scm_i_string_start_writing (str); chars = scm_i_string_writable_chars (str); scm_i_string_stop_writing (); return chars; diff --git a/libguile/strings.h b/libguile/strings.h index 9e028d8..5c09d58 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -23,6 +23,7 @@ +#include #include "libguile/__scm.h" @@ -46,26 +47,37 @@ Internal, low level interface to the character arrays - - Use scm_i_string_chars to get a pointer to the byte array of a - string for reading. Use scm_i_string_length to get the number of - bytes in that array. The array is not null-terminated. + - Use scm_is_narrow_string to determine is the string is narrow or + wide. + + - Use scm_i_string_chars or scm_i_string_wide_chars to get a + pointer to the byte or scm_t_wchar array of a string for reading. + Use scm_i_string_length to get the number of characters in that + array. The array is not null-terminated. - The array is valid as long as the corresponding SCM object is protected but only until the next SCM_TICK. During such a 'safe point', strings might change their representation. - - Use scm_i_string_writable_chars to get the same pointer as with - scm_i_string_chars, but for reading and writing. This is a - potentially costly operation since it implements the - copy-on-write behavior. When done with the writing, call - scm_i_string_stop_writing. You must do this before the next - SCM_TICK. (This means, before calling almost any other scm_ - function and you can't allow throws, of course.) - - - New strings can be created with scm_i_make_string. This gives - access to a writable pointer that remains valid as long as nobody - else makes a copy-on-write substring of the string. Do not call - scm_i_string_stop_writing for this pointer. + - Use scm_i_string_start_writing to get a version of the string + ready for reading and writing. This is a potentially costly + operation since it implements the copy-on-write behavior. When + done with the writing, call scm_i_string_stop_writing. You must + do this before the next SCM_TICK. (This means, before calling + almost any other scm_ function and you can't allow throws, of + course.) + + - New strings can be created with scm_i_make_string or + scm_i_make_wide_string. This gives access to a writable pointer + that remains valid as long as nobody else makes a copy-on-write + substring of the string. Do not call scm_i_string_stop_writing + for this pointer. + + - Alternately, scm_i_string_ref and scm_i_string_set_x can be used + to read and write strings without worrying about whether the + string is narrow or wide. scm_i_string_set_x still needs to be + bracketed by scm_i_string_start_writing and + scm_i_string_stop_writing. Legacy interface @@ -74,13 +86,15 @@ - SCM_STRING_CHARS uses scm_i_string_writable_chars and immediately calls scm_i_stop_writing, hoping for the best. SCM_STRING_LENGTH is the same as scm_i_string_length. SCM_STRING_CHARS will throw - an error for for strings that are not null-terminated. + an error for for strings that are not null-terminated. There is + no wide version of this interface. */ SCM_API SCM scm_string_p (SCM x); SCM_API SCM scm_string (SCM chrs); SCM_API SCM scm_make_string (SCM k, SCM chr); SCM_API SCM scm_string_length (SCM str); +SCM_API SCM scm_string_width (SCM str); SCM_API SCM scm_string_ref (SCM str, SCM k); SCM_API SCM scm_string_set_x (SCM str, SCM k, SCM chr); SCM_API SCM scm_substring (SCM str, SCM start, SCM end); @@ -106,6 +120,9 @@ SCM_API SCM scm_take_locale_string (char *str); SCM_API SCM scm_take_locale_stringn (char *str, size_t len); SCM_API char *scm_to_locale_string (SCM str); SCM_API char *scm_to_locale_stringn (SCM str, size_t *lenp); +SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp, + const char *encoding, + enum iconv_ilseq_handler handler); SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len); SCM_API SCM scm_makfromstrs (int argc, char **argv); @@ -113,15 +130,20 @@ SCM_API SCM scm_makfromstrs (int argc, char **argv); /* internal accessor functions. Arguments must be valid. */ SCM_INTERNAL SCM scm_i_make_string (size_t len, char **datap); +SCM_INTERNAL SCM scm_i_make_wide_string (size_t len, scm_t_wchar **datap); SCM_INTERNAL SCM scm_i_substring (SCM str, size_t start, size_t end); SCM_INTERNAL SCM scm_i_substring_read_only (SCM str, size_t start, size_t end); SCM_INTERNAL SCM scm_i_substring_shared (SCM str, size_t start, size_t end); SCM_INTERNAL SCM scm_i_substring_copy (SCM str, size_t start, size_t end); SCM_INTERNAL size_t scm_i_string_length (SCM str); SCM_API /* FIXME: not internal */ const char *scm_i_string_chars (SCM str); +SCM_API const scm_t_wchar *scm_i_string_wide_chars (SCM str); SCM_API /* FIXME: not internal */ char *scm_i_string_writable_chars (SCM str); +SCM_INTERNAL SCM scm_i_string_start_writing (SCM str); SCM_INTERNAL void scm_i_string_stop_writing (void); - +SCM_INTERNAL int scm_i_is_narrow_string (SCM str); +SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x); +SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr); /* internal functions related to symbols. */ SCM_INTERNAL SCM scm_i_make_symbol (SCM name, scm_t_bits flags, @@ -133,8 +155,11 @@ SCM_INTERNAL SCM scm_i_c_take_symbol (char *name, size_t len, scm_t_bits flags, unsigned long hash, SCM props); SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym); +SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym); SCM_INTERNAL size_t scm_i_symbol_length (SCM sym); +SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str); SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end); +SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x); /* internal GC functions. */ diff --git a/libguile/vm-engine.h b/libguile/vm-engine.h index c0f772f..240969c 100644 --- a/libguile/vm-engine.h +++ b/libguile/vm-engine.h @@ -336,6 +336,7 @@ do { \ #define FETCH() (*ip++) #define FETCH_LENGTH(len) do { len=*ip++; len<<=8; len+=*ip++; len<<=8; len+=*ip++; } while (0) +#define FETCH_WIDTH(width) do { width=*ip++; } while (0) #undef CLOCK #if VM_USE_CLOCK diff --git a/libguile/vm-i-loader.c b/libguile/vm-i-loader.c index 9ae49ed..8de7f00 100644 --- a/libguile/vm-i-loader.c +++ b/libguile/vm-i-loader.c @@ -72,31 +72,82 @@ VM_DEFINE_LOADER (82, load_number, "load-number") VM_DEFINE_LOADER (83, load_string, "load-string") { size_t len; + int width; + SCM str; + FETCH_LENGTH (len); + FETCH_WIDTH (width); SYNC_REGISTER (); - PUSH (scm_from_locale_stringn ((char *)ip, len)); - /* Was: scm_makfromstr (ip, len, 0) */ - ip += len; + if (width == 1) + { + char *buf; + str = scm_i_make_string (len, &buf); + memcpy (buf, (char *) ip, len); + } + else if (width == 4) + { + scm_t_wchar *wbuf; + str = scm_i_make_wide_string (len, &wbuf); + memcpy ((char *) wbuf, (char *) ip, len * width); + } + else + SCM_MISC_ERROR ("load-string: invalid character width", SCM_EOL); + PUSH (str); + ip += len * width; NEXT; } VM_DEFINE_LOADER (84, load_symbol, "load-symbol") { size_t len; + int width; + SCM str; FETCH_LENGTH (len); + FETCH_WIDTH (width); SYNC_REGISTER (); - PUSH (scm_from_locale_symboln ((char *)ip, len)); - ip += len; + if (width == 1) + { + char *buf; + str = scm_i_make_string (len, &buf); + memcpy (buf, (char *) ip, len); + } + else if (width == 4) + { + scm_t_wchar *wbuf; + str = scm_i_make_wide_string (len, &wbuf); + memcpy ((char *) wbuf, (char *) ip, len * width); + } + else + SCM_MISC_ERROR ("load-symbol: invalid character width", SCM_EOL); + PUSH (scm_string_to_symbol (str)); + ip += len * width; NEXT; } VM_DEFINE_LOADER (85, load_keyword, "load-keyword") { size_t len; + int width; + SCM str; FETCH_LENGTH (len); + FETCH_WIDTH (width); SYNC_REGISTER (); - PUSH (scm_from_locale_keywordn ((char *)ip, len)); - ip += len; + if (width == 1) + { + char *buf; + str = scm_i_make_string (len, &buf); + memcpy (buf, (char *) ip, len); + } + else if (width == 4) + { + scm_t_wchar *wbuf; + str = scm_i_make_wide_string (len, &wbuf); + memcpy ((char *) wbuf, (char *) ip, len * width); + } + else + SCM_MISC_ERROR ("load-keyword: invalid character width", SCM_EOL); + PUSH (scm_symbol_to_keyword (scm_string_to_symbol (str))); + ip += len * width; NEXT; } @@ -132,13 +183,29 @@ VM_DEFINE_INSTRUCTION (87, link_now, "link-now", 0, 1, 1) VM_DEFINE_LOADER (88, define, "define") { - SCM sym; + SCM str, sym; size_t len; + int width; FETCH_LENGTH (len); + FETCH_WIDTH (width); SYNC_REGISTER (); - sym = scm_from_locale_symboln ((char *)ip, len); - ip += len; + if (width == 1) + { + char *buf; + str = scm_i_make_string (len, &buf); + memcpy (buf, (char *) ip, len); + } + else if (width == 4) + { + scm_t_wchar *wbuf; + str = scm_i_make_wide_string (len, &wbuf); + memcpy ((char *) wbuf, (char *) ip, len * width); + } + else + SCM_MISC_ERROR ("load define: invalid character width", SCM_EOL); + sym = scm_string_to_symbol (str); + ip += len * width; SYNC_REGISTER (); PUSH (scm_sym2var (sym, scm_current_module_lookup_closure (), SCM_BOOL_T)); diff --git a/module/language/assembly.scm b/module/language/assembly.scm index 3a1da4f..5571bee 100644 --- a/module/language/assembly.scm +++ b/module/language/assembly.scm @@ -34,6 +34,10 @@ ;; lengths are encoded in 3 bytes (define *len-len* 3) +;; the number of bytes per string character is encoded in 1 byte +(define *width-len* 1) + + (define (byte-length assembly) (pmatch assembly (,label (guard (not (pair? label))) @@ -45,15 +49,15 @@ ((load-number ,str) (+ 1 *len-len* (string-length str))) ((load-string ,str) - (+ 1 *len-len* (string-length str))) + (+ 1 *len-len* *width-len* (* (string-width str) (string-length str)))) ((load-symbol ,str) - (+ 1 *len-len* (string-length str))) + (+ 1 *len-len* *width-len* (* (string-width str) (string-length str)))) ((load-keyword ,str) - (+ 1 *len-len* (string-length str))) + (+ 1 *len-len* *width-len* (* (string-width str) (string-length str)))) ((load-array ,bv) (+ 1 *len-len* (bytevector-length bv))) ((define ,str) - (+ 1 *len-len* (string-length str))) + (+ 1 *len-len* *width-len* (* (string-width str) (string-length str)))) ((load-program ,nargs ,nrest ,nlocs ,labels ,len ,meta . ,code) (+ 1 *program-header-len* len (if meta (1- (byte-length meta)) 0))) ((,inst . _) (guard (>= (instruction-length inst) 0)) diff --git a/module/language/assembly/compile-bytecode.scm b/module/language/assembly/compile-bytecode.scm index bed0fb2..840c73b 100644 --- a/module/language/assembly/compile-bytecode.scm +++ b/module/language/assembly/compile-bytecode.scm @@ -65,6 +65,12 @@ (write-byte (logand (ash x -8) 255)) (write-byte (logand (ash x -16) 255)) (write-byte (logand (ash x -24) 255))) + (define (write-uint32 x) (case byte-order + ((1234) (write-uint32-le x)) + ((4321) (write-uint32-be x)) + (else (error "unknown endianness" byte-order)))) + (define (write-wide-string s) + (string-for-each (lambda (c) (write-uint32 (char->integer c))) s)) (define (write-loader-len len) (write-byte (ash len -16)) (write-byte (logand (ash len -8) 255)) @@ -72,6 +78,14 @@ (define (write-loader str) (write-loader-len (string-length str)) (write-string str)) + (define (write-sized-loader str) + (let ((len (string-length str)) + (wid (string-width str))) + (write-loader-len len) + (write-byte wid) + (if (= wid 4) + (write-wide-string str) + (write-string str)))) (define (write-bytevector bv) (write-loader-len (bytevector-length bv)) ;; Ew! @@ -89,10 +103,6 @@ (write-uint16 (case byte-order ((1234) write-uint16-le) ((4321) write-uint16-be) - (else (error "unknown endianness" byte-order)))) - (write-uint32 (case byte-order - ((1234) write-uint32-le) - ((4321) write-uint32-be) (else (error "unknown endianness" byte-order))))) (let ((opcode (instruction->opcode inst)) (len (instruction-length inst))) @@ -126,11 +136,11 @@ ((load-unsigned-integer ,str) (write-loader str)) ((load-integer ,str) (write-loader str)) ((load-number ,str) (write-loader str)) - ((load-string ,str) (write-loader str)) - ((load-symbol ,str) (write-loader str)) - ((load-keyword ,str) (write-loader str)) + ((load-string ,str) (write-sized-loader str)) + ((load-symbol ,str) (write-sized-loader str)) + ((load-keyword ,str) (write-sized-loader str)) ((load-array ,bv) (write-bytevector bv)) - ((define ,str) (write-loader str)) + ((define ,str) (write-sized-loader str)) ((br ,l) (write-break l)) ((br-if ,l) (write-break l)) ((br-if-not ,l) (write-break l)) -- 1.6.0.6