[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".
From: |
Gavin D. Smith |
Subject: |
branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting". |
Date: |
Fri, 10 Nov 2023 11:50:09 -0500 |
This is an automated email from the git hooks/post-receive script.
gavin pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new 56028a44ba Revert change on 2023-11-09 "Locale-independent XS
paragraph formatting".
56028a44ba is described below
commit 56028a44baffb7d4b7626fc557b974d66a0c5d98
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Fri Nov 10 16:50:00 2023 +0000
Revert change on 2023-11-09 "Locale-independent XS paragraph
formatting".
This gives inconsistent test results between XS and pure Perl.
Reverting until this can be investigated.
---
ChangeLog | 8 +++
tp/Texinfo/XS/Makefile.am | 2 +-
tp/Texinfo/XS/xspara.c | 158 +++++++++++++++++++++++++++++++++++++++-------
3 files changed, 145 insertions(+), 23 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 2b22db2409..a5a51cc1bf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2023-11-10 Gavin Smith <gavinsmith0123@gmail.com>
+
+ Revert change on 2023-11-09 "Locale-independent XS paragraph
+ formatting".
+
+ This gives inconsistent test results between XS and pure Perl.
+ Reverting until this can be investigated.
+
2023-11-09 Patrice Dumas <pertusus@free.fr>
* tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units),
diff --git a/tp/Texinfo/XS/Makefile.am b/tp/Texinfo/XS/Makefile.am
index 6fc24a0b3a..6b63bd01f0 100644
--- a/tp/Texinfo/XS/Makefile.am
+++ b/tp/Texinfo/XS/Makefile.am
@@ -97,7 +97,7 @@ XSParagraph_la_SOURCES = XSParagraph.c xspara.c xspara.h \
XSParagraph_la_CFLAGS = $(XSLIBS_CFLAGS)
XSParagraph_la_CPPFLAGS = $(AM_CPPFLAGS) $(GNULIB_CPPFLAGS) $(XSLIBS_CPPFLAGS)
XSParagraph_la_LIBADD = $(builddir)/gnulib/lib/libgnu.la
-XSParagraph_la_LDFLAGS = $(AM_LDFLAGS) $(XSLIBS_LDFLAGS) $(LTLIBINTL) $(LTLIBICONV) $(LTLIBUNISTRING) $(LTLIBC32CONV)
+XSParagraph_la_LDFLAGS = $(AM_LDFLAGS) $(XSLIBS_LDFLAGS) $(LTLIBINTL) $(LTLIBICONV) $(LTLIBUNISTRING)
EXTRA_DIST += XSParagraph.xs MiscXS.xs
diff --git a/tp/Texinfo/XS/xspara.c b/tp/Texinfo/XS/xspara.c
index a80f05c683..60e3eba02a 100644
--- a/tp/Texinfo/XS/xspara.c
+++ b/tp/Texinfo/XS/xspara.c
@@ -29,9 +29,8 @@
perl.h includes ctype.h. */
#include <ctype.h>
#endif
-
-#include <unistr.h>
-#include <uchar.h>
+#include <wchar.h>
+#include <wctype.h>
/* See "How do I use all this in extensions" in 'man perlguts'. */
#define PERL_NO_GET_CONTEXT
@@ -85,7 +84,7 @@ typedef struct {
int end_line_count; /* Number of newlines so far in an output unit, i.e.
with add_text or add_next. */
- char32_t last_letter; /* Last letter in word, used to decide if we're
+ wint_t last_letter; /* Last letter in word, used to decide if we're
at the end of a sentence. */
/* Options set with set_space_protection. */
@@ -270,7 +269,101 @@ xspara__print_escaped_spaces (char *string, size_t len)
int
xspara_init (int unused, char *unused2)
{
- return 1;
+ char *utf8_locale = 0;
+ int len;
+ char *cur;
+ char *dot;
+
+ dTHX;
+
+#if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
+ /* needed due to thread-safe locale handling in newer perls */
+ switch_to_global_locale();
+#endif
+
+ if (setlocale (LC_CTYPE, "en_US.UTF-8")
+ || setlocale (LC_CTYPE, "en_US.utf8"))
+ goto success;
+
+ cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
+ if (!cur)
+ goto failure;
+ len = strlen (cur);
+ if ((len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6))
+ || (len >= 5 && !memcmp (".utf8", cur + len - 5, 5))
+ || (len >= 6 && !memcmp (".utf-8", cur + len - 6, 6))
+ || (len >= 5 && !memcmp (".UTF8", cur + len - 5, 5)))
+ {
+ setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
+ goto success;
+ }
+
+ /* Otherwise try altering the current locale name. */
+ dot = strchr (cur, '.');
+ if (!dot)
+ dot = cur + len;
+ utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
+ memcpy (utf8_locale, cur, dot - cur);
+ dot = utf8_locale + (dot - cur);
+ memcpy (dot, ".UTF-8", 7);
+ if (setlocale (LC_CTYPE, utf8_locale))
+ goto success;
+
+ memcpy (dot, ".utf8", 6);
+ if (setlocale (LC_CTYPE, utf8_locale))
+ goto success;
+
+ /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
+ {
+ FILE *p;
+ char *line = 0;
+ size_t n = 0;
+ ssize_t ret;
+ p = popen ("locale -a", "r");
+ if (!p)
+ goto failure;
+ while (1)
+ {
+ ret = getline (&line, &n, p);
+ if (ret == (ssize_t) -1)
+ {
+ free (line);
+ pclose (p);
+ goto failure;
+ }
+ if (strstr (line, "UTF-8") || strstr (line, "utf8"))
+ {
+ line[ret - 1] = '\0'; /* Remove trailing newline. */
+ if (setlocale (LC_CTYPE, line))
+ {
+ free (line);
+ pclose (p);
+ goto success;
+ }
+ }
+ }
+ }
+
+ if (1)
+ {
+failure:
+ return 0; /* failure */
+ }
+ else
+ {
+success: ;
+ free (utf8_locale);
+#if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
+ /* needed due to thread-safe locale handling in newer perls */
+ sync_locale();
+#endif
+ /*
+ fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
+ fprintf (stderr, "character encoding is: %s\n",
+ nl_langinfo (CODESET));
+ */
+ return 1; /* success */
+ }
}
/* Array for storing paragraph states which aren't in use. */
@@ -330,7 +423,7 @@ xspara_new (HV *conf)
state.max = 72;
state.indent_length_next = -1; /* Special value meaning undefined. */
state.end_sentence = eos_undef;
- state.last_letter = U'\0';
+ state.last_letter = L'\0';
if (conf)
xspara_init_state (conf);
@@ -448,7 +541,7 @@ xspara__end_line (void)
state.lines_counter++;
state.end_line_count++;
/* could be set to other values, anything that is not upper case. */
- state.last_letter = U'\n';
+ state.last_letter = L'\n';
}
char *
@@ -563,7 +656,7 @@ xspara_end (void)
fprintf (stderr, "PARA END\n");
/* probably not really useful, but cleaner */
- state.last_letter = U'\0';
+ state.last_letter = L'\0';
xspara__add_pending_word (&ret, state.add_final_space);
if (!state.no_final_newline && state.counter != 0)
@@ -630,10 +723,18 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
if (!strchr (end_sentence_characters
after_punctuation_characters, *p))
{
- char32_t wc;
- u8_mbtouc (&wc, p, len);
- state.last_letter = wc;
- break;
+ if (!PRINTABLE_ASCII(*p))
+ {
+ wchar_t wc = L'\0';
+ mbrtowc (&wc, p, len, NULL);
+ state.last_letter = wc;
+ break;
+ }
+ else
+ {
+ state.last_letter = btowc (*p);
+ break;
+ }
}
}
}
@@ -650,7 +751,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
/* Calculate length of multibyte string in characters. */
int len = 0;
int left = word_len;
- char32_t w;
+ wchar_t w;
char *p = word;
while (left > 0)
@@ -664,7 +765,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
continue;
}
- char_len = u8_mbtouc (&w, p, left);
+ char_len = mbrtowc (&w, p, left, NULL);
if (char_len == (size_t) -2) {
/* unfinished multibyte character */
char_len = left;
@@ -678,7 +779,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
}
left -= char_len;
- columns = c32width (w);
+ columns = wcwidth (w);
if (columns > 0)
len += columns;
@@ -732,7 +833,7 @@ xspara_add_end_sentence (int value)
void
xspara_allow_end_sentence (void)
{
- state.last_letter = U'a'; /* A lower-case letter. */
+ state.last_letter = L'a'; /* A lower-case letter. */
}
/* -1 in a parameter means leave that value as it is. */
@@ -781,12 +882,14 @@ enum text_class { type_NULL, type_spaces, type_regular,
type_double_width, type_EOS, type_finished,
type_unknown };
-/* Return string to be added to paragraph contents, wrapping text. */
+/* Return string to be added to paragraph contents, wrapping text. This
+ function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
+ work correctly. */
TEXT
xspara_add_text (char *text, int len)
{
char *p = text, *q = 0;
- char32_t wc, wc_fw;
+ wchar_t wc, wc_fw;
size_t next_len = 0;
int width;
static TEXT result;
@@ -843,7 +946,18 @@ xspara_add_text (char *text, int len)
}
else
{
- next_len = u8_mbtouc (&wc, q, len);
+ /* Set wc and next_len */
+ if (!PRINTABLE_ASCII(*q))
+ {
+ next_len = mbrtowc (&wc, q, len, NULL);
+ }
+ else
+ {
+ /* Functionally the same as mbrtowc but (tested) slightly
+ quicker. */
+ next_len = 1;
+ wc = btowc (*q);
+ }
if ((long) next_len == 0)
break; /* Null character. Shouldn't happen. */
@@ -856,7 +970,7 @@ xspara_add_text (char *text, int len)
/* Note: width == 0 includes accent characters which should not
properly increase the column count. This is not what the pure
Perl code does, though. */
- width = c32width (wc);
+ width = wcwidth (wc);
if (width == 1 || width == 0)
next_type = type_regular;
else if (width == 2)
@@ -977,7 +1091,7 @@ xspara_add_text (char *text, int len)
xspara__end_line ();
text_append (&result, "\n");
}
- state.last_letter = U' ';
+ state.last_letter = ' ';
}
/*************** Double width character. *********************/
@@ -1029,7 +1143,7 @@ xspara_add_text (char *text, int len)
if (strchr (end_sentence_characters, *q2) && !state.unfilled)
{
/* Doesn't count if preceded by an upper-case letter. */
- if (!c32isupper (state.last_letter))
+ if (!iswupper (state.last_letter))
{
if (state.french_spacing)
state.end_sentence = eos_present_frenchspacing;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".,
Gavin D. Smith <=