[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, conv
From: |
Patrice Dumas |
Subject: |
branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units): cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in self->use_unicode_text. |
Date: |
Thu, 16 Nov 2023 11:50:31 -0500 |
This is an automated email from the git hooks/post-receive script.
pertusus pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new 30b3426ae0 * tp/Texinfo/Convert/HTML.pm (_convert_text, convert,
output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units):
cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in
self->use_unicode_text.
30b3426ae0 is described below
commit 30b3426ae02da61203466ea14519c539fdc3f7fb
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Thu Nov 16 17:49:49 2023 +0100
* tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output),
tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units):
cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8
in self->use_unicode_text.
* tp/Texinfo/Convert/HTML.pm (output): keep documentlanguage
as in preamble until the beginning of conversion.
* tp/Texinfo/XS/convert/convert_html.c (protect_text_unicode_text),
tp/Texinfo/XS/main/unicode.h (OTXI_UNICODE_TEXT_CASES): add conversion
of text to HTML with encoded characters.
---
ChangeLog | 14 ++++++++++
tp/Texinfo/Convert/HTML.pm | 46 +++++++++++++++++++++-----------
tp/Texinfo/XS/convert/ConvertXS.xs | 7 +++++
tp/Texinfo/XS/convert/convert_html.c | 31 +++++++++++++++++++---
tp/Texinfo/XS/convert/converter.h | 3 ++-
tp/Texinfo/XS/main/converter_types.h | 2 ++
tp/Texinfo/XS/main/unicode.h | 51 ++++++++++++++++++++++++++++++++++++
7 files changed, 134 insertions(+), 20 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index e28379f95c..8f4274e469 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2023-11-16 Patrice Dumas <pertusus@free.fr>
+
+ * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output),
+ tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units):
+ cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8
+ in self->use_unicode_text.
+
+ * tp/Texinfo/Convert/HTML.pm (output): keep documentlanguage
+ as in preamble until the beginning of conversion.
+
+ * tp/Texinfo/XS/convert/convert_html.c (protect_text_unicode_text),
+ tp/Texinfo/XS/main/unicode.h (OTXI_UNICODE_TEXT_CASES): add conversion
+ of text to HTML with encoded characters.
+
2023-11-15 Gavin Smith <gavinsmith0123@gmail.com>
* tp/Texinfo/XS/xspara.c (xspara_add_text, xspara__add_next):
diff --git a/tp/Texinfo/Convert/HTML.pm b/tp/Texinfo/Convert/HTML.pm
index e83eb06c5a..242b432602 100644
--- a/tp/Texinfo/Convert/HTML.pm
+++ b/tp/Texinfo/Convert/HTML.pm
@@ -6564,13 +6564,13 @@ sub _convert_text($$$)
#$text = &{$self->formatting_function('format_protect_text')}($self, $text);
$text = _default_format_protect_text($self, $text);
- # API info: get_conf() API code conforming would be:
+ # API info: for efficiency, we cache the result of the calls to configuration
+ # in $self->{'use_unicode_text'}.
+ # API code conforming would be:
#if ($self->get_conf('OUTPUT_CHARACTERS')
# and $self->get_conf('OUTPUT_ENCODING_NAME')
# and $self->get_conf('OUTPUT_ENCODING_NAME') eq 'utf-8') {
- if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
- and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
- and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+ if ($self->{'use_unicode_text'}) {
$text = Texinfo::Convert::Unicode::unicode_text($text,
(in_code($self) or in_math($self)));
# API info: in_code() API code conforming and
@@ -11122,6 +11122,13 @@ sub convert($$)
$self->_sort_index_entries();
+ # cache, as it is checked for each text element
+ if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
+ and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
+ and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+ $self->{'use_unicode_text'} = 1;
+ }
+
my ($output_units, $special_units, $associated_special_units)
= $self->_prepare_conversion_units($root, undef);
@@ -11698,14 +11705,20 @@ sub output($$)
$self->{'document_name'} = $document_name;
$self->{'destination_directory'} = $destination_directory;
- # set information, to have it available for the conversions below,
- # in translate_names called by _prepare_conversion_units and in
- # titles formatting.
+ # set information, to have it available for the conversions
+ # in translate_names
# Some information is not available yet.
$self->_reset_info();
$self->_sort_index_entries();
+ # cache, as it is checked for each text element
+ if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
+ and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
+ and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+ $self->{'use_unicode_text'} = 1;
+ }
+
# Get the list of output units to be processed.
my ($output_units, $special_units, $associated_special_units)
= $self->_prepare_conversion_units($root, $document_name);
@@ -11719,8 +11732,8 @@ sub output($$)
$output_file, $destination_directory, $output_filename,
$document_name);
- # set information, to have it ready for
- # run_stage_handlers. Some information is not available yet.
+ # set information, to have it ready for run_stage_handlers and for titles
+ # formatting. Some information is not available yet.
$self->_reset_info();
my $structure_status = $self->run_stage_handlers($root, 'structure');
@@ -11808,11 +11821,6 @@ sub output($$)
= &{$self->formatting_function('format_comment')}($self,
$copying_comment);
}
}
- $self->set_global_document_commands('before', ['documentlanguage']);
-
- if ($default_document_language ne $preamble_document_language) {
- $self->_translate_names();
- }
# documentdescription
if (defined($self->get_conf('documentdescription'))) {
@@ -11832,7 +11840,8 @@ sub output($$)
# Some information is not available yet.
$self->_reset_info();
-
+ # TODO document that this stage handler is called with end of
+ # preamble documentlanguage.
my $init_status = $self->run_stage_handlers($root, 'init');
unless ($init_status < $handler_fatal_error_level
and $init_status > -$handler_fatal_error_level) {
@@ -11840,10 +11849,15 @@ sub output($$)
return undef;
}
-
$self->_prepare_title_titlepage($output_units, $output_file,
$output_filename);
+ $self->set_global_document_commands('before', ['documentlanguage']);
+
+ if ($default_document_language ne $preamble_document_language) {
+ $self->_translate_names();
+ }
+
# complete information should be available.
$self->_reset_info();
diff --git a/tp/Texinfo/XS/convert/ConvertXS.xs b/tp/Texinfo/XS/convert/ConvertXS.xs
index aeec850a6f..ed8323f0e2 100644
--- a/tp/Texinfo/XS/convert/ConvertXS.xs
+++ b/tp/Texinfo/XS/convert/ConvertXS.xs
@@ -318,6 +318,13 @@ html_prepare_conversion_units (SV *converter_in, ...)
self = set_output_converter_sv (converter_in,
"html_prepare_conversion_units");
+
+ /* cache, as it is checked for each text element.  The encoding name
+ is compared to "utf-8", the same spelling the Perl converter tests
+ with eq 'utf-8'; with "utf8" the flag would never be set for that
+ value. */
+ if (self->conf->OUTPUT_CHARACTERS > 0
+ && self->conf->OUTPUT_ENCODING_NAME
+ /* not sure if strcasecmp is needed or not */
+ && !strcasecmp (self->conf->OUTPUT_ENCODING_NAME, "utf-8"))
+ self->use_unicode_text = 1;
+
html_prepare_conversion_units (self,
&output_units_descriptor, &special_units_descriptor,
&associated_special_units_descriptor);
diff --git a/tp/Texinfo/XS/convert/convert_html.c b/tp/Texinfo/XS/convert/convert_html.c
index 5d9383f5ca..52b30bb786 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -42,6 +42,8 @@
#include "call_html_perl_function.h"
/* for TREE_AND_STRINGS */
#include "document.h"
+/* for OTXI_UNICODE_TEXT_CASES */
+#include "unicode.h"
#include "convert_html.h"
@@ -472,8 +474,8 @@ in_math (CONVERTER *self)
int
in_preformatted_context (CONVERTER *self)
-{
- HTML_DOCUMENT_CONTEXT *top_document_ctx;
+{
+ HTML_DOCUMENT_CONTEXT *top_document_ctx;
top_document_ctx = html_top_document_context (self);
return top_integer_stack (&top_document_ctx->preformatted_context);
}
@@ -2283,7 +2285,7 @@ html_prepare_units_directions_files (CONVERTER *self,
return files_source_info;
}
-
+/* to be inlined in text parsing codes */
#define OTXI_PROTECT_XML_FORM_FEED_CASES(var) \
OTXI_PROTECT_XML_CASES(var) \
case '\f': \
@@ -2467,6 +2469,29 @@ protect_text_no_iso_entities (const char *text, TEXT *result)
}
#undef ADDN
+/* Copy TEXT into RESULT.  Runs of characters other than < > & " form feed
+ - ` ' are copied unchanged; each of those special characters is handed
+ to the switch cases supplied by OTXI_PROTECT_XML_FORM_FEED_CASES and
+ OTXI_UNICODE_TEXT_CASES.  The loop relies on every case advancing P
+ past the input it consumed, otherwise it would not terminate. */
+void
+protect_text_unicode_text (const char *text, TEXT *result)
+{
+ const char *p = text;
+
+ while (*p)
+ {
+ /* length of the leading run that holds none of the special characters */
+ int before_sep_nr = strcspn (p, "<>&\"\f" "-`'");
+ if (before_sep_nr)
+ {
+ text_append_n (result, p, before_sep_nr);
+ p += before_sep_nr;
+ }
+ /* the run reached the end of the string, nothing left to convert */
+ if (!*p)
+ break;
+ /* *p is one of the characters listed in the strcspn reject set */
+ switch (*p)
+ {
+ OTXI_PROTECT_XML_FORM_FEED_CASES(p)
+ OTXI_UNICODE_TEXT_CASES(p)
+ }
+ }
+}
+
char *
convert_table_term_type (CONVERTER *self, enum element_type type,
const ELEMENT *element, char *content)
diff --git a/tp/Texinfo/XS/convert/converter.h b/tp/Texinfo/XS/convert/converter.h
index fd878f6fa5..3a39d772f3 100644
--- a/tp/Texinfo/XS/convert/converter.h
+++ b/tp/Texinfo/XS/convert/converter.h
@@ -10,9 +10,10 @@
/* for TARGET_FILENAME */
#include "utils.h"
+/* piece of code that can be inlined in text parsing codes */
#define OTXI_PROTECT_XML_CASES(var) \
case '<': \
- text_append_n(result, "&lt;", 4); var++; \
+ text_append_n (result, "&lt;", 4); var++; \
break; \
case '>': \
text_append_n (result, "&gt;", 4); var++; \
diff --git a/tp/Texinfo/XS/main/converter_types.h b/tp/Texinfo/XS/main/converter_types.h
index 49182a8f6f..8c37763734 100644
--- a/tp/Texinfo/XS/main/converter_types.h
+++ b/tp/Texinfo/XS/main/converter_types.h
@@ -451,6 +451,8 @@ typedef struct CONVERTER {
HTML_DOCUMENT_CONTEXT_STACK html_document_context;
STRING_STACK multiple_pass;
char *current_filename;
+ /* state common with perl converter, not transmitted to perl */
+ int use_unicode_text;
} CONVERTER;
typedef struct TYPE_CONVERSION_FUNCTION {
diff --git a/tp/Texinfo/XS/main/unicode.h b/tp/Texinfo/XS/main/unicode.h
index df7121bfa5..f42e43a232 100644
--- a/tp/Texinfo/XS/main/unicode.h
+++ b/tp/Texinfo/XS/main/unicode.h
@@ -18,6 +18,57 @@ typedef struct COMMAND_UNICODE {
int is_extra;
} COMMAND_UNICODE;
+/* Switch cases that can be inlined in text parsing codes.  VAR points at
+ a '-', '`' or '\'' character in the text being read.  The sequences
+ ---, --, ``, '', ` and ' are replaced by the UTF-8 encoding of the
+ matching Unicode dash or quote, a lone - is copied as is.  The output
+ is appended to a variable named result that must be in scope at the
+ place of expansion, and VAR is advanced past the input consumed. */
+#define OTXI_UNICODE_TEXT_CASES(var) \
+ case '-': \
+ if (*(var+1) && !memcmp (var, "---", 3)) \
+ { \
+ var += 3; \
+ /* Unicode em dash U+2014 (0xE2 0x80 0x94) */ \
+ text_append_n (result, "\xE2\x80\x94", 3); \
+ } \
+ else if (!memcmp (var, "--", 2)) \
+ { \
+ var += 2; \
+ /* Unicode en dash U+2013 (0xE2 0x80 0x93) */ \
+ text_append_n (result, "\xE2\x80\x93", 3); \
+ } \
+ else \
+ { \
+ /* lone '-': copy it before advancing, appending after the \
+ increment would output the following byte instead */ \
+ text_append_n (result, var, 1); \
+ var++; \
+ } \
+ break; \
+ case '`': \
+ if (!memcmp (var, "``", 2)) \
+ { \
+ var += 2; \
+ /* U+201C E2 80 9C */ \
+ text_append_n (result, "\xE2\x80\x9C", 3); \
+ } \
+ else \
+ { \
+ var++; \
+ /* U+2018 E2 80 98 */ \
+ text_append_n (result, "\xE2\x80\x98", 3); \
+ } \
+ break; \
+ case '\'': \
+ if (!memcmp (var, "''", 2)) \
+ { \
+ var += 2; \
+ /* U+201D E2 80 9D */ \
+ text_append_n (result, "\xE2\x80\x9D", 3); \
+ } \
+ else \
+ { \
+ var++; \
+ /* U+2019 E2 80 99 */ \
+ text_append_n (result, "\xE2\x80\x99", 3); \
+ } \
+ break;
+
+
extern char *unicode_diacritics[];
extern COMMAND_UNICODE unicode_character_brace_no_arg_commands[];
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units): cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in self->use_unicode_text.,
Patrice Dumas <=