branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, conv

From:	Patrice Dumas
Subject:	branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units): cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in self->use_unicode_text.
Date:	Thu, 16 Nov 2023 11:50:31 -0500
This is an automated email from the git hooks/post-receive script.

pertusus pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new 30b3426ae0 * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, 
output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units): 
cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in 
self->use_unicode_text.
30b3426ae0 is described below

commit 30b3426ae02da61203466ea14519c539fdc3f7fb
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Thu Nov 16 17:49:49 2023 +0100

    * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output),
    tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units):
    cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8
    in self->use_unicode_text.
    
    * tp/Texinfo/Convert/HTML.pm (output): keep documentlanguage
    as in preamble until the beginning of conversion.
    
    * tp/Texinfo/XS/convert/convert_html.c (protect_text_unicode_text),
    tp/Texinfo/XS/main/unicode.h (OTXI_UNICODE_TEXT_CASES): add conversion
    of text to HTML with encoded characters.
---
 ChangeLog                            | 14 ++++++++++
 tp/Texinfo/Convert/HTML.pm           | 46 +++++++++++++++++++++-----------
 tp/Texinfo/XS/convert/ConvertXS.xs   |  7 +++++
 tp/Texinfo/XS/convert/convert_html.c | 31 +++++++++++++++++++---
 tp/Texinfo/XS/convert/converter.h    |  3 ++-
 tp/Texinfo/XS/main/converter_types.h |  2 ++
 tp/Texinfo/XS/main/unicode.h         | 51 ++++++++++++++++++++++++++++++++++++
 7 files changed, 134 insertions(+), 20 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e28379f95c..8f4274e469 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2023-11-16  Patrice Dumas  <pertusus@free.fr>
+
+       * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output),
+       tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units):
+       cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8
+       in self->use_unicode_text.
+
+       * tp/Texinfo/Convert/HTML.pm (output): keep documentlanguage
+       as in preamble until the beginning of conversion.
+
+       * tp/Texinfo/XS/convert/convert_html.c (protect_text_unicode_text),
+       tp/Texinfo/XS/main/unicode.h (OTXI_UNICODE_TEXT_CASES): add conversion
+       of text to HTML with encoded characters.
+
 2023-11-15  Gavin Smith <gavinsmith0123@gmail.com>
 
        * tp/Texinfo/XS/xspara.c (xspara_add_text, xspara__add_next):
diff --git a/tp/Texinfo/Convert/HTML.pm b/tp/Texinfo/Convert/HTML.pm
index e83eb06c5a..242b432602 100644
--- a/tp/Texinfo/Convert/HTML.pm
+++ b/tp/Texinfo/Convert/HTML.pm
@@ -6564,13 +6564,13 @@ sub _convert_text($$$)
   #$text = &{$self->formatting_function('format_protect_text')}($self, $text);
   $text = _default_format_protect_text($self, $text);
 
-  # API info: get_conf() API code conforming would be:
+  # API info: for efficiency, we cache the result of the calls to configuration
+  # in $self->{'use_unicode_text'}.
+  # API code conforming would be:
   #if ($self->get_conf('OUTPUT_CHARACTERS')
   #    and $self->get_conf('OUTPUT_ENCODING_NAME')
   #    and $self->get_conf('OUTPUT_ENCODING_NAME') eq 'utf-8') {
-  if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
-      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
-      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+  if ($self->{'use_unicode_text'}) {
     $text = Texinfo::Convert::Unicode::unicode_text($text,
                                         (in_code($self) or in_math($self)));
   # API info: in_code() API code conforming and
@@ -11122,6 +11122,13 @@ sub convert($$)
 
   $self->_sort_index_entries();
 
+  # cache, as it is checked for each text element
+  if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
+      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
+      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+    $self->{'use_unicode_text'} = 1;
+  }
+
   my ($output_units, $special_units, $associated_special_units)
     = $self->_prepare_conversion_units($root, undef);
 
@@ -11698,14 +11705,20 @@ sub output($$)
   $self->{'document_name'} = $document_name;
   $self->{'destination_directory'} = $destination_directory;
 
-  # set information, to have it available for the conversions below,
-  # in translate_names called by _prepare_conversion_units and in
-  # titles formatting.
+  # set information, to have it available for the conversions
+  # in translate_names
   # Some information is not available yet.
   $self->_reset_info();
 
   $self->_sort_index_entries();
 
+  # cache, as it is checked for each text element
+  if ($self->{'conf'}->{'OUTPUT_CHARACTERS'}
+      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'}
+      and $self->{'conf'}->{'OUTPUT_ENCODING_NAME'} eq 'utf-8') {
+    $self->{'use_unicode_text'} = 1;
+  }
+
   # Get the list of output units to be processed.
   my ($output_units, $special_units, $associated_special_units)
     = $self->_prepare_conversion_units($root, $document_name);
@@ -11719,8 +11732,8 @@ sub output($$)
                 $output_file, $destination_directory, $output_filename,
                 $document_name);
 
-  # set information, to have it ready for
-  # run_stage_handlers.  Some information is not available yet.
+  # set information, to have it ready for run_stage_handlers and for titles
+  # formatting.  Some information is not available yet.
   $self->_reset_info();
 
   my $structure_status = $self->run_stage_handlers($root, 'structure');
@@ -11808,11 +11821,6 @@ sub output($$)
        = &{$self->formatting_function('format_comment')}($self, 
$copying_comment);
     }
   }
-  $self->set_global_document_commands('before', ['documentlanguage']);
-
-  if ($default_document_language ne $preamble_document_language) {
-    $self->_translate_names();
-  }
 
   # documentdescription
   if (defined($self->get_conf('documentdescription'))) {
@@ -11832,7 +11840,8 @@ sub output($$)
   # Some information is not available yet.
   $self->_reset_info();
 
-
+  # TODO document that this stage handler is called with end of
+  # preamble documentlanguage.
   my $init_status = $self->run_stage_handlers($root, 'init');
   unless ($init_status < $handler_fatal_error_level
           and $init_status > -$handler_fatal_error_level) {
@@ -11840,10 +11849,15 @@ sub output($$)
     return undef;
   }
 
-
   $self->_prepare_title_titlepage($output_units, $output_file,
                                   $output_filename);
 
+  $self->set_global_document_commands('before', ['documentlanguage']);
+
+  if ($default_document_language ne $preamble_document_language) {
+    $self->_translate_names();
+  }
+
   # complete information should be available.
   $self->_reset_info();
 
diff --git a/tp/Texinfo/XS/convert/ConvertXS.xs 
b/tp/Texinfo/XS/convert/ConvertXS.xs
index aeec850a6f..ed8323f0e2 100644
--- a/tp/Texinfo/XS/convert/ConvertXS.xs
+++ b/tp/Texinfo/XS/convert/ConvertXS.xs
@@ -318,6 +318,13 @@ html_prepare_conversion_units (SV *converter_in, ...)
 
          self = set_output_converter_sv (converter_in,
                                          "html_prepare_conversion_units");
+
+         if (self->conf->OUTPUT_CHARACTERS > 0
+             && self->conf->OUTPUT_ENCODING_NAME
+             /* same name as tested in HTML.pm; strcasecmp maybe not needed */
+             && !strcasecmp (self->conf->OUTPUT_ENCODING_NAME, "utf-8"))
+           self->use_unicode_text = 1;
+
          html_prepare_conversion_units (self,
               &output_units_descriptor, &special_units_descriptor,
               &associated_special_units_descriptor);
diff --git a/tp/Texinfo/XS/convert/convert_html.c 
b/tp/Texinfo/XS/convert/convert_html.c
index 5d9383f5ca..52b30bb786 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -42,6 +42,8 @@
 #include "call_html_perl_function.h"
 /* for TREE_AND_STRINGS */
 #include "document.h"
+/* for OTXI_UNICODE_TEXT_CASES */
+#include "unicode.h"
 #include "convert_html.h"
 
 
@@ -472,8 +474,8 @@ in_math (CONVERTER *self)
 
 int
 in_preformatted_context (CONVERTER *self)
-{    
-  HTML_DOCUMENT_CONTEXT *top_document_ctx;               
+{
+  HTML_DOCUMENT_CONTEXT *top_document_ctx;
   top_document_ctx = html_top_document_context (self);
   return top_integer_stack (&top_document_ctx->preformatted_context);
 }
@@ -2283,7 +2285,7 @@ html_prepare_units_directions_files (CONVERTER *self,
   return files_source_info;
 }
 
-
+/* to be inlined in text parsing codes */
 #define OTXI_PROTECT_XML_FORM_FEED_CASES(var) \
         OTXI_PROTECT_XML_CASES(var) \
         case '\f':          \
@@ -2467,6 +2469,29 @@ protect_text_no_iso_entities (const char *text, TEXT 
*result)
 }
 #undef ADDN
 
+void
+protect_text_unicode_text (const char *text, TEXT *result)
+{
+  const char *p = text;  /* current read position in text */
+  /* append text to result, transforming the characters listed below */
+  while (*p)
+    {
+      int before_sep_nr = strcspn (p, "<>&\"\f" "-`'");
+      if (before_sep_nr)  /* copy the run without any listed character */
+        {
+          text_append_n (result, p, before_sep_nr);
+          p += before_sep_nr;
+        }
+      if (!*p)  /* the run reached the end of text */
+        break;
+      switch (*p)  /* the cases visible here append to result, advance p */
+        {
+        OTXI_PROTECT_XML_FORM_FEED_CASES(p)
+        OTXI_UNICODE_TEXT_CASES(p)
+        }
+    }
+}
+
 char *
 convert_table_term_type (CONVERTER *self, enum element_type type,
                         const ELEMENT *element, char *content)
diff --git a/tp/Texinfo/XS/convert/converter.h 
b/tp/Texinfo/XS/convert/converter.h
index fd878f6fa5..3a39d772f3 100644
--- a/tp/Texinfo/XS/convert/converter.h
+++ b/tp/Texinfo/XS/convert/converter.h
@@ -10,9 +10,10 @@
 /* for TARGET_FILENAME */
 #include "utils.h"
 
+/* piece of code that can be inlined in text parsing codes */
 #define OTXI_PROTECT_XML_CASES(var) \
         case '<':           \
-          text_append_n(result, "&lt;", 4); var++; \
+          text_append_n (result, "&lt;", 4); var++; \
           break;            \
         case '>':           \
           text_append_n (result, "&gt;", 4); var++; \
diff --git a/tp/Texinfo/XS/main/converter_types.h 
b/tp/Texinfo/XS/main/converter_types.h
index 49182a8f6f..8c37763734 100644
--- a/tp/Texinfo/XS/main/converter_types.h
+++ b/tp/Texinfo/XS/main/converter_types.h
@@ -451,6 +451,8 @@ typedef struct CONVERTER {
     HTML_DOCUMENT_CONTEXT_STACK html_document_context;
     STRING_STACK multiple_pass;
     char *current_filename;
+    /* state common with perl converter, not transmitted to perl */
+    int use_unicode_text;
 } CONVERTER;
 
 typedef struct TYPE_CONVERSION_FUNCTION {
diff --git a/tp/Texinfo/XS/main/unicode.h b/tp/Texinfo/XS/main/unicode.h
index df7121bfa5..f42e43a232 100644
--- a/tp/Texinfo/XS/main/unicode.h
+++ b/tp/Texinfo/XS/main/unicode.h
@@ -18,6 +18,57 @@ typedef struct COMMAND_UNICODE {
     int is_extra;
 } COMMAND_UNICODE;
 
+/* can be inlined in text parsing code; appends to result and advances var */
+#define OTXI_UNICODE_TEXT_CASES(var) \
+        case '-': \
+          if (*(var+1) && !memcmp (var, "---", 3)) \
+            { \
+              var += 3; \
+              /* Unicode em dash U+2014 (0xE2 0x80 0x94) */ \
+              text_append_n (result, "\xE2\x80\x94", 3); \
+            } \
+          else if (!memcmp (var, "--", 2)) \
+            { \
+              var += 2; \
+              /* Unicode en dash U+2013 (0xE2 0x80 0x93) */ \
+              text_append_n (result, "\xE2\x80\x93", 3); \
+            } \
+          else \
+            { \
+              /* lone hyphen: output it as is, then move past it */ \
+              text_append_n (result, "-", 1); var++; \
+            } \
+          break; \
+        case '`': \
+          if (!memcmp (var, "``", 2)) \
+            { \
+              var += 2; \
+              /* left double quotation mark U+201C (0xE2 0x80 0x9C) */ \
+              text_append_n (result, "\xE2\x80\x9C", 3); \
+            } \
+          else \
+            { \
+              var++; \
+              /* left single quotation mark U+2018 (0xE2 0x80 0x98) */ \
+              text_append_n (result, "\xE2\x80\x98", 3); \
+            } \
+          break; \
+        case '\'': \
+          if (!memcmp (var, "''", 2)) \
+            { \
+              var += 2; \
+              /* right double quotation mark U+201D (0xE2 0x80 0x9D) */ \
+              text_append_n (result, "\xE2\x80\x9D", 3); \
+            } \
+          else \
+            { \
+              var++; \
+              /* right single quotation mark U+2019 (0xE2 0x80 0x99) */ \
+              text_append_n (result, "\xE2\x80\x99", 3); \
+            } \
+          break;
+
+
 extern char *unicode_diacritics[];
 extern COMMAND_UNICODE unicode_character_brace_no_arg_commands[];
[Prev in Thread]
Current Thread
[Next in Thread]
branch master updated: * tp/Texinfo/Convert/HTML.pm (_convert_text, convert, output), tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units): cache OUTPUT_CHARACTERS and OUTPUT_ENCODING_NAME equal to utf8 in self->use_unicode_text., Patrice Dumas <=