[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[no subject]
From: |
Patrice Dumas |
Date: |
Sat, 30 Dec 2023 04:08:34 -0500 (EST) |
branch: master
commit d4da0502ce2d3628e451cede2ad01a43c3646f0c
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Sat Dec 30 10:07:46 2023 +0100
* tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack): fix
level in stack at the end of the first phase of gathering codepoints.
Do not double free new_eight_bit.
* tp/Texinfo/XS/main/utils.c (word_bytes_len_multibyte): add a
function that counts the bytes in contiguous word characters in a
string, emulating \p{Word} or \w with unicode in perl.
* tp/maintain/setup_converters_code_tables.pl (unicode_diacritics),
tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
tp/Texinfo/XS/main/unicode.c (unicode_accent): add the diacritic
numeric representation in addition to the UTF-8 encoded string.
* tp/Texinfo/XS/convert/converter.c (convert_accents),
tp/Texinfo/XS/main/convert_to_text.c (ascii_accent),
tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack)
(format_unicode_accents_stack_internal): add convert_accents. Use
converter in accent formatting functions arguments passed to the
accents stacks conversion functions. Update callers.
* tp/Texinfo/XS/convert/converter.c (next_for_tieaccent)
(UNICODE_ACCENT_LETTER, xml_numeric_entity_accent): implement
xml_numeric_entity_accent.
* tp/Texinfo/XS/convert/convert_html.c
(html_accent_entities_html_accent_internal)
(html_accent_entities_html_accent)
(html_accent_entities_numeric_entities_accent)
(convert_accent_command, html_converter_initialize)
(html_free_converter),
tp/Texinfo/XS/convert/get_html_perl_info.c
(html_converter_initialize_sv), tp/Texinfo/XS/main/converter_types.h
(ACCENT_ENTITY_INFO, CONVERTER): add a list of accent commands,
converter->accent_formatted_cmd, collected while getting the
formatting references. Get accent_entities information from perl to
C. Implement convert_accent_command in C.
---
ChangeLog | 39 ++++
tp/Texinfo/XS/convert/convert_html.c | 147 +++++++++++++++
tp/Texinfo/XS/convert/converter.c | 276 +++++++++++++++++++++++++---
tp/Texinfo/XS/convert/converter.h | 8 +
tp/Texinfo/XS/convert/get_html_perl_info.c | 68 +++++++
tp/Texinfo/XS/main/convert_to_text.c | 21 ++-
tp/Texinfo/XS/main/converter_types.h | 7 +
tp/Texinfo/XS/main/unicode.c | 47 ++---
tp/Texinfo/XS/main/unicode.h | 14 +-
tp/Texinfo/XS/main/utils.c | 37 ++++
tp/Texinfo/XS/main/utils.h | 1 +
tp/maintain/setup_converters_code_tables.pl | 9 +-
12 files changed, 606 insertions(+), 68 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index aea265909d..23298c3ab2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,42 @@
+2023-12-30 Patrice Dumas <pertusus@free.fr>
+
+ * tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack): fix
+ level instack at the end of the first phase of gathering codepoints.
+ Do not double free new_eight_bit.
+
+ * tp/Texinfo/XS/main/utils.c (word_bytes_len_multibyte): add a
+ function that counts the bytes in contiguous word characters in a
+ string, emulating \p{Word} or \w with unicode in perl.
+
+ * tp/maintain/setup_converters_code_tables.pl (unicode_diacritics),
+ tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
+ tp/Texinfo/XS/main/unicode.c (unicode_accent): add the diacritic
+ numeric representation in addition to the UTF-8 encoded string.
+
+ * tp/Texinfo/XS/convert/converter.c (convert_accents),
+ tp/Texinfo/XS/main/convert_to_text.c (ascii_accent),
+ tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack)
+ (format_unicode_accents_stack_internal): add convert accents. Use
+ converter in accent formatting functions arguments passed to the
+ accents stacks conversion functions. Update callers.
+
+ * tp/Texinfo/XS/convert/converter.c (next_for_tieaccent)
+ (UNICODE_ACCENT_LETTER, xml_numeric_entity_accent): implement
+ xml_numeric_entity_accent.
+
+ * tp/Texinfo/XS/convert/convert_html.c
+ (html_accent_entities_html_accent_internal)
+ (html_accent_entities_html_accent)
+ (html_accent_entities_numeric_entities_accent)
+ (convert_accent_command, html_converter_initialize)
+ (html_free_converter, html_free_converter),
+ tp/Texinfo/XS/convert/get_html_perl_info.c
+ (html_converter_initialize_sv), tp/Texinfo/XS/main/converter_types.h
+ (ACCENT_ENTITY_INFO, CONVERTER): add a list of accent commands,
+ converter->accent_formatted_cmd, collected while getting the
+ formatting references. Get accent_entities information from perl to
+ C. Implement convert_accent_command in C.
+
2023-12-30 Patrice Dumas <pertusus@free.fr>
* tp/Texinfo/Convert/Converter.pm (convert_accents): reorganize code
diff --git a/tp/Texinfo/XS/convert/convert_html.c
b/tp/Texinfo/XS/convert/convert_html.c
index 99d2fabcb0..9be35468e6 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -9024,6 +9024,122 @@ convert_math_command (CONVERTER *self, const enum
command_id cmd,
free (attribute_class);
}
+char *
+html_accent_entities_html_accent_internal (CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case,
+ int use_numeric_entities)
+{
+ char *text_set;
+
+ if (set_case)
+ {
+ int str_len = strlen (text);
+ if (str_len != 1 || !isascii_alnum (*text))
+ {
+ int w_len = word_bytes_len_multibyte (text);
+ if (w_len != str_len)
+ set_case = 0;
+ }
+ }
+
+ if (set_case)
+ text_set = to_upper_or_lower_multibyte (text, set_case);
+ else
+ text_set = strdup (text);
+
+ /* do not return a dotless i or j as such if it is further composed
+ with an accented letter, return the letter as is */
+ if (element->cmd == CM_dotless
+ && (!strcmp (text_set, "i") || !strcmp (text_set, "j")))
+ {
+ if (element->parent && element->parent->parent
+ && element->parent->parent->cmd)
+ {
+ enum command_id p_cmd = element->parent->parent->cmd;
+ if (builtin_command_data[p_cmd].flags & CF_accent
+ && p_cmd != CM_tieaccent)
+ {
+ return text_set;
+ }
+ }
+ }
+
+ if (use_numeric_entities)
+ {
+ char *formatted_accent
+ = xml_numeric_entity_accent (element->cmd, text_set);
+ if (formatted_accent)
+ {
+ free (text_set);
+ return formatted_accent;
+ }
+ }
+ else
+ {
+ char *formatted_accent;
+ if (strlen (text_set) == 1 && isascii_alpha (*text_set)
+ && self->accent_entities[element->cmd].entity
+ && self->accent_entities[element->cmd].characters
+ && strlen (self->accent_entities[element->cmd].characters)
+ && strrchr (self->accent_entities[element->cmd].characters,
+ *text_set))
+ {
+ xasprintf (&formatted_accent, "&%s%s;", text_set,
+ self->accent_entities[element->cmd].entity);
+ free (text_set);
+ return formatted_accent;
+ }
+ formatted_accent = xml_numeric_entity_accent (element->cmd, text_set);
+ if (formatted_accent)
+ {
+ free (text_set);
+ return formatted_accent;
+ }
+ }
+ fprintf (stderr, "NNNNNNNNNNn %s %s\n", builtin_command_name (element->cmd),
text_set);
+ return text_set;
+}
+
+char *
+html_accent_entities_html_accent (CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case)
+{
+ return html_accent_entities_html_accent_internal (self, text,
+ element, set_case, 0);
+}
+
+char *
+html_accent_entities_numeric_entities_accent (CONVERTER *self,
+ const char *text, const ELEMENT *element, int set_case)
+{
+ return html_accent_entities_html_accent_internal (self, text,
+ element, set_case, 1);
+}
+
+void
+convert_accent_command (CONVERTER *self, const enum command_id cmd,
+ const ELEMENT *element,
+ const HTML_ARGS_FORMATTED *args_formatted,
+ const char *content, TEXT *result)
+{
+ char *(*format_accents)(CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case);
+
+ int output_encoded_characters = (self->conf->OUTPUT_CHARACTERS > 0);
+
+ if (self->conf->USE_NUMERIC_ENTITY > 0)
+ format_accents = &html_accent_entities_numeric_entities_accent;
+ else
+ format_accents = &html_accent_entities_html_accent;
+
+ char *accent_text = convert_accents (self, element, &html_convert_tree,
+ format_accents, output_encoded_characters,
+ html_in_upper_case (self));
+
+ text_append (result, accent_text);
+ free (accent_text);
+}
+
void
convert_indicateurl_command (CONVERTER *self, const enum command_id cmd,
const ELEMENT *element,
@@ -11461,6 +11577,26 @@ html_converter_initialize (CONVERTER *self)
}
}
+ /* accents commands implemented in C, but not css strings accents */
+ if (self->accent_formatted_cmd.number)
+ {
+ for (i = 0; i < self->accent_formatted_cmd.number; i++)
+ {
+ enum command_id cmd = self->accent_formatted_cmd.list[i];
+ COMMAND_CONVERSION_FUNCTION *command_conversion
+ = &self->command_conversion_function[cmd];
+ if (command_conversion->status == FRS_status_default_set)
+ {
+ command_conversion->formatting_reference = 0;
+ command_conversion->status = FRS_status_internal;
+ command_conversion->command_conversion
+ = &convert_accent_command;
+ }
+ }
+ }
+
+
+
/* all the commands in style_formatted_cmd are implemented in C.
It is not only the style commands, some others too. indicateurl
is not in style_formatted_cmd for now either */
@@ -11958,6 +12094,15 @@ html_free_converter (CONVERTER *self)
}
}
+ for (i = 0; i < self->accent_formatted_cmd.number; i++)
+ {
+ enum command_id cmd = self->accent_formatted_cmd.list[i];
+ ACCENT_ENTITY_INFO *accent_info
+ = &self->accent_entities[cmd];
+ free (accent_info->entity);
+ free (accent_info->characters);
+ }
+
for (i = 0; i < self->style_formatted_cmd.number; i++)
{
enum command_id cmd = self->style_formatted_cmd.list[i];
@@ -12023,6 +12168,8 @@ html_free_converter (CONVERTER *self)
free (self->no_arg_formatted_cmd.list);
+ free (self->accent_formatted_cmd.list);
+
free (self->style_formatted_cmd.list);
free (self->pending_closes.stack);
diff --git a/tp/Texinfo/XS/convert/converter.c
b/tp/Texinfo/XS/convert/converter.c
index a353beb1a2..cc5754b442 100644
--- a/tp/Texinfo/XS/convert/converter.c
+++ b/tp/Texinfo/XS/convert/converter.c
@@ -20,6 +20,10 @@
#include <string.h>
#include <stdio.h>
#include <stddef.h>
+#include <inttypes.h>
+#include <unistr.h>
+#include <uniconv.h>
+#include <unictype.h>
#include "command_ids.h"
#include "tree_types.h"
@@ -32,6 +36,7 @@
#include "convert_utils.h"
#include "translations.h"
#include "manipulate_tree.h"
+#include "unicode.h"
#include "converter.h"
static CONVERTER **converter_list;
@@ -302,6 +307,94 @@ node_information_filename (CONVERTER *self, char
*normalized,
return filename;
}
+ELEMENT *
+float_type_number (CONVERTER *self, const ELEMENT *float_e)
+{
+ ELEMENT *tree = 0;
+ ELEMENT *type_element = 0;
+ NAMED_STRING_ELEMENT_LIST *replaced_substrings
+ = new_named_string_element_list ();
+ char *float_type = lookup_extra_string (float_e, "float_type");
+ char *float_number = lookup_extra_string (float_e, "float_number");
+
+ if (float_type && strlen (float_type))
+ type_element = float_e->args.list[0];
+
+ if (float_number)
+ {
+ ELEMENT *e_number = new_element (ET_NONE);
+ text_append (&e_number->text, float_number);
+ add_element_to_named_string_element_list (replaced_substrings,
+ "float_number", e_number);
+ }
+
+ if (type_element)
+ {
+ ELEMENT *type_element_copy = copy_tree (type_element);
+ add_element_to_named_string_element_list (replaced_substrings,
+ "float_type", type_element_copy);
+ if (float_number)
+ tree = gdt_tree ("{float_type} {float_number}", self->document,
+ self->conf, replaced_substrings, 0, 0);
+ else
+ tree = gdt_tree ("{float_type}", self->document, self->conf,
+ replaced_substrings, 0, 0);
+ }
+ else if (float_number)
+ tree = gdt_tree ("{float_number}", self->document, self->conf,
+ replaced_substrings, 0, 0);
+
+ destroy_named_string_element_list (replaced_substrings);
+
+ return tree;
+}
+
+char *
+convert_accents (CONVERTER *self, const ELEMENT *accent,
+ char *(*convert_tree)(CONVERTER *self, const ELEMENT *tree, char
*explanation),
+ char *(*format_accent)(CONVERTER *self, const char *text, const ELEMENT
*element,
+ int set_case),
+ int output_encoded_characters,
+ int set_case)
+{
+ ACCENTS_STACK *accent_stack = find_innermost_accent_contents (accent);
+ const ELEMENT_STACK *stack;
+ char *arg_text;
+ char *result;
+ int i;
+
+ if (accent_stack->argument)
+ arg_text = (*convert_tree) (self, accent_stack->argument, 0);
+ else
+ arg_text = strdup ("");
+
+ if (output_encoded_characters)
+ {
+ char *encoded = encoded_accents (self, arg_text, &accent_stack->stack,
+ self->conf->OUTPUT_ENCODING_NAME,
format_accent,
+ set_case);
+ if (encoded)
+ {
+ free (arg_text);
+ destroy_accent_stack (accent_stack);
+ return encoded;
+ }
+ }
+
+ stack = &accent_stack->stack;
+ result = arg_text;
+ for (i = stack->top - 1; i >= 0; i--)
+ {
+ const ELEMENT *accent_command = stack->stack[i];
+ char *formatted_accent = (*format_accent) (self, result, accent_command,
+ set_case);
+ free (result);
+ result = formatted_accent;
+ }
+ destroy_accent_stack (accent_stack);
+ return result;
+}
+
ELEMENT_LIST *
comma_index_subentries_tree (const ELEMENT *current_entry,
char *separator)
@@ -652,44 +745,167 @@ xml_protect_text (const char *text, TEXT *result)
}
}
-ELEMENT *
-float_type_number (CONVERTER *self, const ELEMENT *float_e)
+static char *
+next_for_tieaccent (const char *text, const char **next)
{
- ELEMENT *tree = 0;
- ELEMENT *type_element = 0;
- NAMED_STRING_ELEMENT_LIST *replaced_substrings
- = new_named_string_element_list ();
- char *float_type = lookup_extra_string (float_e, "float_type");
- char *float_number = lookup_extra_string (float_e, "float_number");
+ const char *p;
+ if (!strlen (text))
+ {
+ return 0;
+ }
+ if (text[0] == '&')
+ {
+ if (strlen (text) > 3 && isascii_alnum(*(text+1)))
+ {
+ p = text +2;
+ while (*p)
+ {
+ if (*p == ';')
+ {
+ p++;
+ *next = p;
+ return strndup (text, p - text);
+ }
+ else if (isascii_alnum (*p))
+ {
+ p++;
+ }
+ else
+ break;
+ }
+ }
+ return 0;
+ }
+ else
+ {
+ uint8_t *encoded_u8 = u8_strconv_from_encoding (text, "UTF-8",
+ iconveh_question_mark);
+ ucs4_t first_char;
+ u8_next (&first_char, encoded_u8);
+ if (uc_is_general_category (first_char, UC_CATEGORY_L)
+ /* ASCII digits */
+ || (first_char >= 0x0030 && first_char <= 0x0039))
+ {
+ char *first_char_text;
+ uint8_t *first_char_u8 = malloc (7 * sizeof(uint8_t));
+ int first_char_len = u8_uctomb (first_char_u8, first_char, 6);
+ if (first_char_len < 0)
+ fatal ("u8_uctomb returns negative value");
+ first_char_u8[first_char_len] = 0;
+ first_char_text = u8_strconv_to_encoding (first_char_u8, "UTF-8",
+ iconveh_question_mark);
+ free (first_char_u8);
+ p = text + strlen (first_char_text);
+ *next = p;
+ return first_char_text;
+ }
+ return 0;
+ }
+}
- if (float_type && strlen (float_type))
- type_element = float_e->args.list[0];
+typedef struct UNICODE_ACCENT_LETTER {
+ enum command_id cmd;
+ char *letter;
+ char *numerical_entity;
+} UNICODE_ACCENT_LETTER;
- if (float_number)
+/* only those that are not obtained through diacritic + normalization */
+static UNICODE_ACCENT_LETTER unicode_accented_letters[] = {
+{CM_dotless, "i", "305"},
+{CM_dotless, "j", "567"},
+{0, 0, 0}
+};
+
+char *
+xml_numeric_entity_accent (enum command_id cmd, const char *text)
+{
+ char *result;
+ if (! builtin_command_data[cmd].flags & CF_accent)
{
- ELEMENT *e_number = new_element (ET_NONE);
- text_append (&e_number->text, float_number);
- add_element_to_named_string_element_list (replaced_substrings,
- "float_number", e_number);
+ return 0;
}
- if (type_element)
+ if (strlen (text) == 1 && isascii_alpha (*text))
{
- ELEMENT *type_element_copy = copy_tree (type_element);
- add_element_to_named_string_element_list (replaced_substrings,
- "float_type", type_element_copy);
- if (float_number)
- tree = gdt_tree ("{float_type} {float_number}", self->document,
- self->conf, replaced_substrings, 0, 0);
- else
- tree = gdt_tree ("{float_type}", self->document, self->conf,
- replaced_substrings, 0, 0);
+ int i;
+ for (i = 0; unicode_accented_letters[i].cmd; i++)
+ {
+ UNICODE_ACCENT_LETTER *letter = &unicode_accented_letters[i];
+ if (cmd == letter->cmd && ! strcmp (text, letter->letter))
+ {
+ xasprintf (&result, "&#%s;", letter->numerical_entity);
+ return result;
+ }
+ }
}
- else if (float_number)
- tree = gdt_tree ("{float_number}", self->document, self->conf,
- replaced_substrings, 0, 0);
- destroy_named_string_element_list (replaced_substrings);
+ if (unicode_diacritics[cmd].text)
+ {
+ if (cmd != CM_tieaccent)
+ {
+ if (strlen (text) == 1 && isascii_alpha (*text))
+ {
+ char *accented_char;
+ char *normalized_char;
+ uint8_t *encoded_u8;
+ ucs4_t first_char;
+ const uint8_t *next;
+
+ xasprintf (&accented_char, "%s%s", text,
+ unicode_diacritics[cmd].text);
+ normalized_char = normalize_NFC (accented_char);
+ encoded_u8 = u8_strconv_from_encoding (normalized_char, "UTF-8",
+ iconveh_question_mark);
+ next = u8_next (&first_char, encoded_u8);
+ if (next)
+ {
+ ucs4_t other_char;
+ const uint8_t *after = u8_next (&other_char, next);
+ next = after;
+ }
+ free (encoded_u8);
+ free (accented_char);
+ free (normalized_char);
+ if (!next)
+ {
+ char *entity;
+ /* hex entity
+ xasprintf (&entity, "&#%04lX;", first_char); */
+ /* seems to be the way for portable uint32_t unsigned integer format */
+ xasprintf (&entity, "&#%" PRIu32 ";", first_char);
+ return entity;
+ }
+ }
+ xasprintf (&result, "%s&#%s;", text,
unicode_diacritics[cmd].codepoint);
+ return result;
+ }
+ else
+ {
+ char *result;
+ const char *p = 0;
+ const char *remaining = 0;
+ char *first = next_for_tieaccent (text, &p);
+ char *second;
+ if (!first)
+ goto invalid;
+ second = next_for_tieaccent (p, &remaining);
+ if (second)
+ {
+ xasprintf (&result, "%s&#%s;%s%s", first,
+ unicode_diacritics[cmd].codepoint, second, remaining);
+ free (first);
+ free (second);
+ return result;
+ }
+ else
+ free (first);
- return tree;
+ invalid:
+ xasprintf (&result, "%s&#%s;", text,
+ unicode_diacritics[cmd].codepoint);
+ return result;
+ }
+ }
+ return 0;
}
+
diff --git a/tp/Texinfo/XS/convert/converter.h
b/tp/Texinfo/XS/convert/converter.h
index 0fb4de449d..364a1b052c 100644
--- a/tp/Texinfo/XS/convert/converter.h
+++ b/tp/Texinfo/XS/convert/converter.h
@@ -83,6 +83,13 @@ char *node_information_filename (CONVERTER *self, char
*normalized,
TARGET_FILENAME *normalized_sectioning_command_filename (CONVERTER *self,
const ELEMENT *command);
+char *convert_accents (CONVERTER *self, const ELEMENT *accent,
+ char *(*convert_tree)(CONVERTER *self, const ELEMENT *tree, char
*explanation),
+ char *(*format_accent)(CONVERTER *self, const char *text, const ELEMENT
*element,
+ int set_case),
+ int output_encoded_characters,
+ int set_case);
+
ELEMENT_LIST *comma_index_subentries_tree (const ELEMENT *current_entry,
char *separator);
void free_comma_index_subentries_tree (ELEMENT_LIST *element);
@@ -103,4 +110,5 @@ void free_generic_converter (CONVERTER *self);
void xml_format_text_with_numeric_entities (const char *text, TEXT *result);
+char *xml_numeric_entity_accent (enum command_id cmd, const char *text);
#endif
diff --git a/tp/Texinfo/XS/convert/get_html_perl_info.c
b/tp/Texinfo/XS/convert/get_html_perl_info.c
index 223a59f875..12c4f01427 100644
--- a/tp/Texinfo/XS/convert/get_html_perl_info.c
+++ b/tp/Texinfo/XS/convert/get_html_perl_info.c
@@ -152,6 +152,7 @@ html_converter_initialize_sv (SV *converter_sv,
SV **formatting_function_sv;
SV **sorted_special_unit_varieties_sv;
SV **no_arg_commands_formatting_sv;
+ SV **accent_entities_sv;
SV **style_commands_formatting_sv;
SV **types_open_sv;
SV **types_conversion_sv;
@@ -175,6 +176,7 @@ html_converter_initialize_sv (SV *converter_sv,
int nr_string_directions;
int nr_dir_str_contexts = TDS_context_string +1;
enum direction_string_type DS_type;
+ int nr_accent_cmd = 0;
dTHX;
@@ -326,8 +328,16 @@ html_converter_initialize_sv (SV *converter_sv,
conversion_formatting_reference, ref_name,
default_commands_conversion_hv,
commands_conversion_hv);
+
+ /* NOTE use the loop to collect the number of accent commands too */
+ if (builtin_command_data[i].flags & CF_accent)
+ nr_accent_cmd++;
}
+ converter->accent_formatted_cmd.list = (enum command_id *)
+ malloc (nr_accent_cmd * sizeof (enum command_id));
+ converter->accent_formatted_cmd.number = 0;
+
default_css_string_commands_conversion_hv
= (HV *)SvRV (default_css_string_commands_conversion);
/* copy the normal formatting references and replace the css strings
@@ -350,6 +360,14 @@ html_converter_initialize_sv (SV *converter_sv,
register_formatting_reference_default ("css_command_conversion",
conversion_formatting_reference, ref_name,
default_css_string_commands_conversion_hv);
+
+ /* NOTE we use the loop to collect the accent commands too */
+ if (builtin_command_data[i].flags & CF_accent)
+ {
+ converter->accent_formatted_cmd.list[
+ converter->accent_formatted_cmd.number] = i;
+ converter->accent_formatted_cmd.number++;
+ }
}
@@ -744,6 +762,56 @@ html_converter_initialize_sv (SV *converter_sv,
sizeof (enum command_id), compare_ints);
}
+ FETCH(accent_entities)
+
+ if (accent_entities_sv)
+ {
+ I32 hv_number;
+ I32 i;
+
+ HV *accent_entities_hv
+ = (HV *)SvRV (*accent_entities_sv);
+
+ hv_number = hv_iterinit (accent_entities_hv);
+
+ for (i = 0; i < hv_number; i++)
+ {
+ char *cmdname;
+ I32 retlen;
+ SV *spec_sv = hv_iternextsv (accent_entities_hv,
+ &cmdname, &retlen);
+ if (SvOK (spec_sv))
+ {
+ enum command_id cmd = lookup_builtin_command (cmdname);
+ if (!cmd)
+ fprintf (stderr, "ERROR: %s: no accent command\n", cmdname);
+ else
+ {
+ ACCENT_ENTITY_INFO *accent_info
+ = &converter->accent_entities[cmd];
+ AV *spec_av = (AV *)SvRV (spec_sv);
+ SV **entity_sv = av_fetch (spec_av, 0, 0);
+ SV **characters_sv = av_fetch (spec_av, 1, 0);
+
+ if (entity_sv)
+ {
+ char *entity = (char *) SvPVutf8_nolen (*entity_sv);
+ accent_info->entity = strdup (entity);
+ }
+
+ if (characters_sv && SvOK (*characters_sv))
+ {
+ char *characters
+ = (char *) SvPVutf8_nolen (*characters_sv);
+ if (strlen (characters))
+ accent_info->characters = strdup (characters);
+ }
+ }
+ }
+ }
+ }
+
+
FETCH(style_commands_formatting)
if (style_commands_formatting_sv)
diff --git a/tp/Texinfo/XS/main/convert_to_text.c
b/tp/Texinfo/XS/main/convert_to_text.c
index 2f8e280c3b..fa71a6b677 100644
--- a/tp/Texinfo/XS/main/convert_to_text.c
+++ b/tp/Texinfo/XS/main/convert_to_text.c
@@ -43,8 +43,12 @@
#include "cmd_symbol.c"
#include "cmd_text.c"
+
+/* the CONVERTER argument is not used, it is there solely to match the
+ calling prototype in accent formatting commands */
char *
-ascii_accent (const char *text, const ELEMENT *command, int set_case)
+ascii_accent (CONVERTER *self, const char *text,
+ const ELEMENT *command, int set_case)
{
const enum command_id cmd = command->cmd;
TEXT accent_text;
@@ -96,7 +100,8 @@ ascii_accents_internal (const char *text, const
ELEMENT_STACK *stack,
for (i = stack->top - 1; i >= 0; i--)
{
const ELEMENT *accent_command = stack->stack[i];
- char *formatted_accent = ascii_accent (result, accent_command, set_case);
+ char *formatted_accent = ascii_accent (0, result, accent_command,
+ set_case);
free (result);
result = formatted_accent;
}
@@ -180,7 +185,7 @@ char *
text_accents (const ELEMENT *accent, char *encoding, int set_case)
{
ACCENTS_STACK *accent_stack = find_innermost_accent_contents (accent);
- char *text;
+ char *arg_text;
char *result;
TEXT_OPTIONS *text_options = new_text_options ();
@@ -189,16 +194,16 @@ text_accents (const ELEMENT *accent, char *encoding, int
set_case)
text_options->set_case = set_case;
if (accent_stack->argument)
- text = convert_to_text (accent_stack->argument, text_options);
+ arg_text = convert_to_text (accent_stack->argument, text_options);
else
- text = strdup ("");
+ arg_text = strdup ("");
- result = encoded_accents (text, &accent_stack->stack, encoding,
+ result = encoded_accents (0, arg_text, &accent_stack->stack, encoding,
ascii_accent, set_case);
if (!result)
- result = ascii_accents_internal (text, &accent_stack->stack, set_case);
- free (text);
+ result = ascii_accents_internal (arg_text, &accent_stack->stack, set_case);
+ free (arg_text);
destroy_accent_stack (accent_stack);
destroy_text_options (text_options);
return result;
diff --git a/tp/Texinfo/XS/main/converter_types.h
b/tp/Texinfo/XS/main/converter_types.h
index a8ee33e43e..b9cee1b993 100644
--- a/tp/Texinfo/XS/main/converter_types.h
+++ b/tp/Texinfo/XS/main/converter_types.h
@@ -525,6 +525,11 @@ typedef struct HTML_ARGS_FORMATTED {
HTML_ARG_FORMATTED *args;
} HTML_ARGS_FORMATTED;
+typedef struct ACCENT_ENTITY_INFO {
+ char *entity;
+ char *characters;
+} ACCENT_ENTITY_INFO;
+
typedef struct COMMAND_CONVERSION_FUNCTION {
enum formatting_reference_status status;
/* points to the perl formatting reference if it is used for
@@ -694,9 +699,11 @@ typedef struct CONVERTER {
/* set for a converter */
COMMAND_ID_LIST no_arg_formatted_cmd;
COMMAND_ID_LIST style_formatted_cmd;
+ COMMAND_ID_LIST accent_formatted_cmd;
int code_types[TXI_TREE_TYPES_NUMBER];
char *pre_class_types[TXI_TREE_TYPES_NUMBER];
int upper_case[BUILTIN_CMD_NUMBER];
+ ACCENT_ENTITY_INFO accent_entities[BUILTIN_CMD_NUMBER];
STRING_WITH_LEN special_character[SC_non_breaking_space+1];
STRING_WITH_LEN line_break_element;
CSS_SELECTOR_STYLE_LIST css_element_class_styles;
diff --git a/tp/Texinfo/XS/main/unicode.c b/tp/Texinfo/XS/main/unicode.c
index 2095f0d1f4..1d293e50e0 100644
--- a/tp/Texinfo/XS/main/unicode.c
+++ b/tp/Texinfo/XS/main/unicode.c
@@ -91,7 +91,7 @@ unicode_accent (const char *text, const ELEMENT *e)
if (e->cmd == CM_dotless)
{
if (!e->parent || !e->parent->parent || !e->parent->parent->cmd
- || !unicode_diacritics[e->parent->parent->cmd])
+ || !unicode_diacritics[e->parent->parent->cmd].text)
{
if (!strcmp (text, "i"))
/* dotless i in UTF-8 */
@@ -103,7 +103,7 @@ unicode_accent (const char *text, const ELEMENT *e)
return strdup(text);
}
- if (unicode_diacritics[e->cmd])
+ if (unicode_diacritics[e->cmd].text)
{
static TEXT accented_text;
if (e->cmd == CM_tieaccent)
@@ -139,7 +139,7 @@ unicode_accent (const char *text, const ELEMENT *e)
text_init (&accented_text);
text_append (&accented_text, first_char_text);
free (first_char_text);
- text_append (&accented_text, unicode_diacritics[e->cmd]);
+ text_append (&accented_text,
unicode_diacritics[e->cmd].text);
next_text = u8_strconv_to_encoding (next, "UTF-8",
iconveh_question_mark);
text_append (&accented_text, next_text);
@@ -154,7 +154,7 @@ unicode_accent (const char *text, const ELEMENT *e)
}
text_init (&accented_text);
text_append (&accented_text, text);
- text_append (&accented_text, unicode_diacritics[e->cmd]);
+ text_append (&accented_text, unicode_diacritics[e->cmd].text);
result = normalize_NFC (accented_text.text);
free (accented_text.text);
}
@@ -172,9 +172,10 @@ compare_strings (const void *a, const void *b)
}
char *
-format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
- int encoding_index,
- char *(*format_accent)(const char *text, const ELEMENT *element, int
set_case),
+format_eight_bit_accents_stack (CONVERTER *self, const char *text,
+ const ELEMENT_STACK *stack, int encoding_index,
+ char *(*format_accent)(CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case),
int set_case)
{
int i, j, k;
@@ -195,7 +196,10 @@ format_eight_bit_accents_stack (const char *text, const
ELEMENT_STACK *stack,
results_stack[i] = unicode_accent (results_stack[i+1],
accent_command);
if (!results_stack[i])
- break;
+ {
+ i--;
+ break;
+ }
else if (set_case)
{
char *cased = to_upper_or_lower_multibyte (results_stack[i],
set_case);
@@ -203,6 +207,8 @@ format_eight_bit_accents_stack (const char *text, const
ELEMENT_STACK *stack,
results_stack[i] = cased;
}
}
+ /* undo the last decrease of i */
+ i++;
/*
At this point we have the unicode character results for the accent
@@ -282,7 +288,6 @@ format_eight_bit_accents_stack (const char *text, const
ELEMENT_STACK *stack,
}
free (prev_eight_bit);
- free (new_eight_bit);
/*
handle the remaining accents, that have not been converted to 8bit
@@ -292,7 +297,7 @@ format_eight_bit_accents_stack (const char *text, const
ELEMENT_STACK *stack,
{
const ELEMENT *accent_command = stack->stack[j];
char *formatted_result
- = (*format_accent) (result, accent_command, set_case);
+ = (*format_accent) (self, result, accent_command, set_case);
free (result);
result = formatted_result;
}
@@ -306,11 +311,11 @@ format_eight_bit_accents_stack (const char *text, const
ELEMENT_STACK *stack,
return result;
}
-/* FIXME converter in perl for (*format_accent), see encoded_accents comment*/
char *
-format_unicode_accents_stack_internal (const char *text,
+format_unicode_accents_stack_internal (CONVERTER *self, const char *text,
const ELEMENT_STACK *stack,
- char *(*format_accent)(const char *text, const ELEMENT *element, int
set_case),
+ char *(*format_accent)(CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case),
int set_case)
{
int i;
@@ -340,20 +345,18 @@ format_unicode_accents_stack_internal (const char *text,
{
const ELEMENT *accent_command = stack->stack[i];
char *formatted_result
- = (*format_accent) (result, accent_command, set_case);
+ = (*format_accent) (self, result, accent_command, set_case);
free (result);
result = formatted_result;
}
return result;
}
-/* FIXME a converter is passed in perl to (*format_accent), both
- directly and through functions. It is not clear whether it is
- actually used in perl, nor if it could be useful in C */
char *
-encoded_accents (const char *text, const ELEMENT_STACK *stack,
+encoded_accents (CONVERTER *self, const char *text, const ELEMENT_STACK *stack,
const char *encoding,
- char *(*format_accent)(const char *text, const ELEMENT *element, int
set_case),
+ char *(*format_accent)(CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case),
int set_case)
{
if (encoding)
@@ -374,7 +377,7 @@ encoded_accents (const char *text, const ELEMENT_STACK
*stack,
if (!strcmp (normalized_encoding, "utf-8"))
{
free (normalized_encoding);
- return format_unicode_accents_stack_internal (text, stack,
+ return format_unicode_accents_stack_internal (self, text, stack,
format_accent, set_case);
}
for (i = 0; i < sizeof (unicode_to_eight_bit)
@@ -390,8 +393,8 @@ encoded_accents (const char *text, const ELEMENT_STACK
*stack,
if (encoding_index >= 0)
{
free (normalized_encoding);
- return format_eight_bit_accents_stack (text, stack,
encoding_index,
- format_accent, set_case);
+ return format_eight_bit_accents_stack (self, text, stack,
+ encoding_index, format_accent, set_case);
}
}
free (normalized_encoding);
diff --git a/tp/Texinfo/XS/main/unicode.h b/tp/Texinfo/XS/main/unicode.h
index 0850a4b497..bfcc7b53a0 100644
--- a/tp/Texinfo/XS/main/unicode.h
+++ b/tp/Texinfo/XS/main/unicode.h
@@ -18,6 +18,11 @@ typedef struct COMMAND_UNICODE {
int is_extra;
} COMMAND_UNICODE;
+typedef struct DIACRITIC_UNICODE {
+ char *text; /* UTF-8 encoded */
+ char *codepoint;
+} DIACRITIC_UNICODE;
+
/* can be inlined in text parsing codes */
#define OTXI_UNICODE_TEXT_CASES(var) \
case '-': \
@@ -69,7 +74,7 @@ typedef struct COMMAND_UNICODE {
break;
-extern char *unicode_diacritics[];
+extern DIACRITIC_UNICODE unicode_diacritics[];
extern COMMAND_UNICODE unicode_character_brace_no_arg_commands[];
int unicode_point_decoded_in_encoding (char *encoding, char *codepoint);
@@ -78,9 +83,10 @@ char *normalize_NFC (const char *text);
char *normalize_NFKD (const char *text);
char *unicode_accent (const char *text, const ELEMENT *e);
-char *encoded_accents (const char *text, const ELEMENT_STACK *stack,
- const char *encoding,
- char *(*format_accent)(const char *text, const ELEMENT *element, int
set_case),
+char *encoded_accents (CONVERTER *self, const char *text,
+ const ELEMENT_STACK *stack, const char *encoding,
+ char *(*format_accent)(CONVERTER *self, const char *text,
+ const ELEMENT *element, int set_case),
int set_case);
char *unicode_brace_no_arg_command (enum command_id cmd, char *encoding);
diff --git a/tp/Texinfo/XS/main/utils.c b/tp/Texinfo/XS/main/utils.c
index a863e57f85..a5817fbbce 100644
--- a/tp/Texinfo/XS/main/utils.c
+++ b/tp/Texinfo/XS/main/utils.c
@@ -28,6 +28,7 @@
#include "unistr.h"
#include "unicase.h"
#include "uniwidth.h"
+#include <unictype.h>
#include "global_commands_types.h"
#include "options_types.h"
@@ -242,6 +243,42 @@ width_multibyte (const char *text)
return result;
}
+/* length of next word in multibyte setting. Should correspond to \w or
+ \p{Word} in perl */
+int
+word_bytes_len_multibyte (const char *text)
+{
+ uint8_t *encoded_u8 = u8_strconv_from_encoding (text, "UTF-8",
+ iconveh_question_mark);
+ uint8_t *current_u8 = encoded_u8;
+ int len = 0;
+ while (1)
+ {
+ ucs4_t next_char;
+ int new_len = u8_strmbtouc (&next_char, current_u8);
+ if (!new_len)
+ {
+ break;
+ }
+ /* (\p{Alnum} = \p{Alphabetic} + \p{Nd}) + \pM + \p{Pc}
+ + \p{Join_Control} */
+ if (uc_is_general_category (next_char, UC_CATEGORY_M)
+ || uc_is_general_category (next_char, UC_CATEGORY_Nd)
+ || uc_is_property (next_char, UC_PROPERTY_ALPHABETIC)
+ || uc_is_property (next_char, UC_PROPERTY_JOIN_CONTROL))
+ {
+ len += new_len;
+ current_u8 += new_len;
+ }
+ else
+ {
+ break;
+ }
+ }
+ free (encoded_u8);
+ return len;
+}
+
/* encoding and decoding. Use iconv. */
/* conversion to or from utf-8 should always be set before other
diff --git a/tp/Texinfo/XS/main/utils.h b/tp/Texinfo/XS/main/utils.h
index 342cc1f57d..795ca1ebc9 100644
--- a/tp/Texinfo/XS/main/utils.h
+++ b/tp/Texinfo/XS/main/utils.h
@@ -182,6 +182,7 @@ int isascii_upper (int c);
size_t count_multibyte (const char *text);
char *to_upper_or_lower_multibyte (const char *text, int lower_or_upper);
int width_multibyte (const char *text);
+int word_bytes_len_multibyte (const char *text);
void delete_global_info (GLOBAL_INFO *global_info_ref);
void delete_global_commands (GLOBAL_COMMANDS *global_commands_ref);
diff --git a/tp/maintain/setup_converters_code_tables.pl
b/tp/maintain/setup_converters_code_tables.pl
index f2bcf013fb..bac7566369 100755
--- a/tp/maintain/setup_converters_code_tables.pl
+++ b/tp/maintain/setup_converters_code_tables.pl
@@ -145,7 +145,7 @@ my %extra_unicode_map =
%Texinfo::Convert::Unicode::extra_unicode_map;
open (UNIC, '>', $unicode_file) or die "Open $unicode_file: $!\n";
print UNIC '#include "unicode.h"'."\n\n";
-print UNIC "char *unicode_diacritics[] = {\n";
+print UNIC "DIACRITIC_UNICODE unicode_diacritics[] = {\n";
foreach my $command_name (@commands_order) {
my $command = $command_name;
if (exists($name_commands{$command_name})) {
@@ -154,11 +154,12 @@ foreach my $command_name (@commands_order) {
#print UNIC "$command; ";
if (defined($unicode_diacritics{$command_name})) {
- my $result = chr(hex($unicode_diacritics{$command_name}));
+ my $numeric_codepoint = hex($unicode_diacritics{$command_name});
+ my $result = chr($numeric_codepoint);
my $protected = join ('', map {_protect_char($_)} split ('', $result));
- print UNIC "\"$protected\", /* $command */\n";
+ print UNIC "{\"$protected\", \"$numeric_codepoint\"}, /* $command */\n";
} else {
- print UNIC "0,\n";
+ print UNIC "{0, 0},\n";
}
}
print UNIC "};\n\n";