[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Index sorting in C
From: |
Patrice Dumas |
Subject: |
branch master updated: Index sorting in C |
Date: |
Tue, 30 Jan 2024 16:53:43 -0500 |
This is an automated email from the git hooks/post-receive script.
pertusus pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new a7fb50d574 Index sorting in C
a7fb50d574 is described below
commit a7fb50d574c034bd0b8c23520ccd917c5b23fae5
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Tue Jan 30 22:46:28 2024 +0100
Index sorting in C
* tp/Texinfo/Indices.pm (setup_sortable_index_entries): rename
variables. Simplify code that determine that there is one non empty
entry/subentry.
* tp/Texinfo/XS/parsetexi/indices.c (enter_index_entry),
tp/Texinfo/XS/main/tree_types.h (INDEX_ENTRY): add index
entry number in index entry structure.
* tp/Texinfo/XS/convert/indices_in_conversion.c (set_sort_key)
(INDEX_SORT_STRING_KEY, index_entry_element_sort_string_key)
(setup_sortable_index_entries, LETTER_SORTABLE_ENTRIES)
(INDEX_LETTERS_SORTABLE_ENTRIES, compare_index_letter)
(compare_sortable_subentry_keys, compare_sortable_index_entry)
(sort_indices_by_letter),
tp/Texinfo/XS/convert/indices_in_conversion.h
(SORTABLE_INDEX_SUBENTRY, SORTABLE_INDEX_ENTRY)
(INDEX_SORTABLE_ENTRIES, INDICES_SORTABLE_ENTRIES): implement
index sorting by letter in C.
* tp/Texinfo/XS/configure.ac: check for newlocale and strxfrm_l.
* tp/Texinfo/Convert/HTML.pm (%XS_conversion_overrides)
(_NonXS_sort_index_entries, _XS_only_sort_index_entries)
(_sort_index_entries), tp/Texinfo/XS/convert/ConvertXS.xs
(html_sort_index_entries), tp/Texinfo/XS/convert/convert_html.c
(html_sort_index_entries): XS interface for index sorting in HTML,
if TEST is not set.
---
ChangeLog | 31 ++
tp/Texinfo/Convert/HTML.pm | 22 +-
tp/Texinfo/Indices.pm | 65 +--
tp/Texinfo/XS/configure.ac | 2 +
tp/Texinfo/XS/convert/ConvertXS.xs | 10 +
tp/Texinfo/XS/convert/convert_html.c | 14 +
tp/Texinfo/XS/convert/convert_html.h | 1 +
tp/Texinfo/XS/convert/indices_in_conversion.c | 570 ++++++++++++++++++++++++++
tp/Texinfo/XS/convert/indices_in_conversion.h | 31 ++
tp/Texinfo/XS/main/tree_types.h | 4 +-
tp/Texinfo/XS/main/utils.c | 4 +
tp/Texinfo/XS/parsetexi/indices.c | 4 +-
12 files changed, 722 insertions(+), 36 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 8e893ef4b6..da4abe7e33 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,34 @@
+2024-01-30 Patrice Dumas <pertusus@free.fr>
+
+ Index sorting in C
+
+ * tp/Texinfo/Indices.pm (setup_sortable_index_entries): rename
+ variables. Simplify code that determine that there is one non empty
+ entry/subentry.
+
+ * tp/Texinfo/XS/parsetexi/indices.c (enter_index_entry),
+ tp/Texinfo/XS/main/tree_types.h (INDEX_ENTRY): add index
+ entry number in index entry structure.
+
+ * tp/Texinfo/XS/convert/indices_in_conversion.c (set_sort_key)
+ (INDEX_SORT_STRING_KEY, index_entry_element_sort_string_key)
+ (setup_sortable_index_entries, LETTER_SORTABLE_ENTRIES)
+ (INDEX_LETTERS_SORTABLE_ENTRIES, compare_index_letter)
+ (compare_sortable_subentry_keys, compare_sortable_index_entry)
+ (sort_indices_by_letter),
+ tp/Texinfo/XS/convert/indices_in_conversion.h
+ (SORTABLE_INDEX_SUBENTRY, SORTABLE_INDEX_ENTRY)
+ (INDEX_SORTABLE_ENTRIES, INDICES_SORTABLE_ENTRIES): implement
+ index sorting by letter in C.
+ * tp/Texinfo/XS/configure.ac: check for newlocale and strxfrm_l.
+
+ * tp/Texinfo/Convert/HTML.pm (%XS_conversion_overrides)
+ (_NonXS_sort_index_entries, _XS_only_sort_index_entries)
+ (_sort_index_entries), tp/Texinfo/XS/convert/ConvertXS.xs
+ (html_sort_index_entries), tp/Texinfo/XS/convert/convert_html.c
+ (html_sort_index_entries): XS interface for index sorting in HTML,
+ if TEST is not set.
+
2024-01-29 Gavin Smith <gavinsmith0123@gmail.com>
* doc/texinfo.texi (Other Customization Variables),
diff --git a/tp/Texinfo/Convert/HTML.pm b/tp/Texinfo/Convert/HTML.pm
index 5688bf9c25..ccbc2473ad 100644
--- a/tp/Texinfo/Convert/HTML.pm
+++ b/tp/Texinfo/Convert/HTML.pm
@@ -267,6 +267,8 @@ my %XS_conversion_overrides = (
=> "Texinfo::Convert::ConvertXS::get_index_entries_sorted_by_letter",
"Texinfo::Convert::HTML::_XS_html_merge_index_entries"
=> "Texinfo::Convert::ConvertXS::html_merge_index_entries",
+ "Texinfo::Convert::HTML::_XS_only_sort_index_entries"
+ => "Texinfo::Convert::ConvertXS::html_sort_index_entries",
"Texinfo::Convert::HTML::_prepare_conversion_units"
=> "Texinfo::Convert::ConvertXS::html_prepare_conversion_units",
"Texinfo::Convert::HTML::_prepare_units_directions_files"
@@ -10281,7 +10283,7 @@ sub _XS_html_merge_index_entries($)
{
}
-sub _sort_index_entries($)
+sub _NonXS_sort_index_entries($)
{
my $self = shift;
@@ -10311,6 +10313,24 @@ sub _sort_index_entries($)
}
}
+sub _XS_only_sort_index_entries($)
+{
+ my $self = shift;
+ _NonXS_sort_index_entries($self);
+}
+
+sub _sort_index_entries($)
+{
+ my $self = shift;
+
+ # Sorting in Perl to have a reproducible output for tests
+ if ($self->get_conf('TEST')) {
+ _NonXS_sort_index_entries($self);
+ } else {
+ _XS_only_sort_index_entries($self);
+ }
+}
+
sub _prepare_index_entries_targets($)
{
my $self = shift;
diff --git a/tp/Texinfo/Indices.pm b/tp/Texinfo/Indices.pm
index b9322a1399..c1be210186 100644
--- a/tp/Texinfo/Indices.pm
+++ b/tp/Texinfo/Indices.pm
@@ -171,7 +171,7 @@ sub _collator_sort_index_entries($$$)
my $collator = shift;
my $key_index = 0;
- # the keys array corresponds to th emain entry and subentries
+ # the keys array corresponds to the main entry and subentries
foreach my $key1_str (@{$key1->{'keys'}}) {
my $res = _collator_sort_key($key1_str,
$key2->{'keys'}->[$key_index],
@@ -422,13 +422,14 @@ sub setup_sortable_index_entries($$$$$)
if ($in_code) {
Texinfo::Convert::Text::set_options_code($convert_text_options);
}
- my ($entry_key, $sort_entry_key)
+ my ($entry_sort_string, $entry_sort_key)
= _index_entry_element_sort_string_key($customization_information,
$index_entry, $main_entry_element,
$convert_text_options, $entries_collator);
- my @entry_keys;
- my @sort_entry_keys;
- if ($entry_key !~ /\S/) {
+ my $non_empty_index_subentries = 0;
+ my @entry_sort_strings;
+ my @entry_sort_keys;
+ if ($entry_sort_string !~ /\S/) {
my $entry_cmdname = $main_entry_element->{'cmdname'};
$entry_cmdname
= $main_entry_element->{'extra'}->{'original_def_cmdname'}
@@ -438,22 +439,23 @@ sub setup_sortable_index_entries($$$$$)
sprintf(__("empty index key in \@%s"),
$entry_cmdname),
$main_entry_element->{'source_info'});
- push @entry_keys, '';
- push @sort_entry_keys, '';
+ push @entry_sort_strings, '';
+ push @entry_sort_keys, '';
} else {
- push @entry_keys, $entry_key;
- push @sort_entry_keys, $sort_entry_key;
+ push @entry_sort_strings, $entry_sort_string;
+ push @entry_sort_keys, $entry_sort_key;
+ $non_empty_index_subentries++;
}
my $subentry_nr = 0;
my $subentry = $main_entry_element;
while ($subentry->{'extra'} and $subentry->{'extra'}->{'subentry'}) {
$subentry_nr ++;
$subentry = $subentry->{'extra'}->{'subentry'};
- my ($subentry_key, $sort_subentry_key)
+ my ($subentry_sort_string, $sort_subentry_key)
=
_index_entry_element_sort_string_key($customization_information,
$index_entry, $subentry, $convert_text_options,
$entries_collator);
- if ($subentry_key !~ /\S/) {
+ if ($subentry_sort_string !~ /\S/) {
my $entry_cmdname = $main_entry_element->{'cmdname'};
$entry_cmdname
= $main_entry_element->{'extra'}->{'original_def_cmdname'}
@@ -463,36 +465,35 @@ sub setup_sortable_index_entries($$$$$)
sprintf(__("empty index sub entry %d key in \@%s"),
$subentry_nr, $entry_cmdname),
$main_entry_element->{'source_info'});
- push @entry_keys, '';
- push @sort_entry_keys, '';
+ push @entry_sort_strings, '';
+ push @entry_sort_keys, '';
} else {
- push @entry_keys, $subentry_key;
- push @sort_entry_keys, $sort_subentry_key;
+ push @entry_sort_strings, $subentry_sort_string;
+ push @entry_sort_keys, $sort_subentry_key;
+ $non_empty_index_subentries++;
}
}
- foreach my $sub_entry_key (@sort_entry_keys) {
- if ($sub_entry_key ne '') {
- my @keys_and_alpha;
- for (my $i = 0; $i < scalar (@entry_keys); $i++) {
- my $alpha = 0;
- if ($entry_keys[$i] =~ /^[[:alpha:]]/) {
- $alpha = 1;
- }
- push @keys_and_alpha, [$sort_entry_keys[$i], $alpha];
+ if ($non_empty_index_subentries > 0) {
+ my @keys_and_alpha;
+ for (my $i = 0; $i < scalar (@entry_sort_strings); $i++) {
+ my $alpha = 0;
+ if ($entry_sort_strings[$i] =~ /^[[:alpha:]]/) {
+ $alpha = 1;
}
- my $sortable_entry = {'entry' => $index_entry,
- 'keys' => \@keys_and_alpha,
- 'entry_keys' => \@entry_keys,
- 'number' => $index_entry->{'entry_number'},
- 'index_name' => $entry_index_name};
- push @{$sortable_index_entries}, $sortable_entry;
- last;
+ push @keys_and_alpha, [$entry_sort_keys[$i], $alpha];
}
+ my $sortable_entry = {'entry' => $index_entry,
+ 'keys' => \@keys_and_alpha,
+ 'entry_keys' => \@entry_sort_strings,
+ 'number' => $index_entry->{'entry_number'},
+ 'index_name' => $entry_index_name};
+ push @{$sortable_index_entries}, $sortable_entry;
}
if ($in_code) {
Texinfo::Convert::Text::reset_options_code($convert_text_options);
}
- $index_entries_sort_strings->{$index_entry} = join(', ', @entry_keys);
+ $index_entries_sort_strings->{$index_entry}
+ = join(', ', @entry_sort_strings);
}
$index_sortable_index_entries->{$index_name} = $sortable_index_entries;
}
diff --git a/tp/Texinfo/XS/configure.ac b/tp/Texinfo/XS/configure.ac
index 9e271fda20..495d13a905 100644
--- a/tp/Texinfo/XS/configure.ac
+++ b/tp/Texinfo/XS/configure.ac
@@ -175,6 +175,8 @@ esac
AC_SUBST([perl_conf_CFLAGS], [$perl_conf_CFLAGS])
AC_SUBST([perl_conf_LDFLAGS], [$perl_conf_LDFLAGS])
+AC_CHECK_FUNCS(newlocale strxfrm_l)
+
# Output these with the _CONFIG suffix as we use the originals as names
# of customization variables.
AC_DEFINE_UNQUOTED([PACKAGE_CONFIG], ["$PACKAGE"],
diff --git a/tp/Texinfo/XS/convert/ConvertXS.xs
b/tp/Texinfo/XS/convert/ConvertXS.xs
index f3e152e685..750ed03878 100644
--- a/tp/Texinfo/XS/convert/ConvertXS.xs
+++ b/tp/Texinfo/XS/convert/ConvertXS.xs
@@ -1699,6 +1699,16 @@ html_merge_index_entries (SV *converter_in)
if (self)
html_merge_index_entries (self);
+void
+html_sort_index_entries (SV *converter_in)
+ PREINIT:
+ CONVERTER *self;
+ CODE:
+ self = get_sv_converter (converter_in,
+ "html_sort_index_entries");
+ if (self)
+ html_sort_index_entries (self);
+
void
reset_output_init_conf (SV *sv_in)
diff --git a/tp/Texinfo/XS/convert/convert_html.c
b/tp/Texinfo/XS/convert/convert_html.c
index 60e29808b1..f9baefb3ee 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -4680,6 +4680,20 @@ html_merge_index_entries (CONVERTER *self)
}
}
+void
+html_sort_index_entries (CONVERTER *self)
+{
+ html_merge_index_entries (self);
+
+ if (self->document->index_names)
+ {
+ self->index_entries_by_letter
+ = sort_indices_by_letter (&self->error_messages, self->conf,
+ self->index_entries,
+ self->document->index_names);
+ }
+}
+
int
compare_index_name (const void *a, const void *b)
{
diff --git a/tp/Texinfo/XS/convert/convert_html.h
b/tp/Texinfo/XS/convert/convert_html.h
index 206acdd78b..663510c363 100644
--- a/tp/Texinfo/XS/convert/convert_html.h
+++ b/tp/Texinfo/XS/convert/convert_html.h
@@ -169,6 +169,7 @@ size_t html_check_htmlxref_already_warned (CONVERTER *self,
const SOURCE_INFO *source_info);
void html_merge_index_entries (CONVERTER *self);
+void html_sort_index_entries (CONVERTER *self);
void html_prepare_conversion_units (CONVERTER *self,
int *output_units_descriptor_ref,
diff --git a/tp/Texinfo/XS/convert/indices_in_conversion.c
b/tp/Texinfo/XS/convert/indices_in_conversion.c
index 4a451d2f63..271d93daca 100644
--- a/tp/Texinfo/XS/convert/indices_in_conversion.c
+++ b/tp/Texinfo/XS/convert/indices_in_conversion.c
@@ -17,6 +17,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <locale.h>
#include "uniconv.h"
#include "unictype.h"
#include "unistr.h"
@@ -25,8 +26,11 @@
#include "converter_types.h"
#include "utils.h"
#include "extra.h"
+#include "errors.h"
+#include "debug.h"
#include "unicode.h"
#include "convert_to_text.h"
+#include "convert_to_texinfo.h"
#include "indices_in_conversion.h"
/* corresponding perl code in Texinfo::Indices */
@@ -222,3 +226,569 @@ index_entry_element_sort_string (INDEX_ENTRY *main_entry,
}
return sort_string;
}
+
+static void
+set_sort_key (locale_t collation_locale, const char *input_string,
+ char **result_key)
+{
+ if (collation_locale)
+ {
+ #ifdef HAVE_STRXFRM_L
+ size_t len = strxfrm_l (0, input_string, 0, collation_locale);
+ size_t check_len;
+
+ *result_key
+ = (char *) malloc ((len +1) * sizeof (char));
+ check_len = strxfrm_l (*result_key, input_string, len+1,
+ collation_locale);
+ if (check_len != len)
+ fatal ("strxfrm_l returns a different length");
+ #endif
+ }
+ else
+ *result_key = strdup (input_string);
+}
+
+typedef struct INDEX_SORT_STRING_KEY {
+ char *sort_string;
+ char *sort_key;
+} INDEX_SORT_STRING_KEY;
+
+static INDEX_SORT_STRING_KEY *
+index_entry_element_sort_string_key (INDEX_ENTRY *main_entry,
+ ELEMENT *index_entry_element,
+ TEXT_OPTIONS *options, int in_code,
+ locale_t collation_locale,
+ int prefer_reference_element)
+{
+ INDEX_SORT_STRING_KEY *sort_string_key = (INDEX_SORT_STRING_KEY *)
+ malloc (sizeof (INDEX_SORT_STRING_KEY));
+ sort_string_key->sort_string = index_entry_element_sort_string (main_entry,
+ index_entry_element, options, in_code,
+ prefer_reference_element);
+ set_sort_key (collation_locale, sort_string_key->sort_string,
+ &sort_string_key->sort_key);
+
+ return sort_string_key;
+}
+
+INDICES_SORTABLE_ENTRIES *
+setup_sortable_index_entries (ERROR_MESSAGE_LIST *error_messages,
+ OPTIONS *options, MERGED_INDEX *index_entries,
+ INDEX **indices_information, locale_t *collation_locale)
+{
+ size_t indices_nr;
+ size_t i;
+ TEXT_OPTIONS *convert_text_options
+ = setup_index_entry_keys_formatting (options);
+
+ /* similar to using Unicode::Collate in Perl */
+ *collation_locale = 0;
+
+ #ifdef HAVE_STRXFRM_L
+ #ifdef HAVE_NEWLOCALE
+ if (!options->USE_UNICODE_COLLATION.integer == 0)
+ *collation_locale = newlocale (LC_COLLATE_MASK, "en_US.utf-8", 0);
+ #endif
+ #endif
+
+ for (indices_nr = 0; index_entries[indices_nr].name; indices_nr++)
+ {}
+
+ if (indices_nr == 0)
+ return 0;
+
+ INDICES_SORTABLE_ENTRIES *indices_sortable_entries
+ = (INDICES_SORTABLE_ENTRIES *) malloc (sizeof (INDICES_SORTABLE_ENTRIES));
+
+ indices_sortable_entries->number = indices_nr;
+ indices_sortable_entries->indices = (INDEX_SORTABLE_ENTRIES *)
+ malloc (indices_nr * sizeof (INDEX_SORTABLE_ENTRIES));
+ memset (indices_sortable_entries->indices, 0,
+ indices_nr * sizeof (INDEX_SORTABLE_ENTRIES));
+
+ for (i = 0; i < indices_nr; i++)
+ {
+ MERGED_INDEX *index = &index_entries[i];
+ INDEX_SORTABLE_ENTRIES *sortable_index_entries
+ = &indices_sortable_entries->indices[i];
+ if (index->entries_number > 0)
+ {
+ size_t j;
+
+ sortable_index_entries->index = index;
+ sortable_index_entries->number = index->entries_number;
+ sortable_index_entries->sortable_entries = (SORTABLE_INDEX_ENTRY *)
+ malloc (index->entries_number * sizeof (SORTABLE_INDEX_ENTRY));
+
+ for (j = 0; j < index->entries_number; j++)
+ {
+ int non_empty_index_subentries = 0;
+ SORTABLE_INDEX_SUBENTRY *sortable_subentry;
+ INDEX_ENTRY *index_entry = &index->index_entries[j];
+
+ ELEMENT *main_entry_element = index_entry->entry_element;
+ ELEMENT *subentry = main_entry_element;
+
+ INDEX *entry_index
+ = indices_info_index_by_name (indices_information,
+ index_entry->index_name);
+
+ INDEX_SORT_STRING_KEY *sort_string_key
+ = index_entry_element_sort_string_key (index_entry,
+ main_entry_element, convert_text_options,
+ entry_index->in_code, *collation_locale, 0);
+
+ SORTABLE_INDEX_ENTRY *sortable_entry
+ = &sortable_index_entries->sortable_entries[j];
+
+ sortable_entry->entry = index_entry;
+ sortable_entry->subentries_number = 1;
+ sortable_entry->sortable_subentries = (SORTABLE_INDEX_SUBENTRY *)
+ malloc (sizeof (SORTABLE_INDEX_SUBENTRY));
+
+ sortable_subentry = &sortable_entry->sortable_subentries[0];
+
+ if (sort_string_key->sort_string[strspn
+ (sort_string_key->sort_string, whitespace_chars)] == '\0')
+ {
+ const char *entry_cmdname;
+
+ sortable_subentry->sort_string = strdup("");
+ sortable_subentry->sort_key = strdup("");
+ free (sort_string_key->sort_string);
+ free (sort_string_key->sort_key);
+
+ entry_cmdname = element_command_name (main_entry_element);
+ if (!entry_cmdname)
+ {
+ entry_cmdname = lookup_extra_string (main_entry_element,
+ "original_def_cmdname");
+ }
+
+ message_list_command_warn (error_messages, options,
+ main_entry_element,
+ "empty index key in @%s", entry_cmdname);
+ }
+ else
+ {
+ sortable_subentry->sort_string
+ = sort_string_key->sort_string;
+ sortable_subentry->sort_key = sort_string_key->sort_key;
+ non_empty_index_subentries++;
+ }
+ free (sort_string_key);
+
+ while (1)
+ {
+ ELEMENT *next_subentry = lookup_extra_element (subentry,
+ "subentry");
+ if (!next_subentry)
+ break;
+
+ subentry = next_subentry;
+ sortable_entry->subentries_number++;
+
+ sortable_entry->sortable_subentries
+ = (SORTABLE_INDEX_SUBENTRY *)
+ realloc (sortable_entry->sortable_subentries,
+ sizeof (SORTABLE_INDEX_SUBENTRY)
+ * sortable_entry->subentries_number);
+ if (!sortable_entry->sortable_subentries)
+ fatal ("realloc failed");
+
+ sortable_subentry
+ = &sortable_entry->sortable_subentries[
+ sortable_entry->subentries_number -1];
+
+ sort_string_key
+ = index_entry_element_sort_string_key (index_entry,
+ subentry, convert_text_options,
+ entry_index->in_code, *collation_locale, 0);
+
+ if (sort_string_key->sort_string[strspn
+ (sort_string_key->sort_string, whitespace_chars)] == '\0')
+ {
+ const char *entry_cmdname;
+
+ sortable_subentry->sort_string = strdup("");
+ sortable_subentry->sort_key = strdup("");
+ free (sort_string_key->sort_string);
+ free (sort_string_key->sort_key);
+
+ entry_cmdname = element_command_name
(main_entry_element);
+ if (!entry_cmdname)
+ {
+ entry_cmdname
+ = lookup_extra_string (main_entry_element,
+ "original_def_cmdname");
+ }
+
+ message_list_command_warn (error_messages, options,
+ main_entry_element,
+ "empty index sub entry %zu key in @%s",
+ sortable_entry->subentries_number -1,
+ entry_cmdname);
+ }
+ else
+ {
+ sortable_subentry->sort_string
+ = sort_string_key->sort_string;
+ sortable_subentry->sort_key = sort_string_key->sort_key;
+ non_empty_index_subentries++;
+ }
+ free (sort_string_key);
+ }
+ if (non_empty_index_subentries > 0)
+ {
+ int k;
+ for (k = 0; k < sortable_entry->subentries_number; k++)
+ {
+ uint8_t *encoded_u8;
+ ucs4_t next_char;
+ int new_len;
+
+ sortable_subentry
+ = &sortable_entry->sortable_subentries[k];
+ /* TODO quite inefficient, only need the first character */
+ encoded_u8
+ = u8_strconv_from_encoding (
+ sortable_subentry->sort_string,
+ "UTF-8", iconveh_question_mark);
+ new_len = u8_strmbtouc (&next_char, encoded_u8);
+ if (new_len > 0
+ && uc_is_property (next_char,
UC_PROPERTY_ALPHABETIC))
+ sortable_subentry->alpha = 1;
+ else
+ sortable_subentry->alpha = 0;
+
+ free (encoded_u8);
+ }
+ }
+ }
+ }
+ }
+ return indices_sortable_entries;
+}
+
+typedef struct LETTER_SORTABLE_ENTRIES {
+ char *letter;
+ char *letter_sort_key;
+ size_t space;
+ size_t number;
+ SORTABLE_INDEX_ENTRY **sortable_entries;
+} LETTER_SORTABLE_ENTRIES;
+
+typedef struct INDEX_LETTERS_SORTABLE_ENTRIES {
+ size_t letter_number;
+ size_t space;
+ LETTER_SORTABLE_ENTRIES *letter_entries;
+} INDEX_LETTERS_SORTABLE_ENTRIES;
+
+static int
+compare_index_letter (const void *a, const void *b)
+{
+ const LETTER_SORTABLE_ENTRIES *lse_a = (const LETTER_SORTABLE_ENTRIES *) a;
+ const LETTER_SORTABLE_ENTRIES *lse_b = (const LETTER_SORTABLE_ENTRIES *) b;
+
+ return strcmp (lse_a->letter_sort_key, lse_b->letter_sort_key);
+}
+
+static int
+compare_sortable_subentry_keys (const void *a, const void *b)
+{
+ const SORTABLE_INDEX_SUBENTRY *sis_a = (const SORTABLE_INDEX_SUBENTRY *) a;
+ const SORTABLE_INDEX_SUBENTRY *sis_b = (const SORTABLE_INDEX_SUBENTRY *) b;
+
+ if (sis_a->alpha == sis_b->alpha)
+ return strcmp (sis_a->sort_key, sis_b->sort_key);
+
+ return (sis_a->alpha > sis_b->alpha) - (sis_a->alpha < sis_b->alpha);
+}
+
+static int
+compare_sortable_index_entry (const void *a, const void *b)
+{
+ const SORTABLE_INDEX_ENTRY **psie_a = (const SORTABLE_INDEX_ENTRY **) a;
+ const SORTABLE_INDEX_ENTRY **psie_b = (const SORTABLE_INDEX_ENTRY **) b;
+ const SORTABLE_INDEX_ENTRY *sie_a = *psie_a;
+ const SORTABLE_INDEX_ENTRY *sie_b = *psie_b;
+
+ size_t i;
+ int res;
+
+ /* corresponds to the main entry and subentries */
+ for (i = 0; i < sie_a->subentries_number; i++)
+ {
+ SORTABLE_INDEX_SUBENTRY *sub_key_a = &sie_a->sortable_subentries[i];
+ SORTABLE_INDEX_SUBENTRY *sub_key_b = &sie_b->sortable_subentries[i];
+
+ res = compare_sortable_subentry_keys (sub_key_a, sub_key_b);
+
+ if (res != 0)
+ return res;
+
+ if (i + 1 >= sie_b->subentries_number)
+ break;
+ }
+ res = (sie_a->subentries_number > sie_b->subentries_number)
+ - (sie_a->subentries_number < sie_b->subentries_number);
+
+ if (res != 0)
+ return res;
+
+ res = (sie_a->entry->number > sie_b->entry->number)
+ - (sie_a->entry->number < sie_b->entry->number);
+
+ if (res != 0)
+ return res;
+
+ /* This may happen if 2 indices are merged as the number is per
+ index name. */
+
+ return strcmp (sie_a->entry->index_name, sie_b->entry->index_name);
+}
+
+INDEX_SORTED_BY_LETTER *
+sort_indices_by_letter (ERROR_MESSAGE_LIST *error_messages,
+ OPTIONS *options, MERGED_INDEX *index_entries,
+ INDEX **indices_information)
+{
+ size_t i;
+ int index_nr = 0;
+ locale_t collation_locale = 0;
+ static INDEX_LETTERS_SORTABLE_ENTRIES index_letters_sortable_entries;
+
+ INDICES_SORTABLE_ENTRIES *indices_sortable_entries
+ = setup_sortable_index_entries (error_messages, options, index_entries,
+ indices_information, &collation_locale);
+
+ if (!indices_sortable_entries || indices_sortable_entries->number <= 0)
+ return 0;
+
+ INDEX_SORTED_BY_LETTER *sorted_index_entries
+ = (INDEX_SORTED_BY_LETTER *) malloc
+ ((indices_sortable_entries->number +1) * sizeof (INDEX_SORTED_BY_LETTER));
+
+ for (i = 0; i < indices_sortable_entries->number; i++)
+ {
+ size_t j;
+ INDEX_SORTABLE_ENTRIES *sortable_index_entries
+ = &indices_sortable_entries->indices[i];
+ INDEX_SORTED_BY_LETTER *index_sorted;
+
+ if (sortable_index_entries->number <= 0)
+ continue;
+
+ index_sorted = &sorted_index_entries[index_nr];
+ index_sorted->name = strdup (sortable_index_entries->index->name);
+
+ for (j = 0; j < sortable_index_entries->number; j++)
+ {
+ int len = 0;
+ SORTABLE_INDEX_ENTRY *sortable_entry
+ = &sortable_index_entries->sortable_entries[j];
+ char *sort_string
+ = sortable_entry->sortable_subentries[0].sort_string;
+ uint8_t *encoded_u8 = u8_strconv_from_encoding (sort_string, "UTF-8",
+ iconveh_question_mark);
+ uint8_t *current_u8 = encoded_u8;
+ char *letter_string;
+ char *upper_letter_string;
+ char *norm_letter_string;
+ TEXT letter_text;
+ char *letter_sort_key;
+ int letter_added = 0;
+
+ LETTER_SORTABLE_ENTRIES *letter_sortable_entries = 0;
+
+ while (1)
+ {
+ ucs4_t next_char;
+ int new_len = u8_strmbtouc (&next_char, current_u8);
+ if (!new_len)
+ break;
+
+ len += new_len;
+ /* UC_NON_SPACING_MARK is the same as UC_CATEGORY_Mn */
+ if (uc_is_general_category (next_char, UC_NON_SPACING_MARK))
+ {
+ current_u8 += new_len;
+ }
+ else
+ {
+ break;
+ }
+ }
+ free (encoded_u8);
+
+ letter_string = strndup (sort_string, len);
+ upper_letter_string = to_upper_or_lower_multibyte (letter_string, 1);
+ free (letter_string);
+ norm_letter_string = normalize_NFKD (upper_letter_string);
+ free (upper_letter_string);
+ encoded_u8 = u8_strconv_from_encoding (norm_letter_string, "UTF-8",
+ iconveh_question_mark);
+ current_u8 = encoded_u8;
+
+ text_init (&letter_text);
+ /* we want an empty string if the input string is empty */
+ text_append (&letter_text, "");
+
+ while (1)
+ {
+ ucs4_t next_char;
+ int new_len = u8_strmbtouc (&next_char, current_u8);
+ if (!new_len)
+ break;
+
+ if (!uc_is_general_category (next_char, UC_NON_SPACING_MARK))
+ {
+ char *first_char_text;
+ uint8_t *first_char_u8 = malloc (7 * sizeof(uint8_t));
+ int first_char_len = u8_uctomb (first_char_u8, next_char, 6);
+ if (first_char_len < 0)
+ fatal ("u8_uctomb returns negative value");
+ first_char_u8[first_char_len] = 0;
+ first_char_text = u8_strconv_to_encoding (first_char_u8,
+ "UTF-8", iconveh_question_mark);
+ free (first_char_u8);
+ text_append (&letter_text, first_char_text);
+ free (first_char_text);
+ }
+ current_u8 += new_len;
+ }
+ free (encoded_u8);
+
+ set_sort_key (collation_locale, letter_text.text,
+ &letter_sort_key);
+
+ if (index_letters_sortable_entries.letter_number > 0)
+ {
+ /* used to find an existing letter */
+ static LETTER_SORTABLE_ENTRIES compared_letter_sortable;
+
+ compared_letter_sortable.letter_sort_key = letter_sort_key;
+
+ letter_sortable_entries = (LETTER_SORTABLE_ENTRIES *)
+ bsearch (&compared_letter_sortable,
+ index_letters_sortable_entries.letter_entries,
+ index_letters_sortable_entries.letter_number,
+ sizeof (LETTER_SORTABLE_ENTRIES),
+ compare_index_letter);
+ }
+
+ if (!letter_sortable_entries)
+ {
+ if (index_letters_sortable_entries.letter_number
+ >= index_letters_sortable_entries.space)
+ {
+ index_letters_sortable_entries.letter_entries
+ = realloc (index_letters_sortable_entries.letter_entries,
+ (index_letters_sortable_entries.space += 5)
+ * sizeof (LETTER_SORTABLE_ENTRIES));
+ if (!index_letters_sortable_entries.letter_entries)
+ fatal ("realloc failed");
+ }
+ letter_sortable_entries
+ = &index_letters_sortable_entries.letter_entries[
+ index_letters_sortable_entries.letter_number];
+
+ letter_sortable_entries->space = 0;
+ letter_sortable_entries->number = 0;
+ letter_sortable_entries->sortable_entries = 0;
+ letter_sortable_entries->letter = letter_text.text;
+ letter_sortable_entries->letter_sort_key = letter_sort_key;
+
+ index_letters_sortable_entries.letter_number++;
+
+ letter_added = 1;
+ }
+ else
+ {
+ free (letter_text.text);
+ free (letter_sort_key);
+ }
+
+ /* now add the sortable entry to the letter sortable entries */
+ if (letter_sortable_entries->number
+ >= letter_sortable_entries->space)
+ {
+ letter_sortable_entries->sortable_entries
+ = realloc (letter_sortable_entries->sortable_entries,
+ (letter_sortable_entries->space += 5)
+ * sizeof (SORTABLE_INDEX_ENTRY));
+ if (!letter_sortable_entries->sortable_entries)
+ fatal ("realloc failed");
+ }
+ letter_sortable_entries->sortable_entries[
+ letter_sortable_entries->number] = sortable_entry;
+
+ letter_sortable_entries->number++;
+ /* sort after using letter_sortable_entries, as its address may be
+ changed by the sort */
+ if (index_letters_sortable_entries.letter_number > 1
+ && letter_added)
+ {
+ /* re-sort letters with the new letter added */
+ if (index_letters_sortable_entries.letter_number > 1)
+ qsort (index_letters_sortable_entries.letter_entries,
+ index_letters_sortable_entries.letter_number,
+ sizeof (LETTER_SORTABLE_ENTRIES),
+ compare_index_letter);
+ }
+ }
+
+ /* sort the index entries by letter */
+ /* also reset for the next index */
+
+ if (index_letters_sortable_entries.letter_number > 0)
+ {
+ index_sorted->letter_number
+ = index_letters_sortable_entries.letter_number;
+ index_sorted->letter_entries = (LETTER_INDEX_ENTRIES *) malloc
+ (index_sorted->letter_number * sizeof (LETTER_INDEX_ENTRIES));
+
+ for (j = 0; j < index_letters_sortable_entries.letter_number; j++)
+ {
+ size_t k;
+ LETTER_SORTABLE_ENTRIES *letter_sortable_entries
+ = &index_letters_sortable_entries.letter_entries[j];
+ LETTER_INDEX_ENTRIES *letter_index_entries
+ = &index_sorted->letter_entries[j];
+
+ qsort (letter_sortable_entries->sortable_entries,
+ letter_sortable_entries->number,
+ sizeof (SORTABLE_INDEX_ENTRY *),
+ compare_sortable_index_entry);
+
+ letter_index_entries->letter = letter_sortable_entries->letter;
+ letter_index_entries->entries = (INDEX_ENTRY **) malloc
+ (letter_sortable_entries->number * sizeof (INDEX_ENTRY *));
+ letter_index_entries->entries_number
+ = letter_sortable_entries->number;
+
+ /* set index entries in the sorted order */
+ for (k = 0; k < letter_sortable_entries->number; k++)
+ {
+ letter_index_entries->entries[k]
+ = letter_sortable_entries->sortable_entries[k]->entry;
+ }
+
+ /* TODO should we reuse the letters instead? */
+ free (letter_sortable_entries->letter_sort_key);
+ free (letter_sortable_entries->sortable_entries);
+ }
+ /* TODO should we reuse the letters instead? */
+ index_letters_sortable_entries.letter_number = 0;
+ }
+ index_nr++;
+ }
+
+ memset (&sorted_index_entries[index_nr], 0, sizeof (INDEX_SORTED_BY_LETTER));
+ if (index_nr < indices_sortable_entries->number)
+ sorted_index_entries = realloc (sorted_index_entries,
+ (index_nr+1) * sizeof (INDEX_SORTED_BY_LETTER));
+
+ return sorted_index_entries;
+}
diff --git a/tp/Texinfo/XS/convert/indices_in_conversion.h
b/tp/Texinfo/XS/convert/indices_in_conversion.h
index 71787d165d..f69142c7cf 100644
--- a/tp/Texinfo/XS/convert/indices_in_conversion.h
+++ b/tp/Texinfo/XS/convert/indices_in_conversion.h
@@ -5,6 +5,32 @@
#include "tree_types.h"
#include "convert_to_text.h"
+typedef struct SORTABLE_INDEX_SUBENTRY {
+ char *sort_string;
+ char *sort_key;
+ int alpha;
+} SORTABLE_INDEX_SUBENTRY;
+
+typedef struct SORTABLE_INDEX_ENTRY {
+ INDEX_ENTRY *entry;
+ /* in perl 'index_name' => $index_entry->{'index_name'} */
+ /* in perl 'number' => $index_entry->{'entry_number'} */
+ size_t subentries_number;
+ /* both entry_keys and keys in perl */
+ SORTABLE_INDEX_SUBENTRY *sortable_subentries;
+} SORTABLE_INDEX_ENTRY;
+
+typedef struct INDEX_SORTABLE_ENTRIES {
+ MERGED_INDEX *index;
+ size_t number;
+ SORTABLE_INDEX_ENTRY *sortable_entries;
+} INDEX_SORTABLE_ENTRIES;
+
+typedef struct INDICES_SORTABLE_ENTRIES {
+ size_t number;
+ INDEX_SORTABLE_ENTRIES *indices;
+} INDICES_SORTABLE_ENTRIES;
+
MERGED_INDEX *merge_indices (INDEX **index_names);
void destroy_merged_indices (MERGED_INDEX *merged_indices);
@@ -18,4 +44,9 @@ char *index_entry_element_sort_string (INDEX_ENTRY
*main_entry,
ELEMENT *index_entry_element,
TEXT_OPTIONS *options, int in_code,
int prefer_reference_element);
+
+INDEX_SORTED_BY_LETTER *sort_indices_by_letter (
+ ERROR_MESSAGE_LIST *error_messages,
+ OPTIONS *options, MERGED_INDEX *index_entries,
+ INDEX **indices_information);
#endif
diff --git a/tp/Texinfo/XS/main/tree_types.h b/tp/Texinfo/XS/main/tree_types.h
index 9cef300923..fb6a56fb4c 100644
--- a/tp/Texinfo/XS/main/tree_types.h
+++ b/tp/Texinfo/XS/main/tree_types.h
@@ -237,8 +237,10 @@ typedef struct IGNORED_CHARS {
int atsign;
} IGNORED_CHARS;
-typedef struct {
+typedef struct INDEX_ENTRY {
char *index_name; /* kept with the entry as the indices may be merged */
+ int number; /* position in the original index. May be different in
+ merged index */
ELEMENT *entry_element;
ELEMENT *entry_associated_element; /* set if the entry is reassociated to
another element */
diff --git a/tp/Texinfo/XS/main/utils.c b/tp/Texinfo/XS/main/utils.c
index 39e6825788..70813e2543 100644
--- a/tp/Texinfo/XS/main/utils.c
+++ b/tp/Texinfo/XS/main/utils.c
@@ -262,6 +262,10 @@ word_bytes_len_multibyte (const char *text)
{
break;
}
+ /* Nd: Number, decimal digit
+ M: Mark
+ Pc: Punctuation, connector
+ */
/* (\p{Alnum} = \p{Alphabetic} + \p{Nd}) + \pM + \p{Pc}
+ \p{Join_Control} */
if (uc_is_general_category (next_char, UC_CATEGORY_M)
diff --git a/tp/Texinfo/XS/parsetexi/indices.c
b/tp/Texinfo/XS/parsetexi/indices.c
index 25319fcf06..49535401ff 100644
--- a/tp/Texinfo/XS/parsetexi/indices.c
+++ b/tp/Texinfo/XS/parsetexi/indices.c
@@ -264,9 +264,9 @@ enter_index_entry (enum command_id index_type_cmd,
memset (entry, 0, sizeof (INDEX_ENTRY));
entry->index_name = idx->name;
- /* not needed, the position in the index is directly used
+ /* not needed in the parser, the position in the index is directly used.
+ Used for sorting */
entry->number = idx->entries_number;
- */
entry->entry_element = element;
/* Create ignored_chars string. */
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: Index sorting in C,
Patrice Dumas <=