texinfo-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

branch master updated: Index sorting in C


From: Patrice Dumas
Subject: branch master updated: Index sorting in C
Date: Tue, 30 Jan 2024 16:53:43 -0500

This is an automated email from the git hooks/post-receive script.

pertusus pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new a7fb50d574 Index sorting in C
a7fb50d574 is described below

commit a7fb50d574c034bd0b8c23520ccd917c5b23fae5
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Tue Jan 30 22:46:28 2024 +0100

    Index sorting in C
    
    * tp/Texinfo/Indices.pm (setup_sortable_index_entries): rename
    variables.  Simplify code that determine that there is one non empty
    entry/subentry.
    
    * tp/Texinfo/XS/parsetexi/indices.c (enter_index_entry),
    tp/Texinfo/XS/main/tree_types.h (INDEX_ENTRY): add index
    entry number in index entry structure.
    
    * tp/Texinfo/XS/convert/indices_in_conversion.c (set_sort_key)
    (INDEX_SORT_STRING_KEY, index_entry_element_sort_string_key)
    (setup_sortable_index_entries, LETTER_SORTABLE_ENTRIES)
    (INDEX_LETTERS_SORTABLE_ENTRIES, compare_index_letter)
    (compare_sortable_subentry_keys, compare_sortable_index_entry)
    (sort_indices_by_letter),
    tp/Texinfo/XS/convert/indices_in_conversion.h
    (SORTABLE_INDEX_SUBENTRY, SORTABLE_INDEX_ENTRY)
    (INDEX_SORTABLE_ENTRIES, INDICES_SORTABLE_ENTRIES): implement
    index sorting by letter in C.
    * tp/Texinfo/XS/configure.ac: check for newlocale and strxfrm_l.
    
    * tp/Texinfo/Convert/HTML.pm (%XS_conversion_overrides)
    (_NonXS_sort_index_entries, _XS_only_sort_index_entries)
    (_sort_index_entries), tp/Texinfo/XS/convert/ConvertXS.xs
    (html_sort_index_entries), tp/Texinfo/XS/convert/convert_html.c
    (html_sort_index_entries): XS interface for index sorting in HTML,
    if TEST is not set.
---
 ChangeLog                                     |  31 ++
 tp/Texinfo/Convert/HTML.pm                    |  22 +-
 tp/Texinfo/Indices.pm                         |  65 +--
 tp/Texinfo/XS/configure.ac                    |   2 +
 tp/Texinfo/XS/convert/ConvertXS.xs            |  10 +
 tp/Texinfo/XS/convert/convert_html.c          |  14 +
 tp/Texinfo/XS/convert/convert_html.h          |   1 +
 tp/Texinfo/XS/convert/indices_in_conversion.c | 570 ++++++++++++++++++++++++++
 tp/Texinfo/XS/convert/indices_in_conversion.h |  31 ++
 tp/Texinfo/XS/main/tree_types.h               |   4 +-
 tp/Texinfo/XS/main/utils.c                    |   4 +
 tp/Texinfo/XS/parsetexi/indices.c             |   4 +-
 12 files changed, 722 insertions(+), 36 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 8e893ef4b6..da4abe7e33 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,34 @@
+2024-01-30  Patrice Dumas  <pertusus@free.fr>
+
+       Index sorting in C
+
+       * tp/Texinfo/Indices.pm (setup_sortable_index_entries): rename
+       variables.  Simplify code that determine that there is one non empty
+       entry/subentry.
+
+       * tp/Texinfo/XS/parsetexi/indices.c (enter_index_entry),
+       tp/Texinfo/XS/main/tree_types.h (INDEX_ENTRY): add index
+       entry number in index entry structure.
+
+       * tp/Texinfo/XS/convert/indices_in_conversion.c (set_sort_key)
+       (INDEX_SORT_STRING_KEY, index_entry_element_sort_string_key)
+       (setup_sortable_index_entries, LETTER_SORTABLE_ENTRIES)
+       (INDEX_LETTERS_SORTABLE_ENTRIES, compare_index_letter)
+       (compare_sortable_subentry_keys, compare_sortable_index_entry)
+       (sort_indices_by_letter),
+       tp/Texinfo/XS/convert/indices_in_conversion.h
+       (SORTABLE_INDEX_SUBENTRY, SORTABLE_INDEX_ENTRY)
+       (INDEX_SORTABLE_ENTRIES, INDICES_SORTABLE_ENTRIES): implement
+       index sorting by letter in C.
+       * tp/Texinfo/XS/configure.ac: check for newlocale and strxfrm_l.
+
+       * tp/Texinfo/Convert/HTML.pm (%XS_conversion_overrides)
+       (_NonXS_sort_index_entries, _XS_only_sort_index_entries)
+       (_sort_index_entries), tp/Texinfo/XS/convert/ConvertXS.xs
+       (html_sort_index_entries), tp/Texinfo/XS/convert/convert_html.c
+       (html_sort_index_entries): XS interface for index sorting in HTML,
+       if TEST is not set.
+
 2024-01-29  Gavin Smith <gavinsmith0123@gmail.com>
 
        * doc/texinfo.texi (Other Customization Variables),
diff --git a/tp/Texinfo/Convert/HTML.pm b/tp/Texinfo/Convert/HTML.pm
index 5688bf9c25..ccbc2473ad 100644
--- a/tp/Texinfo/Convert/HTML.pm
+++ b/tp/Texinfo/Convert/HTML.pm
@@ -267,6 +267,8 @@ my %XS_conversion_overrides = (
    => "Texinfo::Convert::ConvertXS::get_index_entries_sorted_by_letter",
   "Texinfo::Convert::HTML::_XS_html_merge_index_entries"
    => "Texinfo::Convert::ConvertXS::html_merge_index_entries",
+  "Texinfo::Convert::HTML::_XS_only_sort_index_entries"
+   => "Texinfo::Convert::ConvertXS::html_sort_index_entries",
   "Texinfo::Convert::HTML::_prepare_conversion_units"
    => "Texinfo::Convert::ConvertXS::html_prepare_conversion_units",
   "Texinfo::Convert::HTML::_prepare_units_directions_files"
@@ -10281,7 +10283,7 @@ sub _XS_html_merge_index_entries($)
 {
 }
 
-sub _sort_index_entries($)
+sub _NonXS_sort_index_entries($)
 {
   my $self = shift;
 
@@ -10311,6 +10313,24 @@ sub _sort_index_entries($)
   }
 }
 
+sub _XS_only_sort_index_entries($)
+{
+  my $self = shift;
+  _NonXS_sort_index_entries($self);
+}
+
+sub _sort_index_entries($)
+{
+  my $self = shift;
+
+  # Sorting in Perl to have a reproducible output for tests
+  if ($self->get_conf('TEST')) {
+    _NonXS_sort_index_entries($self);
+  } else {
+    _XS_only_sort_index_entries($self);
+  }
+}
+
 sub _prepare_index_entries_targets($)
 {
   my $self = shift;
diff --git a/tp/Texinfo/Indices.pm b/tp/Texinfo/Indices.pm
index b9322a1399..c1be210186 100644
--- a/tp/Texinfo/Indices.pm
+++ b/tp/Texinfo/Indices.pm
@@ -171,7 +171,7 @@ sub _collator_sort_index_entries($$$)
   my $collator = shift;
 
   my $key_index = 0;
-  # the keys array corresponds to th emain entry and subentries
+  # the keys array corresponds to the main entry and subentries
   foreach my $key1_str (@{$key1->{'keys'}}) {
     my $res = _collator_sort_key($key1_str,
                                     $key2->{'keys'}->[$key_index],
@@ -422,13 +422,14 @@ sub setup_sortable_index_entries($$$$$)
       if ($in_code) {
         Texinfo::Convert::Text::set_options_code($convert_text_options);
       }
-      my ($entry_key, $sort_entry_key)
+      my ($entry_sort_string, $entry_sort_key)
         = _index_entry_element_sort_string_key($customization_information,
                                    $index_entry, $main_entry_element,
                                    $convert_text_options, $entries_collator);
-      my @entry_keys;
-      my @sort_entry_keys;
-      if ($entry_key !~ /\S/) {
+      my $non_empty_index_subentries = 0;
+      my @entry_sort_strings;
+      my @entry_sort_keys;
+      if ($entry_sort_string !~ /\S/) {
         my $entry_cmdname = $main_entry_element->{'cmdname'};
         $entry_cmdname
           = $main_entry_element->{'extra'}->{'original_def_cmdname'}
@@ -438,22 +439,23 @@ sub setup_sortable_index_entries($$$$$)
                        sprintf(__("empty index key in \@%s"),
                                   $entry_cmdname),
                                $main_entry_element->{'source_info'});
-        push @entry_keys, '';
-        push @sort_entry_keys, '';
+        push @entry_sort_strings, '';
+        push @entry_sort_keys, '';
       } else {
-        push @entry_keys, $entry_key;
-        push @sort_entry_keys, $sort_entry_key;
+        push @entry_sort_strings, $entry_sort_string;
+        push @entry_sort_keys, $entry_sort_key;
+        $non_empty_index_subentries++;
       }
       my $subentry_nr = 0;
       my $subentry = $main_entry_element;
       while ($subentry->{'extra'} and $subentry->{'extra'}->{'subentry'}) {
         $subentry_nr ++;
         $subentry = $subentry->{'extra'}->{'subentry'};
-        my ($subentry_key, $sort_subentry_key)
+        my ($subentry_sort_string, $sort_subentry_key)
               = 
_index_entry_element_sort_string_key($customization_information,
                              $index_entry, $subentry, $convert_text_options,
                                 $entries_collator);
-        if ($subentry_key !~ /\S/) {
+        if ($subentry_sort_string !~ /\S/) {
           my $entry_cmdname = $main_entry_element->{'cmdname'};
           $entry_cmdname
             = $main_entry_element->{'extra'}->{'original_def_cmdname'}
@@ -463,36 +465,35 @@ sub setup_sortable_index_entries($$$$$)
                          sprintf(__("empty index sub entry %d key in \@%s"),
                                     $subentry_nr, $entry_cmdname),
                                   $main_entry_element->{'source_info'});
-          push @entry_keys, '';
-          push @sort_entry_keys, '';
+          push @entry_sort_strings, '';
+          push @entry_sort_keys, '';
         } else {
-          push @entry_keys, $subentry_key;
-          push @sort_entry_keys, $sort_subentry_key;
+          push @entry_sort_strings, $subentry_sort_string;
+          push @entry_sort_keys, $sort_subentry_key;
+          $non_empty_index_subentries++;
         }
       }
-      foreach my $sub_entry_key (@sort_entry_keys) {
-        if ($sub_entry_key ne '') {
-          my @keys_and_alpha;
-          for (my $i = 0; $i < scalar (@entry_keys); $i++) {
-            my $alpha = 0;
-            if ($entry_keys[$i] =~ /^[[:alpha:]]/) {
-              $alpha = 1;
-            }
-            push @keys_and_alpha, [$sort_entry_keys[$i], $alpha];
+      if ($non_empty_index_subentries > 0) {
+        my @keys_and_alpha;
+        for (my $i = 0; $i < scalar (@entry_sort_strings); $i++) {
+          my $alpha = 0;
+          if ($entry_sort_strings[$i] =~ /^[[:alpha:]]/) {
+            $alpha = 1;
           }
-          my $sortable_entry = {'entry' => $index_entry,
-                                'keys' => \@keys_and_alpha,
-                                'entry_keys' => \@entry_keys,
-                                'number' => $index_entry->{'entry_number'},
-                                'index_name' => $entry_index_name};
-          push @{$sortable_index_entries}, $sortable_entry;
-          last;
+          push @keys_and_alpha, [$entry_sort_keys[$i], $alpha];
         }
+        my $sortable_entry = {'entry' => $index_entry,
+                              'keys' => \@keys_and_alpha,
+                              'entry_keys' => \@entry_sort_strings,
+                              'number' => $index_entry->{'entry_number'},
+                              'index_name' => $entry_index_name};
+        push @{$sortable_index_entries}, $sortable_entry;
       }
       if ($in_code) {
         Texinfo::Convert::Text::reset_options_code($convert_text_options);
       }
-      $index_entries_sort_strings->{$index_entry} = join(', ', @entry_keys);
+      $index_entries_sort_strings->{$index_entry}
+         = join(', ', @entry_sort_strings);
     }
     $index_sortable_index_entries->{$index_name} = $sortable_index_entries;
   }
diff --git a/tp/Texinfo/XS/configure.ac b/tp/Texinfo/XS/configure.ac
index 9e271fda20..495d13a905 100644
--- a/tp/Texinfo/XS/configure.ac
+++ b/tp/Texinfo/XS/configure.ac
@@ -175,6 +175,8 @@ esac
 AC_SUBST([perl_conf_CFLAGS], [$perl_conf_CFLAGS])
 AC_SUBST([perl_conf_LDFLAGS], [$perl_conf_LDFLAGS])
 
+AC_CHECK_FUNCS(newlocale strxfrm_l)
+
 # Output these with the _CONFIG suffix as we use the originals as names
 # of customization variables.
 AC_DEFINE_UNQUOTED([PACKAGE_CONFIG], ["$PACKAGE"],
diff --git a/tp/Texinfo/XS/convert/ConvertXS.xs 
b/tp/Texinfo/XS/convert/ConvertXS.xs
index f3e152e685..750ed03878 100644
--- a/tp/Texinfo/XS/convert/ConvertXS.xs
+++ b/tp/Texinfo/XS/convert/ConvertXS.xs
@@ -1699,6 +1699,16 @@ html_merge_index_entries (SV *converter_in)
          if (self)
            html_merge_index_entries (self);
 
+void
+html_sort_index_entries (SV *converter_in)
+      PREINIT:
+         CONVERTER *self;
+     CODE:
+         self = get_sv_converter (converter_in,
+                                  "html_sort_index_entries");
+         if (self)
+           html_sort_index_entries (self);
+
 void
 reset_output_init_conf (SV *sv_in)
 
diff --git a/tp/Texinfo/XS/convert/convert_html.c 
b/tp/Texinfo/XS/convert/convert_html.c
index 60e29808b1..f9baefb3ee 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -4680,6 +4680,20 @@ html_merge_index_entries (CONVERTER *self)
     }
 }
 
+void
+html_sort_index_entries (CONVERTER *self)
+{
+  html_merge_index_entries (self);
+
+  if (self->document->index_names)
+    {
+      self->index_entries_by_letter
+        = sort_indices_by_letter (&self->error_messages, self->conf,
+                                  self->index_entries,
+                                  self->document->index_names);
+    }
+}
+
 int
 compare_index_name (const void *a, const void *b)
 {
diff --git a/tp/Texinfo/XS/convert/convert_html.h 
b/tp/Texinfo/XS/convert/convert_html.h
index 206acdd78b..663510c363 100644
--- a/tp/Texinfo/XS/convert/convert_html.h
+++ b/tp/Texinfo/XS/convert/convert_html.h
@@ -169,6 +169,7 @@ size_t html_check_htmlxref_already_warned (CONVERTER *self,
                                            const SOURCE_INFO *source_info);
 
 void html_merge_index_entries (CONVERTER *self);
+void html_sort_index_entries (CONVERTER *self);
 
 void html_prepare_conversion_units (CONVERTER *self,
                                     int *output_units_descriptor_ref,
diff --git a/tp/Texinfo/XS/convert/indices_in_conversion.c 
b/tp/Texinfo/XS/convert/indices_in_conversion.c
index 4a451d2f63..271d93daca 100644
--- a/tp/Texinfo/XS/convert/indices_in_conversion.c
+++ b/tp/Texinfo/XS/convert/indices_in_conversion.c
@@ -17,6 +17,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <locale.h>
 #include "uniconv.h"
 #include "unictype.h"
 #include "unistr.h"
@@ -25,8 +26,11 @@
 #include "converter_types.h"
 #include "utils.h"
 #include "extra.h"
+#include "errors.h"
+#include "debug.h"
 #include "unicode.h"
 #include "convert_to_text.h"
+#include "convert_to_texinfo.h"
 #include "indices_in_conversion.h"
 
 /* corresponding perl code in Texinfo::Indices */
@@ -222,3 +226,569 @@ index_entry_element_sort_string (INDEX_ENTRY *main_entry,
     }
   return sort_string;
 }
+
+static void
+set_sort_key (locale_t collation_locale, const char *input_string,
+              char **result_key)
+{
+  if (collation_locale)
+    {
+  #ifdef HAVE_STRXFRM_L
+      size_t len = strxfrm_l (0, input_string, 0, collation_locale);
+      size_t check_len;
+
+      *result_key
+        = (char *) malloc ((len +1) * sizeof (char));
+      check_len = strxfrm_l (*result_key, input_string, len+1,
+                             collation_locale);
+      if (check_len != len)
+        fatal ("strxfrm_l returns a different length");
+  #endif
+    }
+  else
+    *result_key = strdup (input_string);
+}
+
+typedef struct INDEX_SORT_STRING_KEY {
+    char *sort_string;
+    char *sort_key;
+} INDEX_SORT_STRING_KEY;
+
+static INDEX_SORT_STRING_KEY *
+index_entry_element_sort_string_key (INDEX_ENTRY *main_entry,
+                                     ELEMENT *index_entry_element,
+                                     TEXT_OPTIONS *options, int in_code,
+                                     locale_t collation_locale,
+                                     int prefer_reference_element)
+{
+  INDEX_SORT_STRING_KEY *sort_string_key = (INDEX_SORT_STRING_KEY *)
+    malloc (sizeof (INDEX_SORT_STRING_KEY));
+  sort_string_key->sort_string = index_entry_element_sort_string (main_entry,
+                                     index_entry_element, options, in_code,
+                                     prefer_reference_element);
+  set_sort_key (collation_locale, sort_string_key->sort_string,
+                &sort_string_key->sort_key);
+
+  return sort_string_key;
+}
+
+INDICES_SORTABLE_ENTRIES *
+setup_sortable_index_entries (ERROR_MESSAGE_LIST *error_messages,
+                      OPTIONS *options, MERGED_INDEX *index_entries,
+                      INDEX **indices_information, locale_t *collation_locale)
+{
+  size_t indices_nr;
+  size_t i;
+  TEXT_OPTIONS *convert_text_options
+    = setup_index_entry_keys_formatting (options);
+
+  /* similar to using Unicode::Collate in Perl */
+  *collation_locale = 0;
+
+  #ifdef HAVE_STRXFRM_L
+  #ifdef HAVE_NEWLOCALE
+  if (!options->USE_UNICODE_COLLATION.integer == 0)
+    *collation_locale = newlocale (LC_COLLATE_MASK, "en_US.utf-8", 0);
+  #endif
+  #endif
+
+  for (indices_nr = 0; index_entries[indices_nr].name; indices_nr++)
+    {}
+
+  if (indices_nr == 0)
+    return 0;
+
+  INDICES_SORTABLE_ENTRIES *indices_sortable_entries
+    = (INDICES_SORTABLE_ENTRIES *) malloc (sizeof (INDICES_SORTABLE_ENTRIES));
+
+  indices_sortable_entries->number = indices_nr;
+  indices_sortable_entries->indices = (INDEX_SORTABLE_ENTRIES *)
+    malloc (indices_nr * sizeof (INDEX_SORTABLE_ENTRIES));
+  memset (indices_sortable_entries->indices, 0,
+          indices_nr * sizeof (INDEX_SORTABLE_ENTRIES));
+
+  for (i = 0; i < indices_nr; i++)
+    {
+      MERGED_INDEX *index = &index_entries[i];
+      INDEX_SORTABLE_ENTRIES *sortable_index_entries
+        = &indices_sortable_entries->indices[i];
+      if (index->entries_number > 0)
+        {
+          size_t j;
+
+          sortable_index_entries->index = index;
+          sortable_index_entries->number = index->entries_number;
+          sortable_index_entries->sortable_entries = (SORTABLE_INDEX_ENTRY *)
+            malloc (index->entries_number * sizeof (SORTABLE_INDEX_ENTRY));
+
+          for (j = 0; j < index->entries_number; j++)
+            {
+              int non_empty_index_subentries = 0;
+              SORTABLE_INDEX_SUBENTRY *sortable_subentry;
+              INDEX_ENTRY *index_entry = &index->index_entries[j];
+
+              ELEMENT *main_entry_element = index_entry->entry_element;
+              ELEMENT *subentry = main_entry_element;
+
+              INDEX *entry_index
+                = indices_info_index_by_name (indices_information,
+                                              index_entry->index_name);
+
+              INDEX_SORT_STRING_KEY *sort_string_key
+                = index_entry_element_sort_string_key (index_entry,
+                            main_entry_element, convert_text_options,
+                            entry_index->in_code, *collation_locale, 0);
+
+              SORTABLE_INDEX_ENTRY *sortable_entry
+                = &sortable_index_entries->sortable_entries[j];
+
+              sortable_entry->entry = index_entry;
+              sortable_entry->subentries_number = 1;
+              sortable_entry->sortable_subentries = (SORTABLE_INDEX_SUBENTRY *)
+                malloc (sizeof (SORTABLE_INDEX_SUBENTRY));
+
+              sortable_subentry = &sortable_entry->sortable_subentries[0];
+
+              if (sort_string_key->sort_string[strspn
+                   (sort_string_key->sort_string, whitespace_chars)] == '\0')
+                {
+                  const char *entry_cmdname;
+
+                  sortable_subentry->sort_string = strdup("");
+                  sortable_subentry->sort_key = strdup("");
+                  free (sort_string_key->sort_string);
+                  free (sort_string_key->sort_key);
+
+                  entry_cmdname = element_command_name (main_entry_element);
+                  if (!entry_cmdname)
+                    {
+                      entry_cmdname = lookup_extra_string (main_entry_element,
+                                                      "original_def_cmdname");
+                    }
+
+                  message_list_command_warn (error_messages, options,
+                                             main_entry_element,
+                                      "empty index key in @%s", entry_cmdname);
+                }
+              else
+                {
+                  sortable_subentry->sort_string
+                     = sort_string_key->sort_string;
+                  sortable_subentry->sort_key = sort_string_key->sort_key;
+                  non_empty_index_subentries++;
+                }
+              free (sort_string_key);
+
+              while (1)
+                {
+                  ELEMENT *next_subentry = lookup_extra_element (subentry,
+                                                               "subentry");
+                  if (!next_subentry)
+                    break;
+
+                  subentry = next_subentry;
+                  sortable_entry->subentries_number++;
+
+                  sortable_entry->sortable_subentries
+                   = (SORTABLE_INDEX_SUBENTRY *)
+                    realloc (sortable_entry->sortable_subentries,
+                        sizeof (SORTABLE_INDEX_SUBENTRY)
+                           * sortable_entry->subentries_number);
+                  if (!sortable_entry->sortable_subentries)
+                    fatal ("realloc failed");
+
+                  sortable_subentry
+                   = &sortable_entry->sortable_subentries[
+                          sortable_entry->subentries_number -1];
+
+                  sort_string_key
+                    = index_entry_element_sort_string_key (index_entry,
+                            subentry, convert_text_options,
+                            entry_index->in_code, *collation_locale, 0);
+
+                  if (sort_string_key->sort_string[strspn
+                     (sort_string_key->sort_string, whitespace_chars)] == '\0')
+                    {
+                      const char *entry_cmdname;
+
+                      sortable_subentry->sort_string = strdup("");
+                      sortable_subentry->sort_key = strdup("");
+                      free (sort_string_key->sort_string);
+                      free (sort_string_key->sort_key);
+
+                      entry_cmdname = element_command_name 
(main_entry_element);
+                      if (!entry_cmdname)
+                        {
+                          entry_cmdname
+                             = lookup_extra_string (main_entry_element,
+                                                 "original_def_cmdname");
+                        }
+
+                      message_list_command_warn (error_messages, options,
+                                             main_entry_element,
+                               "empty index sub entry %zu key in @%s",
+                               sortable_entry->subentries_number -1,
+                                entry_cmdname);
+                    }
+                  else
+                    {
+                      sortable_subentry->sort_string
+                         = sort_string_key->sort_string;
+                      sortable_subentry->sort_key = sort_string_key->sort_key;
+                      non_empty_index_subentries++;
+                    }
+                  free (sort_string_key);
+                }
+              if (non_empty_index_subentries > 0)
+                {
+                  int k;
+                  for (k = 0; k < sortable_entry->subentries_number; k++)
+                    {
+                      uint8_t *encoded_u8;
+                      ucs4_t next_char;
+                      int new_len;
+
+                      sortable_subentry
+                        = &sortable_entry->sortable_subentries[k];
+             /* TODO quite inefficient, only need the first character */
+                      encoded_u8
+                       = u8_strconv_from_encoding (
+                                         sortable_subentry->sort_string,
+                                         "UTF-8", iconveh_question_mark);
+                      new_len = u8_strmbtouc (&next_char, encoded_u8);
+                      if (new_len > 0
+                          && uc_is_property (next_char, 
UC_PROPERTY_ALPHABETIC))
+                        sortable_subentry->alpha = 1;
+                      else
+                        sortable_subentry->alpha = 0;
+
+                      free (encoded_u8);
+                    }
+                }
+            }
+        }
+    }
+  return indices_sortable_entries;
+}
+
+typedef struct LETTER_SORTABLE_ENTRIES {
+    char *letter;
+    char *letter_sort_key;
+    size_t space;
+    size_t number;
+    SORTABLE_INDEX_ENTRY **sortable_entries;
+} LETTER_SORTABLE_ENTRIES;
+
+typedef struct INDEX_LETTERS_SORTABLE_ENTRIES {
+    size_t letter_number;
+    size_t space;
+    LETTER_SORTABLE_ENTRIES *letter_entries;
+} INDEX_LETTERS_SORTABLE_ENTRIES;
+
+static int
+compare_index_letter (const void *a, const void *b)
+{
+  const LETTER_SORTABLE_ENTRIES *lse_a = (const LETTER_SORTABLE_ENTRIES *) a;
+  const LETTER_SORTABLE_ENTRIES *lse_b = (const LETTER_SORTABLE_ENTRIES *) b;
+
+  return strcmp (lse_a->letter_sort_key, lse_b->letter_sort_key);
+}
+
+static int
+compare_sortable_subentry_keys (const void *a, const void *b)
+{
+  const SORTABLE_INDEX_SUBENTRY *sis_a = (const SORTABLE_INDEX_SUBENTRY *) a;
+  const SORTABLE_INDEX_SUBENTRY *sis_b = (const SORTABLE_INDEX_SUBENTRY *) b;
+
+  if (sis_a->alpha == sis_b->alpha)
+    return strcmp (sis_a->sort_key, sis_b->sort_key);
+
+  return (sis_a->alpha > sis_b->alpha) - (sis_a->alpha < sis_b->alpha);
+}
+
+static int
+compare_sortable_index_entry (const void *a, const void *b)
+{
+  const SORTABLE_INDEX_ENTRY **psie_a = (const SORTABLE_INDEX_ENTRY **) a;
+  const SORTABLE_INDEX_ENTRY **psie_b = (const SORTABLE_INDEX_ENTRY **) b;
+  const SORTABLE_INDEX_ENTRY *sie_a = *psie_a;
+  const SORTABLE_INDEX_ENTRY *sie_b = *psie_b;
+
+  size_t i;
+  int res;
+
+  /* corresponds to the main entry and subentries */
+  for (i = 0; i < sie_a->subentries_number; i++)
+    {
+      SORTABLE_INDEX_SUBENTRY *sub_key_a = &sie_a->sortable_subentries[i];
+      SORTABLE_INDEX_SUBENTRY *sub_key_b = &sie_b->sortable_subentries[i];
+
+      res = compare_sortable_subentry_keys (sub_key_a, sub_key_b);
+
+      if (res != 0)
+        return res;
+
+      if (i + 1 >= sie_b->subentries_number)
+        break;
+    }
+  res = (sie_a->subentries_number > sie_b->subentries_number)
+           - (sie_a->subentries_number < sie_b->subentries_number);
+
+  if (res != 0)
+    return res;
+
+  res = (sie_a->entry->number > sie_b->entry->number)
+           - (sie_a->entry->number < sie_b->entry->number);
+
+  if (res != 0)
+    return res;
+
+  /* This may happen if 2 indices are merged as the number is per
+     index name. */
+
+  return strcmp (sie_a->entry->index_name, sie_b->entry->index_name);
+}
+
+INDEX_SORTED_BY_LETTER *
+sort_indices_by_letter (ERROR_MESSAGE_LIST *error_messages,
+                        OPTIONS *options, MERGED_INDEX *index_entries,
+                              INDEX **indices_information)
+{
+  size_t i;
+  int index_nr = 0;
+  locale_t collation_locale = 0;
+  static INDEX_LETTERS_SORTABLE_ENTRIES index_letters_sortable_entries;
+
+  INDICES_SORTABLE_ENTRIES *indices_sortable_entries
+    = setup_sortable_index_entries (error_messages, options, index_entries,
+                                    indices_information, &collation_locale);
+
+  if (!indices_sortable_entries || indices_sortable_entries->number <= 0)
+    return 0;
+
+  INDEX_SORTED_BY_LETTER *sorted_index_entries
+   = (INDEX_SORTED_BY_LETTER *) malloc
+    ((indices_sortable_entries->number +1) * sizeof (INDEX_SORTED_BY_LETTER));
+
+  for (i = 0; i < indices_sortable_entries->number; i++)
+    {
+      size_t j;
+      INDEX_SORTABLE_ENTRIES *sortable_index_entries
+        = &indices_sortable_entries->indices[i];
+      INDEX_SORTED_BY_LETTER *index_sorted;
+
+      if (sortable_index_entries->number <= 0)
+        continue;
+
+      index_sorted = &sorted_index_entries[index_nr];
+      index_sorted->name = strdup (sortable_index_entries->index->name);
+
+      for (j = 0; j < sortable_index_entries->number; j++)
+        {
+          int len = 0;
+          SORTABLE_INDEX_ENTRY *sortable_entry
+            = &sortable_index_entries->sortable_entries[j];
+          char *sort_string
+            = sortable_entry->sortable_subentries[0].sort_string;
+          uint8_t *encoded_u8 = u8_strconv_from_encoding (sort_string, "UTF-8",
+                                                  iconveh_question_mark);
+          uint8_t *current_u8 = encoded_u8;
+          char *letter_string;
+          char *upper_letter_string;
+          char *norm_letter_string;
+          TEXT letter_text;
+          char *letter_sort_key;
+          int letter_added = 0;
+
+          LETTER_SORTABLE_ENTRIES *letter_sortable_entries = 0;
+
+          while (1)
+            {
+              ucs4_t next_char;
+              int new_len = u8_strmbtouc (&next_char, current_u8);
+              if (!new_len)
+                break;
+
+              len += new_len;
+              /* UC_NON_SPACING_MARK is the same as UC_CATEGORY_Mn */
+              if (uc_is_general_category (next_char, UC_NON_SPACING_MARK))
+                {
+                  current_u8 += new_len;
+                }
+              else
+                {
+                  break;
+                }
+            }
+          free (encoded_u8);
+
+          letter_string = strndup (sort_string, len);
+          upper_letter_string = to_upper_or_lower_multibyte (letter_string, 1);
+          free (letter_string);
+          norm_letter_string = normalize_NFKD (upper_letter_string);
+          free (upper_letter_string);
+          encoded_u8 = u8_strconv_from_encoding (norm_letter_string, "UTF-8",
+                                                  iconveh_question_mark);
+          current_u8 = encoded_u8;
+
+          text_init (&letter_text);
+          /* we want an empty string if the input string is empty */
+          text_append (&letter_text, "");
+
+          while (1)
+            {
+              ucs4_t next_char;
+              int new_len = u8_strmbtouc (&next_char, current_u8);
+              if (!new_len)
+                break;
+
+              if (!uc_is_general_category (next_char, UC_NON_SPACING_MARK))
+                {
+                  char *first_char_text;
+                  uint8_t *first_char_u8 = malloc (7 * sizeof(uint8_t));
+                  int first_char_len = u8_uctomb (first_char_u8, next_char, 6);
+                  if (first_char_len < 0)
+                    fatal ("u8_uctomb returns negative value");
+                  first_char_u8[first_char_len] = 0;
+                  first_char_text = u8_strconv_to_encoding (first_char_u8,
+                                               "UTF-8", iconveh_question_mark);
+                  free (first_char_u8);
+                  text_append (&letter_text, first_char_text);
+                  free (first_char_text);
+                }
+              current_u8 += new_len;
+            }
+          free (encoded_u8);
+
+          set_sort_key (collation_locale, letter_text.text,
+                        &letter_sort_key);
+
+          if (index_letters_sortable_entries.letter_number > 0)
+            {
+              /* used to find an existing letter */
+              static LETTER_SORTABLE_ENTRIES compared_letter_sortable;
+
+              compared_letter_sortable.letter_sort_key = letter_sort_key;
+
+              letter_sortable_entries = (LETTER_SORTABLE_ENTRIES *)
+                bsearch (&compared_letter_sortable,
+                     index_letters_sortable_entries.letter_entries,
+                     index_letters_sortable_entries.letter_number,
+                     sizeof (LETTER_SORTABLE_ENTRIES),
+                     compare_index_letter);
+            }
+
+          if (!letter_sortable_entries)
+            {
+              if (index_letters_sortable_entries.letter_number
+                   >= index_letters_sortable_entries.space)
+                {
+                  index_letters_sortable_entries.letter_entries
+                    = realloc (index_letters_sortable_entries.letter_entries,
+                          (index_letters_sortable_entries.space += 5)
+                            * sizeof (LETTER_SORTABLE_ENTRIES));
+                  if (!index_letters_sortable_entries.letter_entries)
+                    fatal ("realloc failed");
+                }
+              letter_sortable_entries
+                = &index_letters_sortable_entries.letter_entries[
+                              index_letters_sortable_entries.letter_number];
+
+              letter_sortable_entries->space = 0;
+              letter_sortable_entries->number = 0;
+              letter_sortable_entries->sortable_entries = 0;
+              letter_sortable_entries->letter = letter_text.text;
+              letter_sortable_entries->letter_sort_key = letter_sort_key;
+
+              index_letters_sortable_entries.letter_number++;
+
+              letter_added = 1;
+            }
+          else
+            {
+              free (letter_text.text);
+              free (letter_sort_key);
+            }
+
+          /* now add the sortable entry to the letter sortable entries */
+          if (letter_sortable_entries->number
+                  >= letter_sortable_entries->space)
+            {
+              letter_sortable_entries->sortable_entries
+               = realloc (letter_sortable_entries->sortable_entries,
+                   (letter_sortable_entries->space += 5)
+                            * sizeof (SORTABLE_INDEX_ENTRY));
+                  if (!letter_sortable_entries->sortable_entries)
+                    fatal ("realloc failed");
+            }
+          letter_sortable_entries->sortable_entries[
+                        letter_sortable_entries->number] = sortable_entry;
+
+          letter_sortable_entries->number++;
+          /* sort after using letter_sortable_entries, as its address may be
+             changed by the sort */
+          if (index_letters_sortable_entries.letter_number > 1
+              && letter_added)
+            {
+              /* re-sort letters with the new letter added */
+              if (index_letters_sortable_entries.letter_number > 1)
+                qsort (index_letters_sortable_entries.letter_entries,
+                       index_letters_sortable_entries.letter_number,
+                       sizeof (LETTER_SORTABLE_ENTRIES),
+                       compare_index_letter);
+            }
+        }
+
+      /* sort the index entries by letter */
+      /* also reset for the next index */
+
+      if (index_letters_sortable_entries.letter_number > 0)
+        {
+          index_sorted->letter_number
+            = index_letters_sortable_entries.letter_number;
+          index_sorted->letter_entries = (LETTER_INDEX_ENTRIES *) malloc
+            (index_sorted->letter_number * sizeof (LETTER_INDEX_ENTRIES));
+
+          for (j = 0; j < index_letters_sortable_entries.letter_number; j++)
+            {
+              size_t k;
+              LETTER_SORTABLE_ENTRIES *letter_sortable_entries
+               = &index_letters_sortable_entries.letter_entries[j];
+              LETTER_INDEX_ENTRIES *letter_index_entries
+               = &index_sorted->letter_entries[j];
+
+              qsort (letter_sortable_entries->sortable_entries,
+                   letter_sortable_entries->number,
+                   sizeof (SORTABLE_INDEX_ENTRY *),
+                   compare_sortable_index_entry);
+
+              letter_index_entries->letter = letter_sortable_entries->letter;
+              letter_index_entries->entries = (INDEX_ENTRY **) malloc
+                (letter_sortable_entries->number * sizeof (INDEX_ENTRY *));
+              letter_index_entries->entries_number
+                   = letter_sortable_entries->number;
+
+              /* set index entries in the sorted order */
+              for (k = 0; k < letter_sortable_entries->number; k++)
+                {
+                  letter_index_entries->entries[k]
+                    = letter_sortable_entries->sortable_entries[k]->entry;
+                }
+
+              /* TODO should we reuse the letters instead? */
+              free (letter_sortable_entries->letter_sort_key);
+              free (letter_sortable_entries->sortable_entries);
+            }
+          /* TODO should we reuse the letters instead? */
+          index_letters_sortable_entries.letter_number = 0;
+        }
+      index_nr++;
+    }
+
+  memset (&sorted_index_entries[index_nr], 0, sizeof (INDEX_SORTED_BY_LETTER));
+  if (index_nr < indices_sortable_entries->number)
+    sorted_index_entries = realloc (sorted_index_entries,
+                     (index_nr+1) * sizeof (INDEX_SORTED_BY_LETTER));
+
+  return sorted_index_entries;
+}
diff --git a/tp/Texinfo/XS/convert/indices_in_conversion.h 
b/tp/Texinfo/XS/convert/indices_in_conversion.h
index 71787d165d..f69142c7cf 100644
--- a/tp/Texinfo/XS/convert/indices_in_conversion.h
+++ b/tp/Texinfo/XS/convert/indices_in_conversion.h
@@ -5,6 +5,32 @@
 #include "tree_types.h"
 #include "convert_to_text.h"
 
+typedef struct SORTABLE_INDEX_SUBENTRY {
+    char *sort_string;
+    char *sort_key;
+    int alpha;
+} SORTABLE_INDEX_SUBENTRY;
+
+typedef struct SORTABLE_INDEX_ENTRY {
+    INDEX_ENTRY *entry;
+    /* in perl 'index_name' => $index_entry->{'index_name'} */
+    /* in perl 'number' => $index_entry->{'entry_number'} */
+    size_t subentries_number;
+    /* both entry_keys and keys in perl */
+    SORTABLE_INDEX_SUBENTRY *sortable_subentries;
+} SORTABLE_INDEX_ENTRY;
+
+typedef struct INDEX_SORTABLE_ENTRIES {
+    MERGED_INDEX *index;
+    size_t number;
+    SORTABLE_INDEX_ENTRY *sortable_entries;
+} INDEX_SORTABLE_ENTRIES;
+
+typedef struct INDICES_SORTABLE_ENTRIES {
+    size_t number;
+    INDEX_SORTABLE_ENTRIES *indices;
+} INDICES_SORTABLE_ENTRIES;
+
 MERGED_INDEX *merge_indices (INDEX **index_names);
 void destroy_merged_indices (MERGED_INDEX *merged_indices);
 
@@ -18,4 +44,9 @@ char *index_entry_element_sort_string (INDEX_ENTRY 
*main_entry,
                                  ELEMENT *index_entry_element,
                                  TEXT_OPTIONS *options, int in_code,
                                  int prefer_reference_element);
+
+INDEX_SORTED_BY_LETTER *sort_indices_by_letter (
+                        ERROR_MESSAGE_LIST *error_messages,
+                        OPTIONS *options, MERGED_INDEX *index_entries,
+                              INDEX **indices_information);
 #endif
diff --git a/tp/Texinfo/XS/main/tree_types.h b/tp/Texinfo/XS/main/tree_types.h
index 9cef300923..fb6a56fb4c 100644
--- a/tp/Texinfo/XS/main/tree_types.h
+++ b/tp/Texinfo/XS/main/tree_types.h
@@ -237,8 +237,10 @@ typedef struct IGNORED_CHARS {
     int atsign;
 } IGNORED_CHARS;
 
-typedef struct {
+typedef struct INDEX_ENTRY {
     char *index_name; /* kept with the entry as the indices may be merged */
+    int number; /* position in the original index.  May be different in
+                   merged index */
     ELEMENT *entry_element;
     ELEMENT *entry_associated_element; /* set if the entry is reassociated to
                                           another element */
diff --git a/tp/Texinfo/XS/main/utils.c b/tp/Texinfo/XS/main/utils.c
index 39e6825788..70813e2543 100644
--- a/tp/Texinfo/XS/main/utils.c
+++ b/tp/Texinfo/XS/main/utils.c
@@ -262,6 +262,10 @@ word_bytes_len_multibyte (const char *text)
         {
           break;
         }
+      /* Nd: Number, decimal digit
+         M: Mark
+         Pc: Punctuation, connector
+       */
       /* (\p{Alnum} = \p{Alphabetic} + \p{Nd}) + \pM + \p{Pc}
                                               + \p{Join_Control} */
       if (uc_is_general_category (next_char, UC_CATEGORY_M)
diff --git a/tp/Texinfo/XS/parsetexi/indices.c 
b/tp/Texinfo/XS/parsetexi/indices.c
index 25319fcf06..49535401ff 100644
--- a/tp/Texinfo/XS/parsetexi/indices.c
+++ b/tp/Texinfo/XS/parsetexi/indices.c
@@ -264,9 +264,9 @@ enter_index_entry (enum command_id index_type_cmd,
   memset (entry, 0, sizeof (INDEX_ENTRY));
 
   entry->index_name = idx->name;
-  /* not needed, the position in the index is directly used
+  /* not needed in the parser, the position in the index is directly used.
+     Used for sorting */
   entry->number = idx->entries_number;
-  */
   entry->entry_element = element;
 
   /* Create ignored_chars string. */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]