texinfo-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Patrice Dumas
Date: Thu, 16 May 2024 16:59:04 -0400 (EDT)

branch: master
commit 1f70318042707958c2f7fdc036f1422795201bfe
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Thu May 16 21:54:18 2024 +0200

    * tp/Texinfo/ParserNonXS.pm (%parser_state_initialization)
    (%parser_inner_configuration, %parser_state_configuration)
    (parser, _initialize_parsing, parse_texi_piece, parse_texi_text)
    (get_parser_info, _input_push_text, parse_texi_file, parse_texi_line)
    (_setup_conf):
    put input, some elements of global_info, definfoenclose,
    source_mark_counters, nesting_context, context_stack and
    context_command_stack in %parser_state_initialization.  Classify the
    parsing state keys.  Distinguish parser configuration initialization
    and parser parsing state initialization.  Add _initialize_parsing for
    parser parsing initialization.  Call it in parse_texi_* instead of in
    parser().  Split %parser_inner_configuration out of
    %parser_state_configuration.  Initialize $parser->{'conf'} in
    _setup_conf.
---
 ChangeLog                 |  17 +++
 tp/TODO                   |   8 +-
 tp/Texinfo/ParserNonXS.pm | 274 +++++++++++++++++++++++++---------------------
 3 files changed, 172 insertions(+), 127 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4bf255ef70..b3d2568122 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2024-05-16  Patrice Dumas  <pertusus@free.fr>
+
+       * tp/Texinfo/ParserNonXS.pm (%parser_state_initialization)
+       (%parser_inner_configuration, %parser_state_configuration)
+       (parser, _initialize_parsing, parse_texi_piece, parse_texi_text)
+       (get_parser_info, _input_push_text, parse_texi_file, parse_texi_line)
+       (_setup_conf):
+       put input, some elements of global_info, definfoenclose,
+       source_mark_counters, nesting_context, context_stack and
+       context_command_stack in %parser_state_initialization.  Classify the
+       parsing state keys.  Distinguish parser configuration initialization
+       and parser parsing state initialization.  Add _initialize_parsing for
+       parser parsing initialization.  Call it in parse_texi_* instead of in
+       parser().  Split %parser_inner_configuration out of
+       %parser_state_configuration.  Initialize $parser->{'conf'} in
+       _setup_conf.
+
 2024-05-15  Patrice Dumas  <pertusus@free.fr>
 
        * tp/Texinfo/ParserNonXS.pm (parser), tp/Texinfo/Translations.pm
diff --git a/tp/TODO b/tp/TODO
index 11ba6097b9..1e9e830e70 100644
--- a/tp/TODO
+++ b/tp/TODO
@@ -112,6 +112,8 @@ $current->{'extra'}->{'following_punctuation'} or similar.
 Using callgrind to find the time used by functions
 
 valgrind --tool=callgrind perl -w texi2any.pl ../doc/texinfo.texi --html
+# to avoid cycles (some remain in Perl only code) that mess up the graph:
+valgrind --tool=callgrind --separate-callers=3 --separate-recs=10 perl -w texi2any.pl ../doc/texinfo.texi --html
 kcachegrind callgrind.out.XXXXXX
 
 C code could be checked to see if using an hash map implementation,
@@ -121,9 +123,9 @@ Could be interesting for find_string:
  unique_target -> find_string
  (and, though much less used output_files_open_out -> find_string)
 
-Another possibility for optimization would be to call Perl less, as it
-still uses about 40% of the time (for html) even though it should mainly be
-for code called once (and not for Perl functions called from C).
+For the Texinfo manual with full XS, Perl uses 22% of the time (for html),
+now only for code hopefully called once.  Calling Perl getSortKey uses about
+28% (more on sorting and C below).
 
 
 hyphenation: should only appear in toplevel.
diff --git a/tp/Texinfo/ParserNonXS.pm b/tp/Texinfo/ParserNonXS.pm
index 66c1f381bf..a16b66a579 100644
--- a/tp/Texinfo/ParserNonXS.pm
+++ b/tp/Texinfo/ParserNonXS.pm
@@ -111,21 +111,16 @@ sub import {
 our $VERSION = '7.1dev';
 
 
-# these are the default values for the parser state
+# these are the default values for the parsing state
 # some could become configurable if moved to the next hash, but they
 # are not configurable/implemented in the XS parser, so they are best
 # left internal.  In general they are dynamically modified during parsing.
 my %parser_state_initialization = (
-  'aliases' => {},            # key is a command name value is the alias
+   # parsed document information registered in output document
   'commands_info' => {},      # keys are @-commands names (without @) and
                               # values are arrays for global multiple
                               # @-commands and a value for non multiple
                               # global @-commands.
-  'conditional_stack' => [],  # a stack of conditional commands that are
-                              # expanded.
-  'clickstyle' => 'arrow',       #
-  'kbdinputstyle' => 'distinct', #
-  'raw_block_stack' => [],    # a stack of raw block commands that are nested.
   'floats' => {},             # key is the normalized float type, value is
                               # an array reference holding all the floats
                               # of that type.
@@ -136,34 +131,90 @@ my %parser_state_initialization = (
                                # commands like @ref without books or external
                                # manual files, and menu entries without
                                # external manual.
+  'labels_list' => [],        # array of elements associated with labels.
+                              # information on document
+  'global_info' => {'input_perl_encoding' => 'utf-8',
+                    'input_encoding_name' => 'utf-8',
+                    'included_files' => [],},
+
+  # parsed document parsing information still relevant after parsing
+  'aliases' => {},            # key is a command name value is the alias
   'macros' => {},             # the key is the user-defined macro name.  The
                               # value is the reference on a macro element
                               # as obtained by parsing the @macro
+  'definfoenclose' => {},     # key is the command name, value is an array
+                              # reference with 2 values, beginning and ending.
+
+  # parsing information still relevant at the end of the parsing
+  'clickstyle' => 'arrow',       #
+  'kbdinputstyle' => 'distinct', #
+  'source_mark_counters' => {},  #
+  'current_node'    => undef,    # last seen node.
+  'current_section' => undef,    # last seen section.
+
+  # parsing information only relevant during parsing
+  'input' => [],       # a stack, with last at bottom.  Holds the opened files
+                       # or text.  Pending macro expansion or text expansion
+                       # is also in that structure.
+  'conditional_stack' => [],  # a stack of conditional commands that are
+                              # expanded.
+  'raw_block_stack' => [],    # a stack of raw block commands that are nested.
   'macro_expansion_nr' => 0,  # number of macros being expanded
   'value_expansion_nr' => 0,  # number of values being expanded
   'sections_level_modifier' => 0, # modified by raise/lowersections
-  'labels_list' => [],            # array of elements associated with labels
+  'nesting_context'    => {
+                         # key is the context name, value is the
+                         # depth of the context.
+                           'basic_inline_stack' => [],
+                           'basic_inline_stack_on_line' => [],
+                           'basic_inline_stack_block' => [],
+                           'regions_stack' => [],
+                           'footnote' => 0,
+                           'caption' => 0,
+                          },
+  'context_stack'      => ['_root'],
+                         # stack of the contexts, more recent on top.
+                         # 'ct_line' is added when on a line or
+                         # block @-command line,
+                         # 'ct_def' is added instead if on a definition line.
+                         # 'ct_preformatted' is added in block commands
+                         # where there is no paragraphs and spaces are kept
+                         # (format, example, display and menu commands...)
+                         # 'ct_math' is added in math block commands
+                         # (displaymath) and @math brace commands
+                         # 'ct_rawpreformatted' is added in raw block commands
+                         # (html, xml, docbook...)
+                         # 'ct_inlineraw' is added when in inlineraw
+                         # 'ct_brace_command' is added when in footnote,
+                         # caption, or shortcaption (context brace_commands
+                         # that does not already start another context, ie not
+                         # math).
+  'context_command_stack' => [''],
+                         # the stack of @-commands. An @-command name can
+                         # be added each time a context is pushed on
+                         # 'context_stack'.  Could be undef if there
+                         # is no @-command associated with the context.
   'input_file_encoding' => 'utf-8', # perl encoding name used for the input
                                     # file
   'input_encoding_name' => 'utf-8', # current input encoding name, based on
                                     # mime type encoding names
-  # initialization of information returned by global_information()
-  'global_info' => {},
-  # for get_conf, set for all the configuration keys that are also in
-  # %Texinfo::Common::default_parser_customization_values to the
-  # values set at parser initialization
-  'conf' => {},
 );
 
-# configurable parser state
-my %parser_state_configuration = (
+# Set when initializing a parser, but never from command-line/init files
+my %parser_inner_configuration = (
   'accept_internalvalue' => 0, # whether @txiinternalvalue should be added
                                # to the tree or considered invalid.
                                # currently set if called by gdt.
   'restricted' => 0,           # cannot define new commands or make index
                                # entries.  currently set when called from gdt.
+);
+
+# configurable parser state
+my %parser_state_configuration = (
   'registrar' => undef,        # Texinfo::Report object used for error
                                # reporting.
+
+  # parsed document parsing information still relevant after parsing
   'values' => {'txicommandconditionals' => 1},
                               # the key is the name, the value the @set name
                               # argument.
@@ -176,67 +227,45 @@ my %parser_state_configuration = (
 # customization options informations is gathered here, and also
 # because it is used in other codes, in particular the XS parser.
 my %parser_settable_configuration = (
+  %parser_inner_configuration,
   %parser_state_configuration,
   %Texinfo::Common::default_parser_customization_values,
 );
 
-my %parser_default_configuration = (
-  %parser_state_initialization,
-  %parser_settable_configuration
-);
-
-# the other possible keys for the parser state are:
-#
-# expanded_formats_hash   each key comes from EXPANDED_FORMATS, value is 1
+# The other possible keys for the parser state are initialized based
+# on customization variables.
+# parsed document information registered in output document
 # index_names             a structure holding the link between index
 #                         names and merged indices;
 #                         initial value is %index_names in Texinfo::Commands.
-# context_stack           stack of the contexts, more recent on top.
-#                         'ct_line' is added when on a line or
-#                         block @-command line,
-#                         'ct_def' is added instead if on a definition line.
-#                         'ct_preformatted' is added in block commands
-#                         where there is no paragraphs and spaces are kept
-#                         (format, example, display and menu commands...)
-#                         'ct_math' is added in math block commands
-#                         (displaymath) and @math brace commands
-#                         'ct_rawpreformatted' is added in raw block commands
-#                         (html, xml, docbook...)
-#                         'ct_inlineraw' is added when in inlineraw
-#                         'ct_brace_command' is added when in footnote,
-#                         caption, or shortcaption (context brace_commands
-#                         that does not already start another context, ie not
-#                         math).
-# context_command_stack   the stack of @-commands. An @-command name can
-#                         be added each time a context is pushed on
-#                         'context_stack'.  Could be undef if there
-#                         is no @-command associated with the context.
-# definfoenclose          an hash, key is the command name, value is an array
-#                         reference with 2 values, beginning and ending.
-# nesting_context         an hash, key is the context name, value is the
-#                         depth of the context.
-# input                   a stack, with last at bottom.  Holds the opened files
-#                         or text.  Pending macro expansion or text expansion
-#                         is also in that structure.
-# line_commands           the same as %line_commands in Texinfo::Common,
-#                         but with index entry commands dynamically added
+
+# parsing information still relevant at the end of the parsing
+# line_commands           the same as %line_commands, but with index entry
+#                         commands dynamically added.
+# brace_commands          the same as %brace_commands, but with definfoenclose
+#                         commands dynamically added.
+# valid_nestings          direct command valid nesting information, with
+#                         index entry commands dynamically added.
 # close_paragraph_commands      same as %close_paragraph_commands, with
 #                               commands dynamically added (no command added
 #                               in 2024).
-# close_preformatted_commands   same as %close_preformatted_commands
-# no_paragraph_commands   the same as %no_paragraph_commands below,
+# close_preformatted_commands   same as %close_preformatted_commands, with
+#                               commands dynamically added (no command added
+#                               in 2024).
+# no_paragraph_commands   the same as %no_paragraph_commands,
 #                         with new index entry commands dynamically added.
 # basic_inline_commands   the same as %contain_basic_inline_commands below, but
 #                         with new index entry commands dynamically added
-# current_node            last seen node.
-# current_section         last seen section.
 # command_index           associate a command name with an index name.
 # index_entry_commands    index entry commands, including added index commands.
-# internal_references     an array holding all the internal references.
 
+
+# parser keys related to customization
+# expanded_formats_hash   each key comes from EXPANDED_FORMATS, value is 1
 # set                     points to the value set when initializing, for
 #                         configuration items that are not to be overriden
 #                         by @-commands.  For example documentlanguage.
+# conf                    For get_conf
 
 
 # A source information is an hash reference with the keys:
@@ -553,41 +582,58 @@ foreach my $no_paragraph_context ('math', 'preformatted', 'rawpreformatted',
 
 
 
-my %nesting_context_init = (
-                         'footnote' => 0,
-                         'caption' => 0,
-);
-
 # Interface and internal functions for input management
 
 # initialization entry point.  Set up a parser.
 # The last argument, optional, is a hash provided by the user to change
-# the default values for what is present in %parser_default_configuration.
+# the default values for what is present in %parser_settable_configuration.
 sub parser(;$$)
 {
   my $conf = shift;
 
-  my $parser = dclone(\%parser_default_configuration);
+  my $parser = dclone(\%parser_settable_configuration);
   bless $parser;
 
   _setup_conf($parser, $conf);
+
   # This is not very useful in perl, but mimics the XS parser
   print STDERR "!!!!!!!!!!!!!!!! RESETTING THE PARSER !!!!!!!!!!!!!!!!!!!!!\n"
     if ($parser->{'DEBUG'});
 
-  if (!$self->{'restricted'}) {
+  # turn the array to a hash for speed.  Not sure it really matters for such
+  # a small array.
+  $parser->{'expanded_formats_hash'} = {};
+  foreach my $expanded_format(@{$parser->{'EXPANDED_FORMATS'}}) {
+    $parser->{'expanded_formats_hash'}->{$expanded_format} = 1;
+  }
+
+  if (not defined($parser->{'registrar'})) {
+    $parser->{'registrar'} = Texinfo::Report::new();
+  }
+
+  return $parser;
+}
+
+sub _initialize_parsing()
+{
+  my $parser = shift;
+
+  my $parser_state = dclone(\%parser_state_initialization);
+
+  if (!$parser->{'restricted'}) {
     # Initialize command hash that are dynamically modified, notably
     # those for index commands, and definoenclose, based on defaults
-    $parser->{'line_commands'} = dclone(\%line_commands);
-    $parser->{'brace_commands'} = dclone(\%brace_commands);
-    $parser->{'valid_nestings'} = dclone(\%default_valid_nestings);
-    $parser->{'no_paragraph_commands'} = {%no_paragraph_commands};
-    $parser->{'index_names'} = dclone(\%index_names);
-    $parser->{'command_index'} = {%command_index};
-    $parser->{'index_entry_commands'} = {%index_entry_command_commands};
-    $parser->{'close_paragraph_commands'} = {%close_paragraph_commands};
-    $parser->{'close_preformatted_commands'} = {%close_preformatted_commands};
-    $parser->{'basic_inline_commands'} = {%contain_basic_inline_commands};
+    $parser_state->{'line_commands'} = dclone(\%line_commands);
+    $parser_state->{'brace_commands'} = dclone(\%brace_commands);
+    $parser_state->{'valid_nestings'} = dclone(\%default_valid_nestings);
+    $parser_state->{'no_paragraph_commands'} = {%no_paragraph_commands};
+    $parser_state->{'index_names'} = dclone(\%index_names);
+    $parser_state->{'command_index'} = {%command_index};
+    $parser_state->{'index_entry_commands'} = {%index_entry_command_commands};
+    $parser_state->{'close_paragraph_commands'} = {%close_paragraph_commands};
+    $parser_state->{'close_preformatted_commands'}
+                                        = {%close_preformatted_commands};
+    $parser_state->{'basic_inline_commands'} = {%contain_basic_inline_commands};
   } else {
     # in a restricted parser, new commands are not defined (no user-defined
     # macros, alias, no new index commands), and index entries are not set.
@@ -595,43 +641,22 @@ sub parser(;$$)
     # indices information is not needed at all.  It is used in gdt() and this
     # has a sizable effect on performance.
 
-    $parser->{'line_commands'} = \%line_commands;
-    $parser->{'brace_commands'} = \%brace_commands;
-    $parser->{'valid_nestings'} = \%default_valid_nestings;
-    $parser->{'no_paragraph_commands'} = \%no_paragraph_commands;
+    $parser_state->{'line_commands'} = \%line_commands;
+    $parser_state->{'brace_commands'} = \%brace_commands;
+    $parser_state->{'valid_nestings'} = \%default_valid_nestings;
+    $parser_state->{'no_paragraph_commands'} = \%no_paragraph_commands;
     # not needed, but not undef because it is exported to document
-    $parser->{'index_names'} = {};
+    $parser_state->{'index_names'} = {};
     # not needed
-    #$parser->{'command_index'} = {};
-    $parser->{'index_entry_commands'} = \%index_entry_command_commands;
-    $parser->{'close_paragraph_commands'} = \%close_paragraph_commands;
-    $parser->{'close_preformatted_commands'} = \%close_preformatted_commands;
-    $parser->{'basic_inline_commands'} = \%contain_basic_inline_commands;
-  }
-
-  # other initializations
-  $parser->{'definfoenclose'} = {};
-  $parser->{'source_mark_counters'} = {};
-  $parser->{'nesting_context'} = {%nesting_context_init};
-  $parser->{'nesting_context'}->{'basic_inline_stack'} = [];
-  $parser->{'nesting_context'}->{'basic_inline_stack_on_line'} = [];
-  $parser->{'nesting_context'}->{'basic_inline_stack_block'} = [];
-  $parser->{'nesting_context'}->{'regions_stack'} = [];
-
-  $parser->_init_context_stack();
-
-  # turn the array to a hash for speed.  Not sure it really matters for such
-  # a small array.
-  $parser->{'expanded_formats_hash'} = {};
-  foreach my $expanded_format(@{$parser->{'EXPANDED_FORMATS'}}) {
-    $parser->{'expanded_formats_hash'}->{$expanded_format} = 1;
-  }
-
-  if (not defined($parser->{'registrar'})) {
-    $parser->{'registrar'} = Texinfo::Report::new();
+    #$parser_state->{'command_index'} = {};
+    $parser_state->{'index_entry_commands'} = \%index_entry_command_commands;
+    $parser_state->{'close_paragraph_commands'} = \%close_paragraph_commands;
+    $parser_state->{'close_preformatted_commands'}
+         = \%close_preformatted_commands;
+    $parser_state->{'basic_inline_commands'} = \%contain_basic_inline_commands;
   }
 
-  return $parser;
+  return $parser_state;
 }
 
 sub get_conf($$)
@@ -671,9 +696,6 @@ sub _input_push_text($$$;$$)
 {
   my ($self, $text, $line_nr, $macro_name, $value_name) = @_;
 
-  if (not $self->{'input'}) {
-    $self->{'input'} = [];
-  }
   my $input_source_info = {'line_nr' => $line_nr};
   if (scalar(@{$self->{'input'}})) {
     if (exists($self->{'input'}->[0]->{'input_source_info'}->{'file_name'})) {
@@ -726,6 +748,9 @@ sub parse_texi_piece($$;$)
 
   $line_nr = 1 if (not defined($line_nr));
 
+  my $parser_state = $self->_initialize_parsing();
+  %$self = (%$self, %$parser_state);
+
   _input_push_text($self, $text, $line_nr);
 
   my ($document_root, $before_node_section)
@@ -745,6 +770,9 @@ sub parse_texi_line($$;$)
 
   $line_nr = 1 if (not defined($line_nr));
 
+  my $parser_state = $self->_initialize_parsing();
+  %$self = (%$self, %$parser_state);
+
   _input_push_text($self, $text, $line_nr);
 
   my $root = {'type' => 'root_line'};
@@ -761,6 +789,9 @@ sub parse_texi_text($$;$)
 
   $line_nr = 1 if (not defined($line_nr));
 
+  my $parser_state = $self->_initialize_parsing();
+  %$self = (%$self, %$parser_state);
+
   _input_push_text($self, $text, $line_nr);
 
   my $document = $self->_parse_texi_document();
@@ -812,7 +843,6 @@ sub _input_push_file
 
   $file_input->{'file_name_encoding'} = $file_name_encoding
        if (defined($file_name_encoding));
-  $self->{'input'} = [] if (!defined($self->{'input'}));
   unshift @{$self->{'input'}}, $file_input;
 
   return 1, $file_name, $directories, undef;
@@ -827,14 +857,10 @@ sub get_parser_info($)
                                          $self->{'registrar'}, $self);
   if (defined($perl_encoding)) {
     $self->{'global_info'}->{'input_perl_encoding'} = $perl_encoding
-  } else {
-    $self->{'global_info'}->{'input_perl_encoding'} = 'utf-8';
   }
   if (defined($self->{'input_encoding_name'})) {
     $self->{'global_info'}->{'input_encoding_name'}
                                = $self->{'input_encoding_name'};
-  } else {
-    $self->{'global_info'}->{'input_encoding_name'} = 'utf-8';
   }
 
   my $global_commands = $self->{'commands_info'};
@@ -869,6 +895,9 @@ sub parse_texi_file($$)
 
   return undef if (!defined($self));
 
+  my $parser_state = $self->_initialize_parsing();
+  %$self = (%$self, %$parser_state);
+
   my ($status, $file_name, $directories, $error_message)
     = _input_push_file($self, $input_file_path);
   if (!$status) {
@@ -953,6 +982,10 @@ sub _setup_conf($$)
 {
   my ($parser, $conf) = @_;
 
+  # for get_conf, set for all the configuration keys that are also in
+  # %Texinfo::Common::default_parser_customization_values to the
+  # values set at parser initialization
+  $parser->{'conf'} = {};
   $parser->{'set'} = {};
   if (defined($conf)) {
     foreach my $key (keys(%$conf)) {
@@ -989,13 +1022,6 @@ sub _setup_conf($$)
 #                             pending text or line.
 
 # context stack functions
-sub _init_context_stack($)
-{
-  my $self = shift;
-  $self->{'context_stack'} = ['_root'];
-  $self->{'context_command_stack'} = [''];
-}
-
 sub _push_context($$$)
 {
   my ($self, $context, $command) = @_;



reply via email to

[Prev in Thread] Current Thread [Next in Thread]