[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[6248] Paragraph.pm add_text use split
From: |
Gavin D. Smith |
Subject: |
[6248] Paragraph.pm add_text use split |
Date: |
Mon, 04 May 2015 22:52:17 +0000 |
Revision: 6248
http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=6248
Author: gavin
Date: 2015-05-04 22:52:16 +0000 (Mon, 04 May 2015)
Log Message:
-----------
Paragraph.pm add_text use split
Modified Paths:
--------------
trunk/ChangeLog
trunk/tp/Texinfo/Convert/Paragraph.pm
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2015-05-04 22:06:03 UTC (rev 6247)
+++ trunk/ChangeLog 2015-05-04 22:52:16 UTC (rev 6248)
@@ -3,6 +3,15 @@
* tp/Texinfo/Convert/Unicode.pm (string_width): Calculate string
width using "unpack" function.
+ * tp/Texinfo/Convert/Paragraph.pm
+
+ (_add_next): Add new argument "$newlines_impossible" to eliminate
+ a regex match. Don't access arguments we don't need.
+ (add_text): Use "split" function to split up text.
+ Try to eliminate regex matches for a newline character.
+ Save some flags in local variables. Reorder a condition. Add
+ "o" flag to some regexes that used variables.
+
2015-05-03 Karl Berry <address@hidden>
* pretest 5.9.91.
Modified: trunk/tp/Texinfo/Convert/Paragraph.pm
===================================================================
--- trunk/tp/Texinfo/Convert/Paragraph.pm 2015-05-04 22:06:03 UTC (rev
6247)
+++ trunk/tp/Texinfo/Convert/Paragraph.pm 2015-05-04 22:52:16 UTC (rev
6248)
@@ -181,19 +181,20 @@
}
# add a word and/or spaces and end of sentence.
-sub _add_next($;$$$$$)
+sub _add_next($;$$$$$$)
{
- my $paragraph = shift;
- my $word = shift;
- my $underlying_word = shift;
- my $space = shift;
- my $end_sentence = shift;
- my $transparent = shift;
+ my $paragraph = $_[0];
+ my $word = $_[1];
+ my $space = $_[3];
+ my $end_sentence = $_[4];
+ my $transparent = $_[5];
my $result = '';
- $underlying_word = $word if (!defined($underlying_word));
+ if (defined($word)) {
+ my $underlying_word = $_[2];
+ my $newlines_impossible = $_[6];
+ $underlying_word = $word if (!defined($underlying_word));
- if (defined($word)) {
if (!defined($paragraph->{'word'})) {
$paragraph->{'word'} = '';
$paragraph->{'underlying_word'} = '';
@@ -212,7 +213,7 @@
$paragraph->{'word'} .= $word;
$paragraph->{'underlying_word'} .= $underlying_word unless($transparent);
- if ($word =~ /\n/) {
+ if (!$newlines_impossible and $word =~ /\n/) {
$result .= $paragraph->{'space'};
$paragraph->{'space'} = '';
$result .= $paragraph->{'word'};
@@ -321,21 +322,36 @@
$paragraph->{'end_line_count'} = 0;
my $result = '';
- while ($text ne '') {
- if ($paragraph->{'DEBUG'}) {
+ my $protect_spaces_flag = $paragraph->{'protect_spaces'};
+
+ my @segments = split
+/([^\S\x{202f}\x{00a0}]+)|(\p{InFullwidth})|((?:[^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)/,
+ $text;
+
+ # Check now if a newline exists anywhere in the string to
+ # try to eliminate regex checks later.
+ my $newline_possible_flag = ($text =~ /\n/);
+
+ my $debug_flag = $paragraph->{'DEBUG'};
+ while (@segments) {
# $empty_segment should be an empty string; the other variables
# here were recognized as field separators by split.
+ my ($empty_segment, $spaces, $fullwidth_segment, $added_word)
+ = splice (@segments, 0, 4);
+
+ if ($debug_flag) {
my $word = 'UNDEF';
$word = $paragraph->{'word'} if (defined($paragraph->{'word'}));
print STDERR "p ($paragraph->{'counter'}+$paragraph->{'word_counter'}) s
`"._print_escaped_spaces($paragraph->{'space'})."', w `$word'\n";
#print STDERR "TEXT: "._print_escaped_spaces($text)."|\n"
}
# \x{202f}\x{00a0} are non breaking spaces
- if ($text =~ s/^([^\S\x{202f}\x{00a0}]+)//) {
- my $spaces = $1;
+ if (defined $spaces) {
$underlying_text =~ s/^([^\S\x{202f}\x{00a0}]+)//
if defined($underlying_text);
- print STDERR "SPACES($paragraph->{'counter'})
`"._print_escaped_spaces($spaces)."'\n" if ($paragraph->{'DEBUG'});
+ print STDERR "SPACES($paragraph->{'counter'})
`"._print_escaped_spaces($spaces)."'\n" if $debug_flag;
#my $added_word = $paragraph->{'word'};
- if ($paragraph->{'protect_spaces'}) {
+ if ($protect_spaces_flag) {
$paragraph->{'word'} .= $spaces;
$paragraph->{'underlying_word'} .= $spaces;
$paragraph->{'word_counter'} += length($spaces);
@@ -368,9 +384,9 @@
} else {
$result .= _add_pending_word($paragraph);
if ($paragraph->{'counter'} != 0) {
- if (!$paragraph->{'frenchspacing'}
- and $paragraph->{'end_sentence'}
- and $paragraph->{'end_sentence'} > 0) {
+ if ($paragraph->{'end_sentence'}
+ and $paragraph->{'end_sentence'} > 0
+ and !$paragraph->{'frenchspacing'}) {
if (length($paragraph->{'space'}) >= 1 or length($spaces) > 1) {
# more than one space, we can make sure that there are only
# 2 spaces
@@ -386,9 +402,11 @@
$paragraph->{'space'} = $new_space;
}
} else {
- my $new_space = substr($spaces, 0, 1);
- $new_space =~ s/^[\n\r]/ /;
- $paragraph->{'space'} = $new_space;
+ $paragraph->{'space'} = substr($spaces, 0, 1);
+ if ($paragraph->{'space'} eq "\n"
+ or $paragraph->{'space'} eq "\r") {
+ $paragraph->{'space'} = " ";
+ }
}
}
}
@@ -399,37 +417,11 @@
> $paragraph->{'max'}) {
$result .= _cut_line($paragraph);
}
- if ($spaces =~ /\n/ and $paragraph->{'keep_end_lines'}) {
+ if ($newline_possible_flag
+ and $paragraph->{'keep_end_lines'} and $spaces =~ /\n/) {
$result .= _end_line($paragraph);
}
- } elsif ($text =~ s/^(\p{InFullwidth})//) {
- my $added = $1;
- my $underlying_added;
- if (defined($underlying_text)) {
- $underlying_text =~ s/^(\p{InFullwidth})//;
- $underlying_added = $1;
- } else {
- $underlying_added = $added;
- }
-
- print STDERR "EAST_ASIAN\n" if ($paragraph->{'DEBUG'});
- if (!defined($paragraph->{'word'})) {
- $paragraph->{'word'} = '';
- $paragraph->{'underlying_word'} = '';
- }
- $paragraph->{'word'} .= $added;
- $paragraph->{'underlying_word'} .= $underlying_added;
- $paragraph->{'word_counter'} += 2;
- if ($paragraph->{'counter'} != 0 and
- $paragraph->{'counter'} + $paragraph->{'word_counter'}
- > $paragraph->{'max'}) {
- $result .= _cut_line($paragraph);
- }
- $result .= _add_pending_word($paragraph);
- delete $paragraph->{'end_sentence'};
- $paragraph->{'space'} = '';
- } elsif ($text =~ s/^(([^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)//) {
- my $added_word = $1;
+ } elsif (defined $added_word) {
my $underlying_added_word;
if (defined($underlying_text)) {
$underlying_text =~ s/^(([^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)//;
@@ -438,14 +430,15 @@
$underlying_added_word = $added_word;
}
- $result .= _add_next($paragraph, $added_word, $underlying_added_word);
+ $result .= _add_next($paragraph, $added_word, $underlying_added_word,
+ undef, undef, undef, !$newline_possible_flag);
# now check if it is considered as an end of sentence
if (defined($paragraph->{'end_sentence'})
- and $underlying_added_word =~ /^[$after_punctuation_characters]*$/) {
+ and $underlying_added_word =~ /^[$after_punctuation_characters]*$/o)
{
# do nothing in the case of a continuation of
after_punctuation_characters
- } elsif ($paragraph->{'underlying_word'} =~
/[$end_sentence_character][$after_punctuation_characters]*$/
- and $paragraph->{'underlying_word'} !~
/[[:upper:]][$end_sentence_character$after_punctuation_characters]*$/) {
+ } elsif ($paragraph->{'underlying_word'} =~
/[$end_sentence_character][$after_punctuation_characters]*$/o
+ and $paragraph->{'underlying_word'} !~
/[[:upper:]][$end_sentence_character$after_punctuation_characters]*$/o) {
if ($paragraph->{'frenchspacing'}) {
$paragraph->{'end_sentence'} = -1;
} else {
@@ -457,14 +450,31 @@
print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'}):
text\n"
if (defined($paragraph->{'end_sentence'}) and $paragraph->{'DEBUG'});
}
- } else {
- # Some characters are not handled by the cases above.
- # For example, it happened for strange caracters that seems to be
- # some special spaces. It is a bit strange since the cases above
- # include a possibility and the complement. Maybe a character
- # invalid in a given encoding?
- #die "Unknown caracter leading $text";
- last;
+ } elsif (defined $fullwidth_segment) {
+ my $underlying_added;
+ if (defined($underlying_text)) {
+ $underlying_text =~ s/^(\p{InFullwidth})//;
+ $underlying_added = $1;
+ } else {
+ $underlying_added = $fullwidth_segment;
+ }
+
+ print STDERR "EAST_ASIAN\n" if ($paragraph->{'DEBUG'});
+ if (!defined($paragraph->{'word'})) {
+ $paragraph->{'word'} = '';
+ $paragraph->{'underlying_word'} = '';
+ }
+ $paragraph->{'word'} .= $fullwidth_segment;
+ $paragraph->{'underlying_word'} .= $underlying_added;
+ $paragraph->{'word_counter'} += 2;
+ if ($paragraph->{'counter'} != 0 and
+ $paragraph->{'counter'} + $paragraph->{'word_counter'}
+ > $paragraph->{'max'}) {
+ $result .= _cut_line($paragraph);
+ }
+ $result .= _add_pending_word($paragraph);
+ delete $paragraph->{'end_sentence'};
+ $paragraph->{'space'} = '';
}
}
return $result;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [6248] Paragraph.pm add_text use split,
Gavin D. Smith <=