[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.
From: |
Ambrose C. LI |
Subject: |
[Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.pm,1.21,1.22 tmpl_process3.pl,1.4,1.5 xgettext.pl,1.3,1.4 |
Date: |
Sun, 22 Feb 2004 13:34:44 -0800 |
Update of /cvsroot/koha/koha/misc/translator
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv471
Modified Files:
TmplToken.pm TmplTokenizer.pm tmpl_process3.pl xgettext.pl
Log Message:
Preliminary support for "analysis" of strings with <a> tags.
Early termination of analysis if we encounter some strings, such as </h1>
or | or ||, in order to avoid extracting strings that are unnecessarily
long and which doesn't add any meaningful context.
Index: TmplToken.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplToken.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** TmplToken.pm 19 Feb 2004 21:24:30 -0000 1.3
--- TmplToken.pm 22 Feb 2004 21:34:40 -0000 1.4
***************
*** 89,92 ****
--- 89,98 ----
# only meaningful for TEXT_PARAMETRIZED tokens
+ sub anchors {
+ my $this = shift;
+ return map { $_->type == TmplTokenType::TAG && $_->string =~ /^<a\b/is?
$_: ()} @{$this->{'_kids'}};
+ }
+
+ # only meaningful for TEXT_PARAMETRIZED tokens
sub form {
my $this = shift;
Index: TmplTokenizer.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenizer.pm,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -r1.21 -r1.22
*** TmplTokenizer.pm 22 Feb 2004 09:04:53 -0000 1.21
--- TmplTokenizer.pm 22 Feb 2004 21:34:40 -0000 1.22
***************
*** 421,431 ****
}
! sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
my($t) = @_;
! return $t->type == TmplTokenType::TEXT
|| ($t->type == TmplTokenType::DIRECTIVE
&& $t->string =~ /^(?:$re_tmpl_var)$/os)
|| ($t->type == TmplTokenType::TAG
! && ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
|| ($t->string =~ /^<input/i
&& $t->attributes->{'type'} =~ /^(?:text)$/i)))
--- 421,442 ----
}
! sub _token_groupable1_p ($) { # as first token, groupable into
TEXT_PARAMETRIZED
my($t) = @_;
! return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
|| ($t->type == TmplTokenType::DIRECTIVE
&& $t->string =~ /^(?:$re_tmpl_var)$/os)
|| ($t->type == TmplTokenType::TAG
! && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
! || ($t->string =~ /^<input/i
! && $t->attributes->{'type'} =~ /^(?:text)$/i)))
! }
!
! sub _token_groupable2_p ($) { # as other token, groupable into
TEXT_PARAMETRIZED
! my($t) = @_;
! return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s ||
$t->string !~ /^[\|\s]+$/s))
! || ($t->type == TmplTokenType::DIRECTIVE
! && $t->string =~ /^(?:$re_tmpl_var)$/os)
! || ($t->type == TmplTokenType::TAG
! && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
|| ($t->string =~ /^<input/i
&& $t->attributes->{'type'} =~ /^(?:text)$/i)))
***************
*** 440,444 ****
sub _formalize ($) {
my($t) = @_;
! return $t->type == TmplTokenType::DIRECTIVE? '%s':
_quote_cformat($t->string);
}
--- 451,458 ----
sub _formalize ($) {
my($t) = @_;
! return $t->type == TmplTokenType::DIRECTIVE? '%s':
! $t->type == TmplTokenType::TAG?
! ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
! _quote_cformat($t->string);
}
***************
*** 453,456 ****
--- 467,471 ----
}
};
+ &$undo_trailing_blanks;
# FIXME: If the last token is a close tag but there are no tags
# FIXME: before it, drop the close tag back into the queue. This
***************
*** 515,558 ****
if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
&& ($it->type == TmplTokenType::TEXT?
! !blank_p( $it->string ): _token_groupable_p( $it ))) {
my @structure = ( $it );
! my($n_trailing_spaces, $next) = (0, undef);
! my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
if ($it->type == TmplTokenType::TEXT) {
$nonblank_text_p = 1 if !blank_p( $it->string );
} elsif ($it->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
}
! for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
$next = $this->_next_token_intermediate($h);
push @structure, $next; # for consistency (with initialization)
! last unless defined $next && _token_groupable_p( $next );
if ($next->type == TmplTokenType::TEXT) {
! if (blank_p( $next->string )) {
! $n_trailing_spaces += 1;
! } else {
! ($n_trailing_spaces, $nonblank_text_p) = (0, 1);
! }
} elsif ($next->type == TmplTokenType::DIRECTIVE) {
- $n_trailing_spaces = 0;
$parametrized_p = 1;
! } else {
! $n_trailing_spaces = 0;
}
}
# Undo the last token
push @{$this->{_queue}}, pop @structure;
! # Undo trailing blank tokens
! for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
! push @{$this->{_queue}}, pop @structure;
! }
@structure = $this->_optimize( @structure );
if (@structure < 2) {
# Nothing to do
;
! } elsif ($nonblank_text_p && $parametrized_p) {
# Create the corresponding c-format string
my $string = join('', map { $_->string } @structure);
my $form = join('', map { _formalize $_ } @structure);
$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
$it->line_number, $it->pathname);
$it->set_form( $form );
--- 530,583 ----
if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
&& ($it->type == TmplTokenType::TEXT?
! !blank_p( $it->string ): _token_groupable1_p( $it ))) {
my @structure = ( $it );
! my @tags = ();
! my $next = undef;
! my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
if ($it->type == TmplTokenType::TEXT) {
$nonblank_text_p = 1 if !blank_p( $it->string );
} elsif ($it->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
+ } elsif ($it->type == TmplTokenType::TAG && $it->string =~
/^<([A-Z0-9]+)/is) {
+ push @tags, lc($1);
+ $with_anchor_p = 1 if lc($1) eq 'a';
}
! # We hate | and || in msgid strings, so we try to avoid them
! for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type ==
TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
$next = $this->_next_token_intermediate($h);
push @structure, $next; # for consistency (with initialization)
! last unless defined $next && _token_groupable2_p( $next );
! last if $quit_next_p;
if ($next->type == TmplTokenType::TEXT) {
! $nonblank_text_p = 1 if !blank_p( $next->string );
! $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
} elsif ($next->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
! } elsif ($next->type == TmplTokenType::TAG) {
! if ($next->string =~ /^<([A-Z0-9]+)/is) {
! push @tags, lc($1);
! $with_anchor_p = 1 if lc($1) eq 'a';
! } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
! my $close = lc($1);
! $quit_p = 1 unless @tags && $close eq $tags[$#tags];
! $quit_next_p = 1 if $close =~ /^h\d$/;
! pop @tags;
! }
}
+ last if $quit_p;
}
# Undo the last token
push @{$this->{_queue}}, pop @structure;
! # Simply it a bit more
@structure = $this->_optimize( @structure );
if (@structure < 2) {
# Nothing to do
;
! } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
# Create the corresponding c-format string
my $string = join('', map { $_->string } @structure);
my $form = join('', map { _formalize $_ } @structure);
+ my $a_counter = 0;
+ $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
$it->line_number, $it->pathname);
$it->set_form( $form );
***************
*** 605,612 ****
# Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($@) {
! my($fmt, @params) = @_;
my $it = '';
! for (my $n = 0; length $fmt;) {
if ($fmt =~ /^[^%]+/) {
$fmt = $';
--- 630,637 ----
# Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($$$) {
! my($fmt_0, $params, $anchors) = @_;
my $it = '';
! for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
if ($fmt =~ /^[^%]+/) {
$fmt = $';
***************
*** 620,624 ****
$fmt = $';
if (!defined $width && !defined $prec) {
! $it .= $params[$i]
} elsif (defined $width && defined $prec && !$width && !$prec) {
;
--- 645,649 ----
$fmt = $';
if (!defined $width && !defined $prec) {
! $it .= $params->[$i]
} elsif (defined $width && defined $prec && !$width && !$prec) {
;
***************
*** 634,637 ****
--- 659,678 ----
}
}
+ for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+ if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+ $fmt = $';
+ $it .= $&;
+ } elsif ($fmt =~ /^<a(\d+)>/is) {
+ $n += 1;
+ my $i = $1;
+ $fmt = $';
+ my $anchor = $anchors->[$i - 1];
+ warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef
#FIXME
+ unless defined $anchor;
+ $it .= $anchor->string;
+ } else {
+ die "Completely confused decoding anchors: $fmt\n";#XXX
+ }
+ }
return $it;
}
Index: tmpl_process3.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/tmpl_process3.pl,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** tmpl_process3.pl 22 Feb 2004 08:18:27 -0000 1.4
--- tmpl_process3.pl 22 Feb 2004 21:34:40 -0000 1.5
***************
*** 89,96 ****
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
my $fmt = find_translation($s->form);
! print $output TmplTokenizer::parametrize($fmt, map {
my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
$kind == TmplTokenType::TAG && %$attr?
! text_replace_tag($t, $attr): $t } $s->parameters);
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
print $output text_replace_tag($t, $attr);
--- 89,96 ----
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
my $fmt = find_translation($s->form);
! print $output TmplTokenizer::parametrize($fmt, [ map {
my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
$kind == TmplTokenType::TAG && %$attr?
! text_replace_tag($t, $attr): $t } $s->parameters ], [
$s->anchors ]);
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
print $output text_replace_tag($t, $attr);
***************
*** 298,302 ****
This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files.
Currently, the create, update, and install actions have all been
--- 298,303 ----
This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files. Note that the behaviour
! of this script should still be considered unstable.
Currently, the create, update, and install actions have all been
***************
*** 316,321 ****
The --help option has not been implemented yet.
! There are probably some real bugs too, since this has not been
! tested very much.
xgettext.pl must be present in the current directory; the
--- 317,328 ----
The --help option has not been implemented yet.
! If an extracted string contain actual text (versus tags or
! TMPL_VAR directives), the strings are extracted verbatim,
! resulting in unwieldy things like multiple spaces, tabs,
! and/or newlines which are semantically indistinguishable
! from single blanks. If the template writer changes the
! spacing just a little bit, the new formatting would be
! considered new strings. This is arguably wrong, and in any
! case counter-productive.
xgettext.pl must be present in the current directory; the
***************
*** 332,335 ****
--- 339,345 ----
(e.g., to get rid of the "Strange line" warning for #~).
+ There are probably some other bugs too, since this has not been
+ tested very much.
+
=head1 SEE ALSO
Index: xgettext.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/xgettext.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** xgettext.pl 22 Feb 2004 06:46:17 -0000 1.3
--- xgettext.pl 22 Feb 2004 21:34:40 -0000 1.4
***************
*** 22,37 ****
###############################################################################
! sub negligible_p ($) {
my($t) = @_; # a string
# Don't emit pure whitespace, pure numbers, pure punctuation,
# single letters, or TMPL_VAR's.
# Punctuation should arguably be translated. But without context
! # they are untranslatable.
return !$extract_all_p && (
! TmplTokenizer::blank_p($t) # blank or TMPL_VAR
|| $t =~ /^\d+$/ # purely digits
! || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
|| $t =~ /^[A-Za-z]$/ # single letters
! );
}
--- 22,51 ----
###############################################################################
! sub string_negligible_p ($) {
my($t) = @_; # a string
# Don't emit pure whitespace, pure numbers, pure punctuation,
# single letters, or TMPL_VAR's.
# Punctuation should arguably be translated. But without context
! # they are untranslatable. Note that $t is a string, not a token object.
return !$extract_all_p && (
! TmplTokenizer::blank_p($t) # blank or TMPL_VAR
|| $t =~ /^\d+$/ # purely digits
! || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
|| $t =~ /^[A-Za-z]$/ # single letters
! )
! }
!
! sub token_negligible_p( $ ) {
! my($x) = @_;
! my $t = $x->type;
! return !$extract_all_p && (
! $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
! $t == TmplTokenType::DIRECTIVE? 1:
! $t == TmplTokenType::TEXT_PARAMETRIZED
! && join( '', map { my $t = $_->type;
! $t == TmplTokenType::DIRECTIVE?
! '1': $t == TmplTokenType::TAG?
! '': token_negligible_p( $_ )?
! '': '1' } @{$x->children} ) eq '' );
}
***************
*** 40,45 ****
sub remember ($$) {
my($token, $string) = @_;
! $text{$string} = [] unless defined $text{$string};
! push @{$text{$string}}, $token;
}
--- 54,62 ----
sub remember ($$) {
my($token, $string) = @_;
! # If we determine that the string is negligible, don't bother to remember
! unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
! $text{$string} = [] unless defined $text{$string};
! push @{$text{$string}}, $token;
! }
}
***************
*** 70,77 ****
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
if ($kind eq TmplTokenType::TEXT) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $t ) if $t =~ /\S/s;
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $s->form ) if $s->form =~ /\S/s;
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
--- 87,92 ----
***************
*** 97,101 ****
# Emit all extracted strings.
for my $t (string_list) {
! printf OUTPUT "%s\n", $t unless negligible_p($t);
}
}
--- 112,116 ----
# Emit all extracted strings.
for my $t (string_list) {
! printf OUTPUT "%s\n", $t # unless negligible_p($t);
}
}
***************
*** 128,132 ****
my $directory_re = quotemeta("$directory/");
for my $t (string_list) {
! next if negligible_p($t);
my $cformat_p;
for my $token (@{$text{$t}}) {
--- 143,147 ----
my $directory_re = quotemeta("$directory/");
for my $t (string_list) {
! #next if negligible_p($t);
my $cformat_p;
for my $token (@{$text{$t}}) {
***************
*** 317,331 ****
=back
! Right now it does about the same thing as text-extract2.pl but
! generates gettext-style output; however, because it is scanner-
! instead of parser-based, it is able to address the 4 weaknesses
! listed in translator_doc.txt. Ultimately, the goal is to make
! this able to do some kind of simple analysis on the input to
! produce gettext-style output with c-format strings, in order to
! facilitate translation to languages with a different word order
! than English.
! When the above is finished, the generated po file may contain
! some HTML tags in addition to %s strings.
If you want to generate GNOME-style POTFILES.in files, such
--- 332,340 ----
=back
! Note that this script is experimental and should still be
! considered unstable.
! Please refer to the explanation in tmpl_process3 for further
! details.
If you want to generate GNOME-style POTFILES.in files, such
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.pm,1.21,1.22 tmpl_process3.pl,1.4,1.5 xgettext.pl,1.3,1.4,
Ambrose C. LI <=