# XML::Parser # # Copyright (c) 1998-2000 Larry Wall and Clark Cooper # All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. package XML::Parser; use Carp; use IO::File; BEGIN { require XML::Parser::Expat; $VERSION = '2.30'; die "Parser.pm and Expat.pm versions don't match" unless $VERSION eq $XML::Parser::Expat::VERSION; } use strict; use vars qw($VERSION %Built_In_Styles $LWP_load_failed); $LWP_load_failed = 0; sub new { my ($class, %args) = @_; my $style = $args{Style}; my $nonexopt = $args{Non_Expat_Options} ||= {}; $nonexopt->{Style} = 1; $nonexopt->{Non_Expat_Options} = 1; $nonexopt->{Handlers} = 1; $nonexopt->{_HNDL_TYPES} = 1; $nonexopt->{NoLWP} = 1; $args{_HNDL_TYPES} = {%XML::Parser::Expat::Handler_Setters}; $args{_HNDL_TYPES}->{Init} = 1; $args{_HNDL_TYPES}->{Final} = 1; $args{Handlers} ||= {}; my $handlers = $args{Handlers}; if (defined($style)) { my $stylepkg = $style; if ($stylepkg !~ /::/) { $stylepkg = "\u$style"; # I'm using the Built_In_Styles hash to define # valid internal styles, since a style doesn't # need to define any particular Handler to be valid. # So I can't check for the existence of a particular sub. croak "Undefined style: $style" unless defined($Built_In_Styles{$stylepkg}); $stylepkg = 'XML::Parser::' . $stylepkg; } my $htype; foreach $htype (keys %{$args{_HNDL_TYPES}}) { # Handlers explicity given override # handlers from the Style package unless (defined($handlers->{$htype})) { # A handler in the style package must either have # exactly the right case as the type name or a # completely lower case version of it. my $hname = "${stylepkg}::$htype"; if (defined(&$hname)) { $handlers->{$htype} = \&$hname; next; } $hname = "${stylepkg}::\L$htype"; if (defined(&$hname)) { $handlers->{$htype} = \&$hname; next; } } } } unless (defined($handlers->{ExternEnt}) or defined ($handlers->{ExternEntFin})) { if ($args{NoLWP} or $LWP_load_failed) { $handlers->{ExternEnt} = \&file_ext_ent_handler; $handlers->{ExternEntFin} = \&file_ext_ent_cleanup; } else { # The following just bootstraps the real LWP external entity # handler $handlers->{ExternEnt} = \&initial_ext_ent_handler; # No cleanup function available until LWPExternEnt.pl loaded } } $args{Pkg} ||= caller; bless \%args, $class; } # End of new sub setHandlers { my ($self, @handler_pairs) = @_; croak("Uneven number of arguments to setHandlers method") if (int(@handler_pairs) & 1); my @ret; while (@handler_pairs) { my $type = shift @handler_pairs; my $handler = shift @handler_pairs; unless (defined($self->{_HNDL_TYPES}->{$type})) { my @types = sort keys %{$self->{_HNDL_TYPES}}; croak("Unknown Parser handler type: $type\n Valid types: @types"); } push(@ret, $type, $self->{Handlers}->{$type}); $self->{Handlers}->{$type} = $handler; } return @ret; } # End of setHandlers sub parse_start { my $self = shift; my @expat_options = (); my ($key, $val); while (($key, $val) = each %{$self}) { push (@expat_options, $key, $val) unless exists $self->{Non_Expat_Options}->{$key}; } my %handlers = %{$self->{Handlers}}; my $init = delete $handlers{Init}; my $final = delete $handlers{Final}; my $expatnb = new XML::Parser::ExpatNB(@expat_options, @_); $expatnb->setHandlers(%handlers); &$init($expatnb) if defined($init); $expatnb->{_State_} = 1; $expatnb->{FinalHandler} = $final if defined($final); return $expatnb; } sub parse { my $self = shift; my $arg = shift; my @expat_options = (); my ($key, $val); while (($key, $val) = each %{$self}) { push(@expat_options, $key, $val) unless exists $self->{Non_Expat_Options}->{$key}; } my $expat = new XML::Parser::Expat(@expat_options, @_); my %handlers = %{$self->{Handlers}}; my $init = delete $handlers{Init}; my $final = delete $handlers{Final}; $expat->setHandlers(%handlers); if ($self->{Base}) { $expat->base($self->{Base}); } &$init($expat) if defined($init); my @result = (); my $result; eval { $result = $expat->parse($arg); }; my $err = $@; if ($err) { $expat->release; die $err; } if ($result and defined($final)) { if (wantarray) { @result = &$final($expat); } else { $result = &$final($expat); } } $expat->release; return unless defined wantarray; return wantarray ? @result : $result; } # End of parse sub parsestring { my $self = shift; $self->parse(@_); } # End of parsestring sub parsefile { my $self = shift; my $file = shift; local(*FILE); open(FILE, $file) or croak "Couldn't open $file:\n$!"; binmode(FILE); my @ret; my $ret; $self->{Base} = $file; if (wantarray) { eval { @ret = $self->parse(*FILE, @_); }; } else { eval { $ret = $self->parse(*FILE, @_); }; } my $err = $@; close(FILE); die $err if $err; return unless defined wantarray; return wantarray ? @ret : $ret; } # End of parsefile sub initial_ext_ent_handler { # This just bootstraps in the real lwp_ext_ent_handler which # also loads the URI and LWP modules. unless ($LWP_load_failed) { local($^W) = 0; my $stat = eval { require('XML/Parser/LWPExternEnt.pl'); }; if ($stat) { $_[0]->setHandlers(ExternEnt => \&lwp_ext_ent_handler, ExternEntFin => \&lwp_ext_ent_cleanup); goto &lwp_ext_ent_handler; } # Failed to load lwp handler, act as if NoLWP $LWP_load_failed = 1; my $cmsg = "Couldn't load LWP based external entity handler\n"; $cmsg .= "Switching to file-based external entity handler\n"; $cmsg .= " (To avoid this message, use NoLWP option to XML::Parser)\n"; warn($cmsg); } $_[0]->setHandlers(ExternEnt => \&file_ext_ent_handler, ExternEntFin => \&file_ext_ent_cleanup); goto &file_ext_ent_handler; } # End initial_ext_ent_handler sub file_ext_ent_handler { my ($xp, $base, $path) = @_; # Prepend base only for relative paths if (defined($base) and not ($path =~ m!^(?:[\\/]|\w+:)!)) { my $newpath = $base; $newpath =~ s![^\\/:]*$!$path!; $path = $newpath; } if ($path =~ /^\s*[|>+]/ or $path =~ /\|\s*$/) { $xp->{ErrorMessage} .= "System ID ($path) contains Perl IO control characters"; return undef; } my $fh = new IO::File($path); unless (defined $fh) { $xp->{ErrorMessage} .= "Failed to open $path:\n$!"; return undef; } $xp->{_BaseStack} ||= []; $xp->{_FhStack} ||= []; push(@{$xp->{_BaseStack}}, $base); push(@{$xp->{_FhStack}}, $fh); $xp->base($path); return $fh; } # End file_ext_ent_handler sub file_ext_ent_cleanup { my ($xp) = @_; my $fh = pop(@{$xp->{_FhStack}}); $fh->close; my $base = pop(@{$xp->{_BaseStack}}); $xp->base($base); } # End file_ext_ent_cleanup ################################################################### package XML::Parser::Debug; $XML::Parser::Built_In_Styles{Debug} = 1; sub Start { my $expat = shift; my $tag = shift; print STDERR "@{$expat->{Context}} \\\\ (@_)\n"; } sub End { my $expat = shift; my $tag = shift; print STDERR "@{$expat->{Context}} //\n"; } sub Char { my $expat = shift; my $text = shift; $text =~ s/([\x80-\xff])/sprintf "#x%X;", ord $1/eg; $text =~ s/([\t\n])/sprintf "#%d;", ord $1/eg; print STDERR "@{$expat->{Context}} || $text\n"; } sub Proc { my $expat = shift; my $target = shift; my $text = shift; my @foo = @{$expat->{Context}}; print STDERR "@foo $target($text)\n"; } ################################################################### package XML::Parser::Subs; $XML::Parser::Built_In_Styles{Subs} = 1; sub Start { no strict 'refs'; my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::$tag"; eval { &$sub($expat, $tag, @_) }; } sub End { no strict 'refs'; my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::${tag}_"; eval { &$sub($expat, $tag) }; } ################################################################### package XML::Parser::Tree; $XML::Parser::Built_In_Styles{Tree} = 1; sub Init { my $expat = shift; $expat->{Lists} = []; $expat->{Curlist} = $expat->{Tree} = []; } sub Start { my $expat = shift; my $tag = shift; my $newlist = [ { @_ } ]; push @{ $expat->{Lists} }, $expat->{Curlist}; push @{ $expat->{Curlist} }, $tag => $newlist; $expat->{Curlist} = $newlist; } sub End { my $expat = shift; my $tag = shift; $expat->{Curlist} = pop @{ $expat->{Lists} }; } sub Char { my $expat = shift; my $text = shift; my $clist = $expat->{Curlist}; my $pos = $#$clist; if ($pos > 0 and $clist->[$pos - 1] eq '0') { $clist->[$pos] .= $text; } else { push @$clist, 0 => $text; } } sub Final { my $expat = shift; delete $expat->{Curlist}; delete $expat->{Lists}; $expat->{Tree}; } ################################################################### package XML::Parser::Objects; $XML::Parser::Built_In_Styles{Objects} = 1; sub Init { my $expat = shift; $expat->{Lists} = []; $expat->{Curlist} = $expat->{Tree} = []; } sub Start { my $expat = shift; my $tag = shift; my $newlist = [ ]; my $class = "${$expat}{Pkg}::$tag"; my $newobj = bless { @_, Kids => $newlist }, $class; push @{ $expat->{Lists} }, $expat->{Curlist}; push @{ $expat->{Curlist} }, $newobj; $expat->{Curlist} = $newlist; } sub End { my $expat = shift; my $tag = shift; $expat->{Curlist} = pop @{ $expat->{Lists} }; } sub Char { my $expat = shift; my $text = shift; my $class = "${$expat}{Pkg}::Characters"; my $clist = $expat->{Curlist}; my $pos = $#$clist; if ($pos >= 0 and ref($clist->[$pos]) eq $class) { $clist->[$pos]->{Text} .= $text; } else { push @$clist, bless { Text => $text }, $class; } } sub Final { my $expat = shift; delete $expat->{Curlist}; delete $expat->{Lists}; $expat->{Tree}; } ################################################################ package XML::Parser::Stream; $XML::Parser::Built_In_Styles{Stream} = 1; # This style invented by Tim Bray sub Init { no strict 'refs'; my $expat = shift; $expat->{Text} = ''; my $sub = $expat->{Pkg} ."::StartDocument"; &$sub($expat) if defined(&$sub); } sub Start { no strict 'refs'; my $expat = shift; my $type = shift; doText($expat); $_ = "<$type"; %_ = @_; while (@_) { $_ .= ' ' . shift() . '="' . shift() . '"'; } $_ .= '>'; my $sub = $expat->{Pkg} . "::StartTag"; if (defined(&$sub)) { &$sub($expat, $type); } else { print; } } sub End { no strict 'refs'; my $expat = shift; my $type = shift; # Set right context for Text handler push(@{$expat->{Context}}, $type); doText($expat); pop(@{$expat->{Context}}); $_ = ""; my $sub = $expat->{Pkg} . "::EndTag"; if (defined(&$sub)) { &$sub($expat, $type); } else { print; } } sub Char { my $expat = shift; $expat->{Text} .= shift; } sub Proc { no strict 'refs'; my $expat = shift; my $target = shift; my $text = shift; doText($expat); $_ = ""; my $sub = $expat->{Pkg} . "::PI"; if (defined(&$sub)) { &$sub($expat, $target, $text); } else { print; } } sub Final { no strict 'refs'; my $expat = shift; my $sub = $expat->{Pkg} . "::EndDocument"; &$sub($expat) if defined(&$sub); } sub doText { no strict 'refs'; my $expat = shift; $_ = $expat->{Text}; if (length($_)) { my $sub = $expat->{Pkg} . "::Text"; if (defined(&$sub)) { &$sub($expat); } else { print; } $expat->{Text} = ''; } } 1; __END__ =head1 NAME XML::Parser - A perl module for parsing XML documents =head1 SYNOPSIS use XML::Parser; $p1 = new XML::Parser(Style => 'Debug'); $p1->parsefile('REC-xml-19980210.xml'); $p1->parse('Hello World'); # Alternative $p2 = new XML::Parser(Handlers => {Start => \&handle_start, End => \&handle_end, Char => \&handle_char}); $p2->parse($socket); # Another alternative $p3 = new XML::Parser(ErrorContext => 2); $p3->setHandlers(Char => \&text, Default => \&other); open(FOO, 'xmlgenerator |'); $p3->parse(*FOO, ProtocolEncoding => 'ISO-8859-1'); close(FOO); $p3->parsefile('junk.xml', ErrorContext => 3); =begin man .ds PI PI =end man =head1 DESCRIPTION This module provides ways to parse XML documents. It is built on top of L, which is a lower level interface to James Clark's expat library. Each call to one of the parsing methods creates a new instance of XML::Parser::Expat which is then used to parse the document. Expat options may be provided when the XML::Parser object is created. These options are then passed on to the Expat object on each parse call. They can also be given as extra arguments to the parse methods, in which case they override options given at XML::Parser creation time. The behavior of the parser is controlled either by C> and/or C> options, or by L method. These all provide mechanisms for XML::Parser to set the handlers needed by XML::Parser::Expat. If neither C