RXParse module v.91 (by robic0)



RXParse, Version .91 (by robic0) 6/8/06

- Non-normalized, parsed internal entities
# Unicode character reference
# General reference
# Parameter reference
- Recursive expansion of general references in content and attvalue
- DTD entity not parsed yet
- Other bug fixes





#################################################################
# XML/Xhtml/Html - RXParse parse/edit/filter module (by robic0)
# ------------------------------------------------------
# Compliant w3c XML: 1.1
# Resources:
# Extensible Markup Language (XML) 1.1
# W3C Recommendation 04 February 2004,
# 15 April 2004
# http://www.w3.org/TR/xml11/#NT-PITarget
#################################################################
$|=1;
package RXParse;
use strict;
use warnings;
use Carp;
use vars qw(@ISA);
@ISA = qw();

my $VERSION = .91;

#==========================
# RXParse package globals
#==========================
my (
%Dflth,
%ErrMsg,
$Nstrt,$Nchar,$Name,
@UC_Nstart,@UC_Nchar,
$RxParseXP1,
$RxAttr,
$RxAttr_DL1,
$RxAttr_DL2,
$RxAttr_RM,
$RxPi,
$RxENTITY,
%dflt_general_ent_subst,
%dflt_parameter_ent_subst
);
my $parsinitflg = 0;

if (!$parsinitflg) {
InitParser();
$parsinitflg = 1;
}

#========================
# RXParse user methods
#========================
sub new
{
my ($class, @args) = @_;
my $self = {};
$self->{'debug'} = 0;
$self->{'ignore_errors'} = 0;
Cleanup($self);
setDfltHandlers($self);
return bless ($self, $class);
}

sub original_content
{
my $self = shift;
if (defined $self->{'origcontent'} &&
ref($self->{'origcontent'}) eq 'SCALAR') {
return ${$self->{'origcontent'}};
}
return "";
}

sub setMode
{
my ($self, @args) = @_;

if (scalar(@args)) {
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;

if (lc($name) eq 'debug') {
$self->{'debug'} = 0;
$self->{'debug'} = 1 if (defined $val && $val);
}
elsif (lc($name) eq 'ignore_errors') {
$self->{'ignore_errors'} = 0;
$self->{'ignore_errors'} = 1 if (defined $val && $val);
}
# add more here
}
}
}

sub setDfltHandlers
{
my ($self, $name) = @_;
if (defined $name) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $Dflth{$hname}) {
$self->{$hname} = $Dflth{$hname};
}
} else {
foreach my $key (keys %Dflth) {
$self->{$key} = $Dflth{$key};

}
}
}

sub setHandlers
{
my ($self, @args) = @_;
my %oldh = ();
if (scalar(@args)) {
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $self->{$hname}) {
$oldh{$name} = $self->{$hname};
if (ref($val) eq 'CODE') {
$self->{$hname} = $val;
} else {
# fatal error if not a CODE ref
throwX($self, 'FATAL', '32', $name);
}
}
}
}
return %oldh;
}

sub parse
{
my ($self, $data, @args) = @_;
if ($self->{'InParse'}) {
# fatal error if already in parse
throwX($self, 'FATAL', '30');
}
unless (defined $data) {
# fatal error if data source not defined
throwX($self, 'FATAL', '31');
}
$self->{'InParse'} = 1;

# use XP1 processor (for now)
$self->{'proctype'} = 'XP1';
if (ref($data) eq 'SCALAR') {
print "SCALAR ref\n" if ($self->{'debug'});
XP1 ($self, 1, $data);
}
elsif (ref(\$data) eq 'SCALAR') {
print "SCALAR string\n" if ($self->{'debug'});
XP1 ($self, 1, \$data);
} else {
if (ref($data) ne 'GLOB' && ref(\$data) ne 'GLOB') {
# data source not string or filehandle, nor reference to one
throwX($self, 'FATAL', '33');
}
print "GLOB ref or filehandle\n" if ($self->{'debug'});
XP1 ($self, 0, $data);
}
Cleanup($self);
}

#==========================
# RXParse non-user methods
#==========================
sub Cleanup
{
my $self = shift;
InitEntities($self);
$self->{'origcontent'} = undef;
$self->{'InParse'} = 0;
}

sub InitEntities
{
my $self = shift;
# initial compiled regexp
$self->{'Entities'} = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?:([0-9]+)|(x[0-9a-fA-F]+)))";
# ( 4 4|5 5)
$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
# 1 12 23 3
# initial entity hash
$self->{'general_ent_subst'} = {%dflt_general_ent_subst};
$self->{'parameter_ent_subst'} = {%dflt_parameter_ent_subst};
$self->{'ring_ent_subst'} = {};
}

sub XP1 # xp1 processor, parse only, non-edit
{
my ($self, $BUFFERED, $rpl_mk) = @_;
my ($markup_file);
my $parse_ln = '';
my $dyna_ln = '';
my $ref_parse_ln = \$parse_ln;
my $ref_dyna_ln = \$dyna_ln;
if ($BUFFERED) {
$ref_parse_ln = $rpl_mk;
$ref_dyna_ln = \$dyna_ln;
} else {
# assume its a ref to a global or global itself
$markup_file = $rpl_mk;
$ref_dyna_ln = $ref_parse_ln;
}
my $ln_cnt = 0;
my $complete_comment = 0;
my $complete_cdata = 0;
my @Tags = ();
my $havroot = 0;
my $last_cpos = 0;
my $done = 0;
my $content = '';
my $altcontent = undef;

$self->{'origcontent'} = \$content;

while (!$done)
{
$ln_cnt++;

# stream processing (if not buffered)
if (!$BUFFERED) {
if (!($_ = <$markup_file>)) {
# just parse what we have
$done = 1;
# boundry check for runnaway
if (($complete_comment+$complete_cdata) > 0) {
$ln_cnt--;
}
} else {
$$ref_parse_ln .= $_;

## buffer if needing comment/cdata closure
next if ($complete_comment && !/-->/);
next if ($complete_cdata && !/\]\]>/);

## reset comment/cdata flags
$complete_comment = 0;
$complete_cdata = 0;

## flag serialized comments/cdata buffering
if (/(<!--)|(<!\[CDATA\[)/)
{
if (defined $1) { # complete comment
if ($$ref_parse_ln !~ /<!--.*?-->/s) {
$complete_comment = 1;
next;
}
}
elsif (defined $2) { # complete cdata
if ($$ref_parse_ln !~ /<!\[CDATA\[.*?\]\]>/s) {
$complete_cdata = 1;
next;
}
}
}
## buffer until '>' or eof
next if (!/>/);
}
} else {
$ln_cnt = 1;
$done = 1;
}

## REGEX Parsing loop
while ($$ref_parse_ln =~ /$RxParseXP1/g)
{
## handle contents
if (defined $14) {
$content .= $14;
$last_cpos = pos($$ref_parse_ln);
next;
}
## valid content here ... can be taken off
print "-"x20,"\n" if ($self->{'debug'});
if (length ($content)) {
## check reserved characters in content
if ($content =~ /[<>]/) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
## mark-up characters in content
throwX($self, 'OVR', '01', $content, $ref_parse_ln, $last_cpos, $ln_cnt);
}
if (!scalar(@Tags)) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
if ($content =~ /[^\s]/s) {
## content at root level
throwX($self, 'OVR', '02', $content, $ref_parse_ln, $last_cpos, $ln_cnt);
}
}
# substitute special xml characters, then call content handler with $content
# ------------------------------------------------------
# $content has to be a constant if xml reserved chars
# are found, copy altered string to pass to handler
# otherwise pass original $content
# ------------------------------------------------------
if (defined ($altcontent = convertEntities ($self, \$content))) {
$self->{'hchar'}($self, $$altcontent);
} else {
$self->{'hchar'}($self, $content);
}
#print "14 $content\n" if ($self->{'debug'});
#print "-"x20,"\n" if ($self->{'debug'});
$content = '';
}
#if ($show_pos && $debug) {
# my $rr = pos $$ref_parse_ln;
# print "$rr ";
#}

## <tag> or </tag> or <tag/>
if (defined $2)
{
my ($l1,$l3) = (length($1),length($3));
if (($l1+$l3)==0) { ## <tag>
if (!scalar(@Tags) && $havroot) {
## new root node <tag>
throwX($self, 'OVR', '03', $2, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
push @Tags,$2;
$havroot = 1;
# call start tag handler with $2
$self->{'hstart'}($self, $2);
}
elsif ($l1==1 && $l3==0) { ## </tag>
my $pval = pop @Tags;
if (!defined $pval) {
## missing start tag </tag>
throwX($self, 'OVR', '04', $2, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
elsif ($2 ne $pval) {
## expected closing tag </tag>
throwX($self, 'OVR', '05', $pval, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
# call end tag handler with $2
$self->{'hend'}($self, $2);
}
elsif ($l1==0 && $l3==1) { ## <tag/>
if (!scalar(@Tags) && $havroot) {
## new root node <tag/>
throwX($self, 'OVR', '06', $2, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
$havroot = 1; # first and only <root/>
# call start tag handler, then end tag handler, with $2
$self->{'hstart'}($self, $2);
$self->{'hend'}($self, $2);
} else {
## <//node//> errors
## hard error, just report
throwX($self, 'HARD', '07', "$1$2$3", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
#print "2 TAG: $1$2$3\n" if ($self->{'debug'});
}
## <tag attrib/> or <tag attrib>
elsif (defined $5)
{
my $l7 = length($7);

## attributes
my $attref = getAttrARRAY($self, $6);
unless (ref($attref)) {
## missing or extra token
## hard error, just report
throwX($self, 'HARD', '08', $attref, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
if ($l7==0) { ## <tag attrib>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX($self, 'OVR', '03', $5, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
push @Tags,$5;
$havroot = 1;
# call start tag handler with $5 and attributes @{$attref}
$self->{'hstart'}($self, $5, @{$attref});
}
elsif ($l7==1) { ## <tag attrib/>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX($self, 'OVR', '06', $7, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
$havroot = 1; # first and only <root attrib/>
# call start tag handler with $5 and attributes @{$attref}, then end tag handler with $5
$self->{'hstart'}($self, $5, @{$attref});
$self->{'hend'}($self, $5);
} else {
## syntax error
## hard error, just report
throwX($self, 'HARD', '07', "$5$6$7", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
#if ($self->{'debug'}) {
# print "5,6 TAG: $5 Attr: $6$7\n" ;
#}
}
## XMLDECL or PI (processing instruction)
elsif (defined $8)
{
my $pi = $8;
# xml declaration ?
if ($pi =~ /^xml(.*?)$/) {
my $attref = getAttrARRAY($self, $1);
unless (ref($attref)) {
## missing or extra token in xmldecl
## hard error, just report
throwX($self, 'HARD', '14', $attref, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
#if (!scalar(@{$attref})) {
# ## missing xmldecl parameters
# throwX($self, 'OVR', '15', $pi, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
#}
my ($version,$encoding,$standalone);
while (my ($name,$val) = splice (@{$attref}, 0, 2)) {
if ('version' eq lc($name) && !defined $version) {
if ($val !~ /^[0-9]\.[0-9]+$/) {
## invalid version character data in xmldecl
throwX($self, 'OVR', '16', "$name = '$val'", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
$version = $val;
} elsif ('encoding' eq lc($name) && !defined $encoding) {
if ($val !~ /^[A-Za-z][\w\.-]*$/) {
## invalid encoding character data in xmldecl
throwX($self, 'OVR', '17', "$name = '$val'", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
$encoding = $val;
} elsif ('standalone' eq lc($name) && !defined $standalone) {
if ($val !~ /^(?:yes)|(?:no)$/) {
## invalid standalone character data in xmldecl
throwX($self, 'OVR', '18', "$name = '$val'", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
$standalone = ($val eq 'yes' ? 1 : 0);
} else {
## unknown xmldecl parameter
throwX($self, 'OVR', '19', "$name = '$val'", $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
}
if (!defined $version) {
# missing version in xmldecl
## hard error, just report
throwX($self, 'HARD', '20', $pi, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
# call xmldecl handler
$self->{'hxmldecl'}($self, $version, $encoding, $standalone);
}
# PI - processing instruction
elsif ($pi =~ /$RxPi/) {
# call pi handler
$self->{'hproc'}($self, $1, $2);
} else {
# unknown PI data
throwX($self, 'HARD', '21', $pi, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
next;
}
#print "8 VERSION: $8\n" if ($self->{'debug'});
}
## META
elsif (defined $4) {
# If doctype is HTML then META is not closed
# parse meta data, call handler
$self->{'hmeta'}($self, $4);
#print "4 META: $4\n" if ($self->{'debug'});
}
## DOCTYPE
elsif (defined $9) {
# parese doctype, call handler
$self->{'hdoctype'}($self, $9);
#print "9 DOCTYPE: $9\n" if ($self->{'debug'});
}
## CDATA
elsif (defined $10) {
if (!scalar(@Tags)) {
## CDATA content at root
throwX($self, 'OVR', '09', $10, $ref_parse_ln, pos($$ref_parse_ln), $ln_cnt);
}
# call cdata handler
$self->{'hcdata'}($self, $10);
#print "10 CDATA: $10\n" if ($self->{'debug'});
}
## COMMENT
elsif (defined $11) {
# call comment handler
$self->{'hcomment'}($self, $11);
#print "11 COMMENT: $11\n" if ($self->{'debug'});
}
## ATTLIST
elsif (defined $12) {
# parese attlist, call handler
$self->{'hattlist'}($self, $12);
#print "12 ATTLIST: $12\n" if ($self->{'debug'});
}
## ENTITY
elsif (defined $13) {
# parese entity, call handler
my ($entdata, $entdata_added, $entname) = ($13, undef, '');
if ($entdata =~ /$RxENTITY/) {
if (defined $1) {
# general entity replacement
$entdata_added = addEntity($self, 0, $1, $3);
$entname = "&$1";
} else {
# parameter entity replacement
$entdata_added = addEntity($self, 1, $2, $3);
$entname = "&$2";
}
}
else {
# unknown ENTITY data
#
}
if (defined $entdata_added) {
$self->{'hentity'}($self, $entname, $$entdata_added);
} else {
$self->{'hentity'}($self, $entname, $entdata);
}
#print "13 ENTITY: $13\n" if ($self->{'debug'});
}
}
$$ref_dyna_ln = $content;
$content = '';
}
if (!$havroot) {
# not valid xml
throwX($self, 'OVR', '10');
}
if (scalar(@Tags)) {
my $str = '';
while (defined (my $etag = pop @Tags)) {
$str .= ", /$etag";
}
$str =~ s/^, +//;
# missing end tag
throwX($self, 'OVR', '11', $str);
}
if ($$ref_dyna_ln =~ /[^\s]/s) {
if ($$ref_dyna_ln =~ /[<>]/) {
# mark-up characters in content
throwX($self, 'OVR', '12', $$ref_dyna_ln);
} else {
# content at root level (end)
throwX($self, 'OVR', '13', $$ref_dyna_ln);
}
}
$self->{'origcontent'} = undef;
return 1;
}

sub getAttrARRAY
{
my ($self, $attrstr) = @_;
my $aref = [];
my ($alt_attval, $attval, $rx);

while ($attrstr =~ s/$RxAttr//) {
push @{$aref},$1;
if ($2 eq "'") {
$rx = \$RxAttr_DL1;
} else {
$rx = \$RxAttr_DL2;
}
if ($attrstr =~ s/$$rx//) {
if (defined $1) {
push @{$aref},$1;
next;
}
$attval = $2;
if (defined ($alt_attval = convertEntities ($self, \$attval))) {
push @{$aref},$$alt_attval;
next;
}
push @{$aref},$attval;
next;
}
return $attrstr;
}
if ($attrstr=~/$RxAttr_RM/) {
$attrstr =~ s/^\s+//s; $attrstr =~ s/\s+$//s;
return $attrstr if (length($attrstr));
}
return $aref;
}

sub convertEntities
{
my ($self, $str_ref, $opts) = @_;
my $alt_str = '';
my $res = 0;
my ($entchr);

# Usage info:
# Option bitmask: 1=char reference, 2=general reference, 4=parameter reference
# Default option is char and general references (&)
# Ignore Parameter references (%) in Attvalue and Content
# Process PE's in DTD and Entity decls

$opts = 3 unless defined $opts;

while ($$str_ref =~ /$self->{'RxEntConv'}/gc)
{
# Unicode character reference
if (defined $4) {
# decimal
if (($opts & 1) && defined ($entchr = getEntityUchar($self, $4))) {
$alt_str .= "$1$entchr";
$res = 1;
} else {
$alt_str .= "$1$2#$4;";
}
} elsif (defined $5) {
# hex
if (($opts & 1) && length($5) < 9 && defined ($entchr = getEntityUchar($self, hex($5)))) {
$alt_str .= "$1$entchr";
$res = 1;
} else {
$alt_str .= "$1$2#$5;";
}
}
else {
# General reference
if ($2 eq '&') {
if (($opts & 2) && exists $self->{'general_ent_subst'}->{$3}) {
$alt_str .= $1;

# expand general references,
# bypass if seen in the recursion ring
# ----
if (defined $self->{'ring_ent_subst'}->{$3}) {
$alt_str .= "$1$2$3;";
} else {
# recurse expansion
# ----
my ($entname, $alt_entval) = ($3, undef);
my $entval = $self->{'general_ent_subst'}->{$entname};
$self->{'ring_ent_subst'}->{$entname} = 1;

if (defined ($alt_entval = convertEntities ($self, \$entval, 2))) {
$alt_str .= $$alt_entval;
} else {
$alt_str .= $self->{'general_ent_subst'}->{$entname};
}
$self->{'ring_ent_subst'}->{$entname} = undef;
$res = 1;
}
} else {
$alt_str .= "$1$2$3;";
}
} else {
# Parameter reference
if (($opts & 4) && exists $self->{'parameter_ent_subst'}->{$3}) {
$alt_str .= "$1$self->{'parameter_ent_subst'}->{$3}";
$res = 1;
} else {
$alt_str .= "$1$2$3;";
}
}
}
}
if ($res) {
$alt_str .= substr $$str_ref, pos($$str_ref);
return \$alt_str;
}
return undef;
}

sub getEntityUchar
{
my ($self, $code) = @_;
if (($code >= 0x01 && $code <= 0xD7FF) ||
($code >= 0xE000 && $code <= 0xFFFD) ||
($code >= 0x10000 && $code <= 0x10FFFF)) {
return chr($code);
}
return undef;
}

sub addEntity
{
my ($self, $peflag, $entname, $entval) = @_;

# Non-normalized, internal entities only
# (no external defs yet, ie:SYSTEM/PUBLIC/NDATA)
return undef unless
($entval =~ s/^\s*'([^']*?)'\s*$/$1/s || $entval =~ s/^\s*"([^"]*?)"\s*$/$1/s);

# Replacement text: convert parameter and character references only
my ($alt_entval);
if (defined ($alt_entval = convertEntities ($self, \$entval, 5))) {
$entval = $$alt_entval;
}
my $enttype = 'general_ent_subst';
$enttype = 'parameter_ent_subst' if ($peflag);

if (exists $self->{'$enttype'}->{$entname}) {
# warn, pre-existing ent name
return undef;
}
$self->{$enttype}->{$entname} = $entval;
$self->{'Entities'} .= "|(?:$entname)";
# recompile regexp
$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
return \$entval;
}


# default handlers
# ------------------
sub dflt_start {
my ($self, $el, @attr) = @_;
if ($self->{'debug'}) {
print "start _: $el\n";
while (my ($name,$val) = splice (@attr, 0,2)) {
print " "x12,"$name = $val\n";
}
}
}
sub dflt_char {
my ($self, $str) = @_;
if ($self->{'debug'}) {
print "char _: $str\n";
print "-"x20,"\n";
}
}
sub dflt_end {my ($self, $el) = @_;print "end _: /$el\n" if ($self->{'debug'});}
sub dflt_cdata {my ($self, $str) = @_;print "cdata _: $str\n" if ($self->{'debug'});}
sub dflt_comment {my ($self, $str) = @_;print "comnt _: $str\n" if ($self->{'debug'});}
sub dflt_meta {my ($self, $str) = @_;print "meta _: $str\n" if ($self->{'debug'});}
sub dflt_attlist {my ($self, $parm) = @_;print "attlist_h _: $parm\n" if ($self->{'debug'});}
sub dflt_doctype {my ($self, $parm) = @_;print "doctype_h _: $parm\n" if ($self->{'debug'});}
sub dflt_element {my ($self, $parm) = @_;print "element_h _: $parm\n" if ($self->{'debug'});}

sub dflt_entity {
my ($self, $entname, $entval) = @_;
if ($self->{'debug'}) {
print "entity_h _: $entname = $entval\n";
}
}

sub dflt_xmldecl {
my ($self, $version, $encoding, $standalone) = @_;

if ($self->{'debug'}) {
print "xmldecl_h _: version = $version\n" if (defined $encoding);
print " "x14,"encoding = $encoding\n" if (defined $encoding);
print " "x14,"standalone = $standalone\n" if (defined $standalone);
}
}
sub dflt_proc {
my ($self, $target, $data) = @_;

if ($self->{'debug'}) {
print "proc_h _: target = $target\n";
print " "x14,"data = $data\n";
}
}
sub dflt_error {my ($self, $errlvl, $errno, $estr, $estr_basic) = @_;print "$estr\n" if ($self->{'debug'});}



# ======================
# RXParse global init
# ======================
sub InitParser
{
%Dflth = (
'hstart' => \&dflt_start,
'hend' => \&dflt_end,
'hchar' => \&dflt_char,
'hcdata' => \&dflt_cdata,
'hcomment' => \&dflt_comment,
'hmeta' => \&dflt_meta,
'hattlist' => \&dflt_attlist,
'hentity' => \&dflt_entity,
'hdoctype' => \&dflt_doctype,
'helement' => \&dflt_element,
'hxmldecl' => \&dflt_xmldecl,
'hproc' => \&dflt_proc,
'herror' => \&dflt_error,
);
@UC_Nstart = (
"\\x{C0}-\\x{D6}",
"\\x{D8}-\\x{F6}",
"\\x{F8}-\\x{2FF}",
"\\x{370}-\\x{37D}",
"\\x{37F}-\\x{1FFF}",
"\\x{200C}-\\x{200D}",
"\\x{2070}-\\x{218F}",
"\\x{2C00}-\\x{2FEF}",
"\\x{3001}-\\x{D7FF}",
"\\x{F900}-\\x{FDCF}",
"\\x{FDF0}-\\x{FFFD}",
"\\x{10000}-\\x{EFFFF}",
);
@UC_Nchar = (
"\\x{B7}",
"\\x{0300}-\\x{036F}",
"\\x{203F}-\\x{2040}",
);
$Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
$Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
$Name = "(?:$Nstrt$Nchar*?)";
#die "$Name\n";

$RxParseXP1 =
qr/(?:<(?:(?:(\/*)($Name)\s*(\/*))|(?:META(.*?))|(?:($Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(?:(?:DOCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( 4 4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3))))>)|4 4

$RxAttr = qr/^\s+($Name)\s*=\s*("|')/;

$RxAttr_DL1 = qr/^(?:([^'&]*?)|([^']*?))'/;
$RxAttr_DL2 = qr/^(?:([^"&]*?)|([^"]*?))"/;
$RxAttr_RM = qr/[^\s\n]+/;
$RxPi = qr/^($Name)\s+(.*?)$/s;

#[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
#[53] AttDef ::= S Name S AttType S DefaultDecl

$RxENTITY = qr/^\s+(?:($Name)|(?:%\s+($Name)))\s+(.*?)$/s;
# ( 1 1|( 2 2)) 3 3
%dflt_general_ent_subst = (
'amp' =>'&',
'gt' =>'>',
'lt' =>'<',
'apos'=>"'",
'quot'=>"\""
);
%dflt_parameter_ent_subst = ();

%ErrMsg = (
'01' => "\"mark-up or reserved characters in content (line %s, col %s), malformed element? '%s'\", \$line, \$col, \$datastr",
'02' => "\"content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'03' => "\"element wants to be new root node (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'04' => "\"missing start tag for '/%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'05' => "\"expected closing tag '/%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'06' => "\"element wants to be new root node (line %s, col %s): '%s/'\", \$line, \$col, \$datastr",
'07' => "\"tag syntax '%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'08' => "\"invalid, missing or extra tokens in attribute asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
'09' => "\"CDATA content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'10' => "\"not a valid xml document\"",
'11' => "\"missing end tag '%s'\", \$datastr",
'12' => "\"mark-up or reserved characters in content (end), malformed element? '%s'\", \$datastr",
'13' => "\"content at root level (end): '%s'\", \$datastr",
'14' => "\"invalid, missing or extra tokens in xmldecl asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
'15' => "\"missing xmldecl parameters (line %s, col %s): %s\", \$line, \$col, \$datastr",
'16' => "\"invalid 'version' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'17' => "\"invalid 'encoding' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'18' => "\"invalid 'standalone' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'19' => "\"unknown xmldecl parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
'20' => "\"missing xmldecl 'version' parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
'21' => "\"unknown or missing processing instruction parameters (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'30' => "\"already in parse\"",
'31' => "\"data source not defined\"",
'32' => "\"handler '%s' is not a CODE reference\", \$datastr",
'33' => "\"data source not string or filehandle, nor reference to one\"",
);
}

sub throwX
{
my ($self, $errlvl, $errno, $datastr, $lrefseg, $cseg_err, $l_tot) = @_;
my ($line, $col, $estr, $estr_basic) = (0,0,'','');
if (defined $lrefseg) {
($line,$col) = getRealColumn($lrefseg, $l_tot, $cseg_err);
}
die "No such error message ($errno)\n" if (!exists $ErrMsg{$errno});

my $ctmpl = "\$estr_basic = sprintf ($ErrMsg{$errno});";
eval $ctmpl;
$estr = "rp_error_$errno, $estr_basic";

# call error handler
$self->{'herror'}($self, $errlvl, $errno, $estr, $estr_basic);

if ($errlvl eq 'FATAL') {
Cleanup($self); croak $estr."\n";
}
elsif (!$self->{'ignore_errors'} && ($errlvl eq 'HARD' || $errlvl eq 'OVR')) {
Cleanup($self); croak $estr."\n";
}
}

sub getRealColumn
{
my ($lrefseg, $l_tot, $cseg_err) = @_;
my $cseg_offset = 0;
my $save_pos = pos($$lrefseg);
pos($$lrefseg) = 0;
my ($lseg_tot, $lseg_offset) = (0,1);

while ($$lrefseg =~ /\n/g) {
$lseg_tot++;
if (pos($$lrefseg) < $cseg_err) {
$cseg_offset = pos($$lrefseg);
$lseg_offset++;
next;
}
if ($l_tot <= 1) {
$lseg_tot = $l_tot;
last;
}
}
pos($$lrefseg) = $save_pos;
return ($l_tot-$lseg_tot+$lseg_offset, $cseg_err-$cseg_offset);
}



1;

__END__


.



Relevant Pages

  • RE: AddHandler
    ... long as you need your event handler to run. ... If you only use a local variable to hold a reference to the text box, ... handler will stop to work after the garbage collector sweeps it away. ... Private Sub ThisDocument_OpenHandles ThisDocument.Open ...
    (microsoft.public.vsnet.vstools.office)
  • Re: Similar to SPLIT
    ... Both test examples print the first and last character. ... Other speed-ups would be to set a vba library reference, ... Sub TestIt() ... Dim v As Variant ...
    (microsoft.public.excel.programming)
  • RXParse module v.90 (by robic0)
    ... When I release version 1 of RXParse, I anticipate that I will provide a ftp site ... sub original_content ... then call content handler with $content ... # call start tag handler with $2 ...
    (comp.lang.perl.misc)
  • CRAP CODE CHRONICLES: Xml
    ... this will be painfull for all the XML experts here on this board. ... sub original_content ... # call new_parse handler ... # call start tag handler with $2 ...
    (comp.lang.perl.misc)
  • RXParse 1.2
    ... this will not be the focus for RXParse. ... Version 1.4 will contain the XP3 engine code to do inline replacement within the SAX handler. ... sub original_content ... # call start tag handler with $2 ...
    (comp.lang.perl.misc)