Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- From: robic0
- Date: Thu, 22 Dec 2005 22:42:08 -0800
On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:
>This post is in response to someone who asked for help trying to
>parse xml into a data structure.
This will fix the final issues with "ForceArray".
Comments have an issue with enclosed "<" or ">" in this
version, other than that they will process normally.
Its a regex issue (shortcoming in my opinion) that can't
match a "not" string. Where I need <!--(all but "<!--")-->.
Where (.*)(?!<!--) won't work in an expression. But I'll
work around that.
This is version .901 from 12-22-05 is the one you want.
This is close to the last post as far as this newsgroup.
Sorry, but I had to get it stable. I've run this on every
big and wierd xml file I could get my hands on. I'm
satisfied with it.
See ya...
print <<EOM;
# XML Regex Parser
# Version .901 - 12/22/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
EOM
use strict;
use warnings;
use Data::Dumper;
#open DATA, "sumfile.xml" or die "can't open datafile...";
#my $gabage1 = join ('', <DATA>);
#close DATA;
my $gabage3 = '
<big name="asdf" date="33" >
asdf
<in1>
<!-- howdy f*%$olks -->
<in2>jjjj</in2>
<small biz="wefwf" ueue = "second" />
<!-- and still more -->
<bar><inside>asgfasdf<insF>2</insF>sdfb</inside></bar>
</in1>
<in2>some in3 content</in2>
asdfb
</big>
';
my @xml_strings = ($gabage3);
my $VERSION = .901;
my $debug = 1;
my $rmv_white_space = 1;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 0;
## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## ----------------------/-------------------------------
my @S_dlim = ('\[' , '['); # use these for debug
my @E_dlim = ('\]' , ']');
#my @S_dlim = (chr(140) , chr(140)); # use these for production
#my @E_dlim = (chr(141) , chr(141));
## -- Process xml data --
##
for (@xml_strings)
{
print "\n",'='x30,"\n$_\n\n";
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}
my $ROOT = {}; # container
my ($last_cnt, $cnt, $i) = (-1, 1, 0);
# should only need 2 iterations max, but wth
while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;
## <?XML-Version ?> , have to check the format of '<?'
while (s/<\?([^<>]*)\?>//i) {} # to void xml
versioning
# while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/i)
{ print "$cnt <$1> = \n" if ($debug); $cnt++}
## <!-- Comments -->, nesting not processed,
## also comments can't have "<" or ">" this version.
if (!$KeepComments) {
while (s/<!--[^<>]*-->//s) {} # to void
comments
} else {
while
(s/<!--([^<>]*)-->/$S_dlim[1]$cnt$E_dlim[1]/s) {
# while
(s/<!--([\w\s]*)(?!<!--)-->/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <!-- --> = $1\n" if
($debug);
$ROOT->{$cnt} = { comment => $1 };
$cnt++;
}
}
## <Tag/> , no content
while
(s/<([0-9a-zA-Z]+)\/>/$S_dlim[1]$cnt$E_dlim[1]/i) {
print "$cnt <$1> = \n" if ($debug);
$ROOT->{$cnt} = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([0-9a-zA-Z]+)([ ]+[0-9a-zA-Z]+[ ]*=[
]*"[^<]*")+[ ]*\/>/$S_dlim[1]$cnt$E_dlim[1]/i) {
print "$cnt <$1> = attr: $2\n" if ($debug);
$ROOT->{$cnt} = { $1 => getAttrHash($2) };
$cnt++;
}
## <Tag> Content </Tag>
while
(s/<([0-9a-zA-Z]+)>([^<]*)<\/\1>/$S_dlim[1]$cnt$E_dlim[1]/i) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2,
$ROOT);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) {
adjustForSingleItemArrays ($hcontent); }
} else {
if (exists
$hcontent->{'content'} && scalar(@{$hcontent->{'content'}}) == 1) {
if (!$ForceArray ) {
$unknown =
${$hcontent->{'content'}}[0];
} else {$unknown =
$hcontent->{'content'}; }
}
if (!$ForceArray) {
adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT->{$cnt} = { $1 => $unknown };
$cnt++;
}
## <Tag Attributes> Content </Tag>
while (s/<([0-9a-zA-Z]+)([ ]+[0-9a-zA-Z]+[ ]*=[
]*"[^<]*")+[ ]*>([^<]*)<\/\1>/$S_dlim[1]$cnt$E_dlim[1]/i) {
print "$cnt <$1> = attr: $2, content: $3\n" if
($debug);
my $hattrib = getAttrHash($2);
if (length($3) > 0) {
my $hcontent = getContentHash($3,
$ROOT);
if (!$ForceArray) {
adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each
(%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
if ($last_cnt != $cnt) {
$i++ ; print "** End pass $i\n" if ($debug);
}
}
if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag
closure:\n$_";
} else {
print "\n** Itterations = $i\n** ForceArray =
$ForceArray\n** KeepRoot = $KeepRoot\n** KeepComments =
$KeepComments\n\n";
#print Dumper($ROOT);
my $outer_element = $cnt-1;
if (exists $ROOT->{$outer_element}) {
my $htodump = $ROOT->{$outer_element};
if (!$KeepRoot && keys (%{$htodump}) == 1) {
my ($key,$val) = each (%{$htodump});
$htodump = $val;
}
my $tmp = {};
%{$tmp} = %{$htodump};
print Dumper($tmp);
} else {print "nothing to output!\n";}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attstr = shift;
my $ahref = {};
return $ahref unless (defined $attstr);
while ($attstr =~ s/[ ]*([0-9a-zA-Z]+)[ ]*=[ ]*"([^=]*)"[
]*//i) {
$ahref->{$1} = $2;
}
return $ahref;
}
##
sub getContentHash
{
my ($attstr,$hStore) = @_;
my $ahref = {};
return $ahref unless (defined $attstr && defined $hStore);
my @ary = ();
while ($attstr =~
s/([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//i) {
if (defined $1) {
push (@ary, $1);
}
elsif (defined $2 && exists $hStore->{$2}) {
my ($key,$val) = each (%{$hStore->{$2}});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
}
if (scalar(@ary) > 0) { $ahref->{'content'} = [@ary]; }
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 &&
ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}
__END__
# XML Regex Parser
# Version .901 - 12/22/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
==============================
<big name="asdf" date="33" >
asdf
<in1>
<!-- howdy f*%$olks -->
<in2>jjjj</in2>
<small biz="wefwf" ueue = "second" />
<!-- and still more -->
<bar><inside>asgfasdf<insF>2</insF>sdfb</inside></bar>
</in1>
<in2>some in3 content</in2>
asdfb
</big>
1 <small> = attr: biz="wefwf" ueue = "second"
2 <in2> = jjjj
3 <insF> = 2
4 <inside> = asgfasdf[3]sdfb
5 <bar> = [4]
6 <in1> = [2][1][5]
7 <in2> = some in3 content
8 <big> = attr: name="asdf" date="33", content: asdf[6][7]asdfb
** End pass 1
** Itterations = 1
** ForceArray = 0
** KeepRoot = 0
** KeepComments = 0
$VAR1 = {
'in2' => 'some in3 content',
'date' => '33',
'name' => 'asdf',
'content' => [
'asdf',
'asdfb'
],
'in1' => {
'small' => {
'ueue' => 'second',
'biz' => 'wefwf'
},
'bar' => {
'inside' => {
'insF' => '2',
'content' => [
'asgfasdf',
'sdfb'
]
}
},
'in2' => 'jjjj'
}
};
.
- Follow-Ups:
- Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- From: Bart Van der Donck
- Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- From: Tad McClellan
- Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- References:
- Prev by Date: Net::SSH::Perl on tru64
- Next by Date: Re: Exiting without printing
- Previous by thread: Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- Next by thread: Re: My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)
- Index(es):
Relevant Pages
|