subclassing HTML::Parser

From: Andrew Gaffney (agaffney_at_skylineaero.com)
Date: 07/30/04


Date: Thu, 29 Jul 2004 19:53:12 -0500
To: beginners <beginners@perl.org>

I've created a module that uses HTML::Parser to parse some HTML and create a
tree structure. Someone suggested using HTML::TreeBuilder, but my HTML
contains embedded HTML::Mason code, and HTML::TreeBuilder doesn't handle that
well at all. HTML::TreeBuilder also adds <body>, <head>, and <html> tags when
there aren't any in the document it is parsing. The files I'm using this with
are only parts of HTML pages, so I don't want that stuff added.

My module works well enough, but I'm getting to the point where I need multiple
parse trees existing at the same time in a mod_perl environment. The way my
module is now, they could get mixed up, because I can't find a way to pass a
custom variable to the event handler subroutines of HTML::Parser.

I've figured that if I subclass it, I can create a new object for each parse
tree instead of just returning an array reference. Here is my current code:

package SkylineEdit;

use strict;
use warnings;

use HTML::Parser ();

# Exporter has to be loaded explicitly: merely naming it in @ISA only works
# when some other module happens to have loaded it already.
require Exporter;
our @ISA = ('Exporter');
our @EXPORT = ('html_to_htmltree', 'htmltree_to_html', 'get_node_content',
'set_node_content');

# File-level state shared by the parser callbacks and the serialiser.
# html_to_htmltree() re-initialises all four before every parse.
my $htmltree;    # root of the tree being built: [ { tag => 'document', content => [...] } ]
my $node;        # array ref that newly parsed children are appended to
my @prevnodes;   # stack of enclosing child lists; pushed by start(), popped by end()
my $htmloutput;  # buffer that descend_htmltree() appends generated markup to

# start($tagname, \%attr)
#
# HTML::Parser start-tag handler (registered with the "tagname, attr"
# argspec).  Creates a node hash for the element, appends it to the child
# list currently being filled ($node) and, for elements that can have
# children, makes the new element's child list the current one.
#
# Node layout: the element's attributes as plain keys, plus the reserved
# keys 'tag' (element name) and 'content' (array ref of children).  The
# reserved keys are assigned after the attributes so that an attribute
# called "tag" can no longer replace the element name.  An attribute called
# "content" is still shadowed by the child list, as before.
sub start {
   my ($tagname, $attr) = @_;

   my %newnode = %{ $attr || {} };
   $newnode{tag}     = $tagname;
   $newnode{content} = [];
   push @{$node}, \%newnode;

   # Void elements never get an end tag, so end() would never pop the stack
   # for them and every following sibling would end up nested inside the
   # void element.  Record them as leaves instead.
   return if $tagname =~ /\A(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\z/;

   push @prevnodes, $node;
   $node = $newnode{content};
   return;
}

# end($tagname)
#
# HTML::Parser end-tag handler (registered with the "tagname" argspec; the
# name itself is not used).  Closes the current element by making its
# parent's child list the current insertion point again.
sub end {
   # html_to_htmltree() seeds @prevnodes with a single entry, so a depth of
   # one means we are already at document level.  An end tag seen there has
   # no matching start tag; popping anyway would move the insertion point
   # out of the document node (and, one stray tag later, to undef), so that
   # everything parsed afterwards is lost from the returned tree.
   return if @prevnodes <= 1;

   $node = pop @prevnodes;
   return;
}

# text($text)
#
# HTML::Parser text handler.  Drops a single trailing newline from the text
# run and, unless nothing is left, stores the run as a plain string in the
# child list currently being filled.
sub text {
   my ($text) = @_;

   chomp $text;
# $text =~ s/(^\n|\n$)//gs;

   # Runs that were empty (or just one newline) produce no node at all.
   return if $text eq '';

   push @{$node}, $text;
}

# set_node_content($htmltree, $node, $content)
#
# Replaces the children of one element with the single item $content.
#
#   $htmltree - tree as returned by html_to_htmltree()
#   $node     - dotted node path such as "0.2.1"; the leading "N." names the
#               document root and is dropped, every following number is an
#               index into the 'content' list of the element reached so far
#   $content  - value that becomes the element's only child
#
# Returns $htmltree.  If the path does not lead to an element (an index is
# out of range, or the path runs through a text node) a warning is issued
# and the tree is returned unchanged.  Previously such a path was followed
# blindly, dereferencing strings or undef and writing to a throwaway hash.
sub set_node_content {
   my ($htmltree, $node, $content) = @_;

   my $spec = defined $node ? $node : '';
   (my $path = $spec) =~ s/^\d+\.//;

   my $target  = $htmltree->[0];
   my $on_path = ref($target) eq 'HASH';

   if ($on_path) {
      foreach my $index ($path =~ /(\d+)/g) {
         my $children = $target->{content};
         $target = ref($children) eq 'ARRAY' ? $children->[$index] : undef;
         if (ref($target) ne 'HASH') {
            $on_path = 0;
            last;
         }
      }
   }

   if (!$on_path) {
      warn "set_node_content: no element at node path '$spec'\n";
      return $htmltree;
   }

   $target->{content} = [$content];

   return $htmltree;
}

# get_node_content($htmltree, $node, $levels)
#
# Renders the children of one element back to HTML and returns the markup.
#
#   $htmltree - tree as returned by html_to_htmltree()
#   $node     - dotted node path such as "0.2.1"; the leading "N." names the
#               document root and is dropped, every following number is an
#               index into the 'content' list of the element reached so far
#   $levels   - accepted for compatibility; it has never been used
#
# The shared output buffer is cleared first.  Before, the returned string
# also contained whatever an earlier htmltree_to_html()/get_node_content()
# call had left in the buffer.  A path that does not lead to an element
# yields a warning and an empty string.
sub get_node_content {
   my ($htmltree, $node) = @_;

   $htmloutput = '';

   my $spec = defined $node ? $node : '';
   (my $path = $spec) =~ s/^\d+\.//;

   my $target  = $htmltree->[0];
   my $on_path = ref($target) eq 'HASH';

   if ($on_path) {
      foreach my $index ($path =~ /(\d+)/g) {
         my $children = $target->{content};
         $target = ref($children) eq 'ARRAY' ? $children->[$index] : undef;
         if (ref($target) ne 'HASH') {
            $on_path = 0;
            last;
         }
      }
   }

   if (!$on_path) {
      warn "get_node_content: no element at node path '$spec'\n";
      return $htmloutput;
   }

   # Plain rendering: no click-to-edit wrappers, so the node id prefix is
   # irrelevant and left empty.
   descend_htmltree($target->{content}, 0, "");

   return $htmloutput;
}

# descend_htmltree(\@nodes, $withclickiness, $node_id)
#
# Recursively serialises a list of tree nodes, appending the markup to the
# file-level $htmloutput buffer.  Nothing useful is returned; callers read
# the buffer afterwards.
#
#   \@nodes         - child list: hash refs are elements ('tag', 'content'
#                     and attribute keys), anything else is emitted verbatim
#   $withclickiness - when true, <table> elements are wrapped in a bordered
#                     <div>, and the contents of <p> and <td> elements are
#                     wrapped in a bordered <div> whose onDblClick handler
#                     links to the editor for that node
#   $node_id        - dotted path of the parent; each element's own id is
#                     "<parent>.<index in this list>"
#
# Attribute values and text are written as stored, without escaping.
sub descend_htmltree {
   my ($node, $withclickiness, $node_id) = @_;

   my %color_of = ( td => '#ff0000', p => '#aaaaaa', table => '#ff0000' );

   # Elements that have no end tag in HTML; writing "</img>" and the like
   # for them produces stray end tags in the output.
   my $void_element = qr/\A(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\z/;

   my $node_counter = 0;
   foreach my $tmpnode (@{$node}) {
      if (ref($tmpnode) eq 'HASH') {
         my $tag         = $tmpnode->{tag};
         my $nodeid      = "${node_id}.${node_counter}";
         my $frame_table = $withclickiness && $tag eq 'table';
         my $frame_edit  = $withclickiness && ($tag eq 'p' || $tag eq 'td');

         $htmloutput .= "<div style='border: thin solid " . $color_of{$tag}
            . "; margin: 1px 1px 1px 1px'>" if $frame_table;

         # Start tag.  Keys are sorted so the same tree always serialises
         # to the same string; plain hash order differs between runs.
         $htmloutput .= "<$tag";
         foreach my $attr_name (sort keys %{$tmpnode}) {
            next if $attr_name eq 'tag' || $attr_name eq 'content';
            $htmloutput .= " $attr_name=\"$tmpnode->{$attr_name}\"";
         }
         $htmloutput .= ">";

         # The line breaks inside this wrapper are part of the emitted
         # markup and are kept exactly as they were.
         if ($frame_edit) {
            $htmloutput .= "<div style='padding: 1px 1px 1px 1px; border: thin solid\n"
               . $color_of{$tag}
               . "; margin: 1px 1px 1px 1px'\n"
               . "onDblClick=\"parent.location =\n"
               . "'/editor/editfile.html?action=edittext&node=${nodeid}&tmpfile='+tmpfile+'&filename='+filename\">";
         }

         descend_htmltree($tmpnode->{content}, $withclickiness, $nodeid);

         $htmloutput .= "</div>" if $frame_edit;
         $htmloutput .= "</$tag>" if $tag !~ $void_element;
         $htmloutput .= "</div>" if $frame_table;
      } else {
         # Text node: emitted as-is.
         $htmloutput .= "$tmpnode";
      }
      $node_counter++;
   }
   return;
}

# htmltree_to_html($filename, $withclickiness, $htmltree)
#
# Serialises the whole document tree to HTML.
#
#   $filename       - if non-empty, the markup is also written to this file
#   $withclickiness - passed on to descend_htmltree(); true adds the
#                     click-to-edit wrappers
#   $htmltree       - tree as returned by html_to_htmltree()
#
# Returns the markup.  Dies if the output file cannot be opened or written.
sub htmltree_to_html {
   my $filename = shift || '';
   my $withclickiness = shift || 0;
   my $htmltree = shift;

   # Start from an empty buffer; otherwise a second call (or a call after
   # get_node_content) returns the earlier output followed by the new one.
   $htmloutput = '';
   descend_htmltree($htmltree->[0]->{content}, $withclickiness, "0");

   if ($filename ne '') {
      # Three-argument open with a lexical handle: the file name is never
      # interpreted as part of the open mode, and the handle cannot collide
      # with the package-wide HTML handle used elsewhere in this module.
      open my $out, '>', $filename
         or die "Can't open $filename for HTML output: $!";
      print {$out} $htmloutput;
      # Buffered write errors only surface when the handle is closed.
      close $out
         or die "Can't finish writing $filename: $!";
   }

   return $htmloutput;
}

# html_to_htmltree($filename, $html)
#
# Parses HTML into a tree and returns it.
#
#   $filename - file to read the HTML from; pass '' to parse $html instead
#   $html     - HTML source, used only when no file name is given
#
# Returns an array ref of the form [ { tag => 'document', content => [...] } ]
# where each child is either a text string or an element hash as built by
# start().  Returns undef, without parsing, when the source contains a tag
# of the form <%word>.  Dies if the input file cannot be opened.
#
# The tree is built in file-level variables that are reset here, so each
# call starts from a clean state; the shared output buffer is cleared too.
sub html_to_htmltree {
   my $filename = shift;
   my $html = shift || '';
# my $rightpane = shift || 0;
# my $htmltree;

   $htmltree = [ { tag => 'document', content => [] } ];
   $node = $htmltree->[0]->{content};
   # Seed entry: the stack holds one element while the parser is at
   # document level.
   @prevnodes = ($htmltree);
   $htmloutput = "";

   if (defined $filename && $filename ne '') {
      # Three-argument open with a lexical handle, so the file name cannot
      # be misread as an open mode and the handle is private to this call.
      open my $in, '<', $filename
         or die "Can't open input HTML file '$filename': $!";
      $html = do { local $/; <$in> };   # slurp the whole file
      $html = '' unless defined $html;
      close $in;
# $html =~ s|(</?)%(\w+?>)|${1}_${2}|sg;
   }

   # Explicit undef is kept (rather than a bare return) so existing callers
   # see exactly the same value as before in both scalar and list context.
   return undef if($html =~ /<\%\w+?>/s);

   # NOTE(review): "dtext" hands entity-decoded text to text(), while
   # descend_htmltree() writes text back without re-encoding, so "&lt;" in
   # the input comes out as a literal "<".  Consider the "text" argspec if
   # round-tripping matters - confirm against how the editor uses the text.
   my $p = HTML::Parser->new( api_version => 3,
                              start_h => [\&start, "tagname, attr"],
                              end_h => [\&end, "tagname"],
                              text_h => [\&text, "dtext"] );
   $p->parse($html);
   $p->eof;

   return $htmltree;
}

1;

What changes do I need to make so that I can do something like the following?
Thanks for any help.

use SkylineEdit;

my $htmltree = SkylineEdit->new;
$htmltree->html_to_htmltree($somefile);

-- 
Andrew Gaffney
Network Administrator
Skyline Aeronautics, LLC.
636-357-1548