#!/usr/local/bin/perl
#
# Perl5 + p5-HTML-Parser 専用
# ul li /li /ul を処理して、閉じタグの省略をおぎなう。
# ol li /li /ol を処理して、閉じタグの省略をおぎなう。
# dl dt /dt dd /dd /dlを処理して、閉じタグの省略をおぎなう。
#
# 2001.11.02 first version
# 2001.11.09 ul / ol / dl のハードコーディングから、リスト形式のデータに変更。
#
# $Id: html-tag-ul3,v 1.2 2001/11/09 08:07:01 george Exp $
#
use HTML::Parser(); # use declare
#
# subclassing and method callbacks for HTML::Parser
#
{
package MyParser;
use base 'HTML::Parser';
my $dbg = 0; # デバッグ用フラグ
# 処理対象タグリスト指定
# 各リストの先頭1要素はブロック開始/終了タグ。
# 各リストの先頭2要素以降は内部用タグ。
@proc_tag_list = (
[ "ul" , "li" ] ,
[ "ol" , "li" ] ,
[ "dl" , "dt" , "dd"]
);
my $outbuf;
my @tag_stack;
sub bufprint {
my $s;
print "bufprint::start\n" if ($dbg > 5);
foreach $s (@_) {
$outbuf .= $s;
print "bufprint::" . $s . "\n" if ($dbg > 5);
}
print "bufprint:: outbuf string is " . $outbuf . "\n" if ($dbg > 5);
}
sub bufflush {
print "bufflush::start\n" if ($dbg);
print $outbuf;
$outbuf = "";
}
sub tag_stack_to_string {
my $aryref;
print "#tag_stack dump start.\n";
foreach $aryref (@tag_stack) {
foreach $s (@$aryref) {
print $s . " ";
}
print "\n";
}
print "#tag_stack dump end.\n";
}
sub bufappend {
my $s;
my $right_spc;
print "bufappend::start\n" if ($dbg);
# 右側の改行やスペースなどを分離する。
$right_spc = "";
if ( $outbuf =~ /(.*?)([\f\t\s\r\n]+)$/s ) {
$outbuf = $1;
$right_spc = $2;
print "bufappend:: left = :" . $outbuf . ": right = :" . $right_spc . ":\n" if ($dbg);
}
foreach $s (@_) { $outbuf .= $s; }
$outbuf .= $right_spc;
print "bufappend:: final outbuf = :" . $outbuf . ":\n" if ($dbg);
}
sub match_block_tag {
my ($s) = @_;
my $iref;
foreach $iref (@proc_tag_list) {
if ( $s eq $$iref[0] ) {
return 1;
}
}
return 0;
}
sub match_inner_tag {
my ($s) = @_;
my $iref;
my $irefin;
foreach $iref (@proc_tag_list) {
for ( $i = 1 ; $i <= $#$iref ; $i++ ) {
if ( $s eq $$iref[$i] ) {
return 1;
}
}
}
return 0;
}
sub check_inner_tag {
my ($intag, $blocktag) = @_;
my $iref;
my $irefin;
foreach $iref (@proc_tag_list) {
for ( $i = 1 ; $i <= $#$iref ; $i++ ) {
if ( $intag eq $$iref[$i] ) {
if ( $blocktag eq $$iref[0] ) {
return 1;
} else {
return 0;
}
}
}
}
return 0;
}
sub start {
my ($self, $tagname, $attr, $attrseq, $origtext, $line) = @_;
print "start:: current tag is " . $tagname . "\n" if ($dbg);
if ( match_block_tag($tagname) ) {
push @tag_stack, [ $tagname ];
} elsif ( match_inner_tag($tagname) ) {
$aryref = pop @tag_stack;
check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort.";
if ($$aryref[$#$aryref] eq $tagname) {
bufappend "" . $tagname . ">";
pop @$aryref;
}
push @$aryref, $tagname;
push @tag_stack, $aryref;
}
if ( $dbg ) {
print "===== start:: current tag stack is =====\n";
tag_stack_to_string;
}
# print out
bufprint '<' . $tagname;
foreach $key (@$attrseq) {
bufprint ' ' . $key . '=' . '"' . $$attr{$key} . '"';
}
bufprint '>';
}
sub end {
my($self, $tagname, $origtext, $line) = @_;
my $aryref;
my $s;
print "end:: current tag is " . $tagname . "\n" if ($dbg);
if ( match_block_tag ($tagname) ) {
$aryref = pop @tag_stack;
if ( $$aryref[0] ne $tagname ) {
die "nest error $tagname in line $line. abort.";
}
while ( $#$aryref > 0 ) {
$s = pop @$aryref;
bufappend '' . $s . '>';
}
} elsif ( match_inner_tag($tagname) ) {
$aryref = pop @tag_stack;
check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort.";
if ($$aryref[$#$aryref] eq $tagname) {
pop @$aryref;
}
push @tag_stack, $aryref;
}
if ( $dbg ) {
print "===== end:: current tag stack is =====\n";
tag_stack_to_string;
}
# print out
bufprint '' . $tagname . '>';
# ネスト関係が解消されていたら、早めにバッファをフラッシュ
if ( $#tag_stack == 0 ) {
bufflush;
}
}
sub text {
my($self, $origtext, $is_cdata) = @_;
bufprint $origtext;
}
sub declaration {
my($self, $origtext) = @_;
bufprint $origtext;
}
sub comment {
my($self, $origtext) = @_;
bufprint $origtext;
}
sub process {
my($self, $origtext) = @_;
bufprint $origtext;
}
sub start_document {
}
sub end_document {
bufflush;
}
sub default {
my($self, $origtext) = @_;
bufprint $origtext;
}
}
$p = MyParser->new;
$p->handler(start => "start", "self,tagname,attr,attrseq,text,line");
$p->handler(end => "end", "self,tagname,text,line");
$p->handler(text => "text", "self,text,is_cdata");
$p->handler(process => "process", "self,token0,text");
$p->handler(comment => "comment", 'self,text');
$p->handler(declaration => "declaration", 'self,text');
$p->handler(process => "process", 'self,text');
$p->handler(default => "default", 'self,text');
$p->handler(start_document => "start_document", 'self');
$p->handler(end_document => "end_document", 'self');
#
# here we go!
#
$p->parse_file("foo.html");
#
# end of file
#