#!/usr/local/bin/perl # # Perl5 + p5-HTML-Parser 専用 # ul li /li /ul を処理して、閉じタグの省略をおぎなう。 # ol li /li /ol を処理して、閉じタグの省略をおぎなう。 # dl dt /dt dd /dd /dlを処理して、閉じタグの省略をおぎなう。 # # 2001.11.02 first version # 2001.11.09 ul / ol / dl のハードコーディングから、リスト形式のデータに変更。 # # $Id: html-tag-ul3,v 1.2 2001/11/09 08:07:01 george Exp $ # use HTML::Parser(); # use declare # # subclassing and method callbacks for HTML::Parser # { package MyParser; use base 'HTML::Parser'; my $dbg = 0; # デバッグ用フラグ # 処理対象タグリスト指定 # 各リストの先頭1要素はブロック開始/終了タグ。 # 各リストの先頭2要素以降は内部用タグ。 @proc_tag_list = ( [ "ul" , "li" ] , [ "ol" , "li" ] , [ "dl" , "dt" , "dd"] ); my $outbuf; my @tag_stack; sub bufprint { my $s; print "bufprint::start\n" if ($dbg > 5); foreach $s (@_) { $outbuf .= $s; print "bufprint::" . $s . "\n" if ($dbg > 5); } print "bufprint:: outbuf string is " . $outbuf . "\n" if ($dbg > 5); } sub bufflush { print "bufflush::start\n" if ($dbg); print $outbuf; $outbuf = ""; } sub tag_stack_to_string { my $aryref; print "#tag_stack dump start.\n"; foreach $aryref (@tag_stack) { foreach $s (@$aryref) { print $s . " "; } print "\n"; } print "#tag_stack dump end.\n"; } sub bufappend { my $s; my $right_spc; print "bufappend::start\n" if ($dbg); # 右側の改行やスペースなどを分離する。 $right_spc = ""; if ( $outbuf =~ /(.*?)([\f\t\s\r\n]+)$/s ) { $outbuf = $1; $right_spc = $2; print "bufappend:: left = :" . $outbuf . ": right = :" . $right_spc . ":\n" if ($dbg); } foreach $s (@_) { $outbuf .= $s; } $outbuf .= $right_spc; print "bufappend:: final outbuf = :" . $outbuf . ":\n" if ($dbg); } sub match_block_tag { my ($s) = @_; my $iref; foreach $iref (@proc_tag_list) { if ( $s eq $$iref[0] ) { return 1; } } return 0; } sub match_inner_tag { my ($s) = @_; my $iref; my $irefin; foreach $iref (@proc_tag_list) { for ( $i = 1 ; $i <= $#$iref ; $i++ ) { if ( $s eq $$iref[$i] ) { return 1; } } } return 0; } sub check_inner_tag { my ($intag, $blocktag) = @_; my $iref; my $irefin; foreach $iref (@proc_tag_list) { for ( $i = 1 ; $i <= $#$iref ; $i++ ) { if ( $intag eq $$iref[$i] ) { if ( $blocktag eq $$iref[0] ) { return 1; } else { return 0; } } } } return 0; } sub start { my ($self, $tagname, $attr, $attrseq, $origtext, $line) = @_; print "start:: current tag is " . $tagname . "\n" if ($dbg); if ( match_block_tag($tagname) ) { push @tag_stack, [ $tagname ]; } elsif ( match_inner_tag($tagname) ) { $aryref = pop @tag_stack; check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort."; if ($$aryref[$#$aryref] eq $tagname) { bufappend ""; pop @$aryref; } push @$aryref, $tagname; push @tag_stack, $aryref; } if ( $dbg ) { print "===== start:: current tag stack is =====\n"; tag_stack_to_string; } # print out bufprint '<' . $tagname; foreach $key (@$attrseq) { bufprint ' ' . $key . '=' . '"' . $$attr{$key} . '"'; } bufprint '>'; } sub end { my($self, $tagname, $origtext, $line) = @_; my $aryref; my $s; print "end:: current tag is " . $tagname . "\n" if ($dbg); if ( match_block_tag ($tagname) ) { $aryref = pop @tag_stack; if ( $$aryref[0] ne $tagname ) { die "nest error $tagname in line $line. abort."; } while ( $#$aryref > 0 ) { $s = pop @$aryref; bufappend ''; } } elsif ( match_inner_tag($tagname) ) { $aryref = pop @tag_stack; check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort."; if ($$aryref[$#$aryref] eq $tagname) { pop @$aryref; } push @tag_stack, $aryref; } if ( $dbg ) { print "===== end:: current tag stack is =====\n"; tag_stack_to_string; } # print out bufprint ''; # ネスト関係が解消されていたら、早めにバッファをフラッシュ if ( $#tag_stack == 0 ) { bufflush; } } sub text { my($self, $origtext, $is_cdata) = @_; bufprint $origtext; } sub declaration { my($self, $origtext) = @_; bufprint $origtext; } sub comment { my($self, $origtext) = @_; bufprint $origtext; } sub process { my($self, $origtext) = @_; bufprint $origtext; } sub start_document { } sub end_document { bufflush; } sub default { my($self, $origtext) = @_; bufprint $origtext; } } $p = MyParser->new; $p->handler(start => "start", "self,tagname,attr,attrseq,text,line"); $p->handler(end => "end", "self,tagname,text,line"); $p->handler(text => "text", "self,text,is_cdata"); $p->handler(process => "process", "self,token0,text"); $p->handler(comment => "comment", 'self,text'); $p->handler(declaration => "declaration", 'self,text'); $p->handler(process => "process", 'self,text'); $p->handler(default => "default", 'self,text'); $p->handler(start_document => "start_document", 'self'); $p->handler(end_document => "end_document", 'self'); # # here we go! # $p->parse_file("foo.html"); # # end of file #