#!/usr/bin/perl
use utf8;
use Data::Dumper qw(Dumper);
use HTML::Element;
use HTML::TreeBuilder;
# Encode everything printed to STDOUT as UTF-8, so non-ASCII text taken
# from the parsed page is written out as proper UTF-8 bytes.
binmode STDOUT, ':encoding(UTF-8)';

# One level of indentation keeps any Data::Dumper debugging output compact.
$Data::Dumper::Indent = 1;

# Input file: the first command-line argument when one is given, otherwise
# the sample page this script was originally written against.
my $file_name = @ARGV ? $ARGV[0] : "huxiu-webDetail";

# Open the page as UTF-8 text and stop immediately when that fails; going
# on with an unopened handle would only produce an empty tree.  $! carries
# the real reason (missing file, permissions, ...).
# The bareword handle name DATA is kept on purpose: the cleanup code at the
# end of the script closes this handle by that name.
open(DATA, '<:encoding(UTF-8)', $file_name)
    or die "Cannot open '$file_name': $!\n";

# Build the element tree from the already-decoded handle.
my $tree = HTML::TreeBuilder->new;
$tree->parse_file(\*DATA);
# Set to a true value to list the hash keys of <head>, of its parent and of
# every child of <head> while exploring the tree structure.
my $debug = 0;

# In scalar context find_by_tag_name yields the first matching element.
my $head = $tree->find_by_tag_name("head");
my $body = $tree->find_by_tag_name("body");    # looked up but not read below

defined $head
    or die "No <head> element found in $file_name\n";

# HTML::Element objects are hashes underneath.  The dump below works on
# their internal fields directly:
#   _parent  - hash key whose value is the enclosing element (a hash ref)
#   _content - hash key whose value is an array ref holding the children
#   _tag     - hash key whose value is the tag name
my $var_par = $head->{'_parent'};
my $var_con = $head->{'_content'};
my $var_tag = $head->{'_tag'};

if ($debug) {
    # Keys of the <head> element itself, then keys of its parent.
    print "$_\n" for sort keys %{$head};
    print "$_\n" for sort keys %{ $var_par || {} };
}

#########################################################
#                                                       #
#  Hash keys of every element in <head>'s _content      #
#                                                       #
#########################################################
print "=========================================\n";
if ($debug) {
    for my $child (@{ $var_con || [] }) {
        next unless ref $child;    # plain strings have no keys to list
        print "$_\n" for sort keys %{$child};
    }
}
print "========================================\n";

# Show the raw values: the parent's child list (an array ref), <head>'s own
# child list (an array ref) and <head>'s tag name.
print $var_par->{'_content'},"\n";
print $var_con,"\n";
print $var_tag,"\n";
use Scalar::Util ();    # core module; reftype() also works on blessed nodes

# _reftype_is($value, $type)
#   Returns 1 when $value is a reference whose underlying data type is
#   $type ('HASH', 'ARRAY', ...), whether or not it is blessed; 0 otherwise
#   (including for undef and plain strings).
sub _reftype_is {
    my ($value, $type) = @_;
    my $actual = Scalar::Util::reftype($value);
    return (defined $actual && $actual eq $type) ? 1 : 0;
}

# printcontent($nodes, $tag)
#   Recursively prints the fields of every element in a content list.
#
#   $nodes - array ref of nodes, e.g. the '_content' value of an element.
#            Entries that are not hash-based references (plain text
#            strings) are skipped.  Anything that is not an array ref is
#            ignored and nothing is printed.
#   $tag   - optional; accepted for compatibility with existing callers
#            and currently unused.
#
#   For the element at index i, in sorted key order:
#     * '_parent' is never printed (it points back up the tree);
#     * '_content' is handled structurally: when the first child is itself
#       an element that has a '_content' array, the whole child list is
#       walked recursively; otherwise one line
#       "i:_content============<first child>" is printed;
#     * every other key is printed as "i:key=value".
#   Only the first child decides between recursing and printing, so later
#   siblings of a text-first child list are not shown.
#
#   Returns nothing useful; all output goes to the currently selected
#   output handle.
sub printcontent {
    my ($vax, $tag) = @_;

    return unless _reftype_is($vax, 'ARRAY');

    for my $i (0 .. $#{$vax}) {
        my $node = $vax->[$i];

        # Only hash-based nodes have keys to walk.
        next unless _reftype_is($node, 'HASH');

        # Sorted so that the output is the same on every run.
        for my $key (sort keys %{$node}) {
            next if $key eq '_parent';

            if ($key ne '_content') {
                my $value = defined $node->{$key} ? $node->{$key} : '';
                print $i, ":", $key, "=";
                print $value, "\n";
                next;
            }

            # $key is '_content' here.  Look at the first child without
            # creating any missing structure in the tree.
            my $children = $node->{'_content'};
            my $first = _reftype_is($children, 'ARRAY') ? $children->[0] : undef;

            if (_reftype_is($first, 'HASH')
                && _reftype_is($first->{'_content'}, 'ARRAY'))
            {
                printcontent($children);
            }
            else {
                my $shown = defined $first ? $first : '';
                print $i, ":_content============", $shown, "\n";
            }
        }
    }

    return;
}
# Dump the tree, starting from the child list of <head>'s parent element.
my $top_level_nodes = $var_par->{'_content'};
printcontent($top_level_nodes);
print "\n";

# Other ways of inspecting the tree that were tried during development:
# Dumper($head) / Dumper($tree), as_text() on single elements, and
# content_list on <head> and <body>.

# Tear the tree down explicitly, drop our reference to it, then close the
# input handle.
$tree->delete;
$tree = undef;
close DATA;
#}
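# Features:
#   - Converts HTML tags into a Perl data structure.
#   - Finds each tag and its corresponding value.
#   - Can extract both the content and the formatting of a web page.
#
# Known limitations:
#   - _content is printed one extra time; it should be filtered out in the
#     first `if` statement.  Minor issue, but there are several more of
#     this kind.
#   - The original formatting that a piece of content belongs to cannot be
#     recovered, i.e. content is not linked back to its original markup.
#     Major issue: the feature is incomplete.  This is the focus of the
#     next step.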