2014-05-02

#!/usr/bin/perl

use utf8;
use Data::Dumper qw(Dumper);
use HTML::Element;
use HTML::TreeBuilder;

# Parse a saved HTML page with HTML::TreeBuilder and print a few facts about
# its <head> element.  The file to read may be given as the first
# command-line argument; without one it defaults to "huxiu-webDetail".

# Emit decoded text as UTF-8 on STDOUT.
binmode STDOUT, ':encoding(UTF-8)';
$Data::Dumper::Indent = 1;

my $file_name = defined $ARGV[0] ? $ARGV[0] : "huxiu-webDetail";

# Stop right away when the input cannot be read instead of parsing an
# unopened handle.  The bareword handle name DATA is kept because the handle
# is closed under that name at the end of the script.
open(DATA, '<:encoding(UTF-8)', $file_name)
    or die "Cannot open $file_name: $!\n";

my $tree = HTML::TreeBuilder->new;
$tree->parse_file(\*DATA);

# In scalar context find_by_tag_name yields the first matching element.
my $head = $tree->find_by_tag_name("head");
my $body = $tree->find_by_tag_name("body");    # only used by debugging code
defined $head
    or die "$file_name: no <head> element found\n";

# Use the public HTML::Element accessors rather than the object's internal
# hash keys:
#   parent  - the element that contains <head>
#   content - array reference of <head>'s children (undef when it has none)
#   tag     - the tag name
my $var_par = $head->parent;
my $var_con = $head->content;
my $var_tag = $head->tag;

# The per-key dump of <head>'s children that used to sit between these two
# separator lines is disabled, so only the separators are printed.
print "=========================================\n";
print "========================================\n";

# Show the raw references / values gathered above.
print $var_par->content, "\n";
print $var_con, "\n";
print $var_tag, "\n";
    sub printcontent{
        my $vax = @_->[0];
        my $tag = @_->[1];
        my $icount = @$vax-1;
#        print $icount+1,"\n";
#        print  $vax->[0],"\n";
        for my $i(0 .. $icount){
#            print  $i,$vax->[$i],"\n";
#            print $i,$vax->[$i]{'_tag'},"\n";
#            if( @$vax->[$i]{'_content'}!=())
#            {
#                print $i,":";
#                printcontent ($vax->[$i]{'_content'});
#            }
#            elsif($vax->[$i]{'content'}!=undef)
#            {
#                print $i,":";
#                printcontent ($vax[$i]{'content'});
#            }
#            else
#            {
                my $hash = $vax->[$i];
                foreach my $key(keys %$hash)
                {
                    if($key ne "_parent"){
                        print $i,":",$key,"=";
                        print $vax->[$i]{$key},"\n";
                    }
                    elsif($key ==  '_content')
                    {
#                        Dumper $key,"\n";
                        if(@$vax->[$i]{'_content'}[0]{'_content'}!=()){
#                            print $i,":_content=",$vax->[$i]{'_content'}[0],"\n";
                            printcontent($vax->[$i]{'_content'});
                        }
                        else{
                            print $i,":_content============",$vax->[$i]{'_content'}[0],"\n";
                        }
                    }
                }
#            }
        }
    }
# Dump the document starting from the children of <head>'s parent element,
# then finish the output with a blank line.
    my $top_level_nodes = $var_par->{'_content'};
    printcontent($top_level_nodes);
    print "\n";

# Debugging aids kept for reference:
#    print Dumper($head);
#    print Dumper($tree), "\n";
#    print $body->as_text(), "\n";
#    print $_->as_text(), "\n" foreach $head->content_list;

# Tear down the parse tree and close the input handle.
    $tree = $tree->delete;
    close(DATA);

 

功能

把 HTML 标签转化为 Perl 的数据结构。

找出tag和对应的值。

能够抓取网页的内容与格式。

 

不足:

_content 会被多打印一次,需要在第一个 if 语句中把它过滤掉。这是个小问题,不过类似的问题还比较多。

无法找到内容所对应的原始格式,即没有把内容与它原来的格式关联起来。这是个大问题,说明功能还不完善,也是下一步的工作重点。

你可能感兴趣的:(2014-05-02)