[经验总结]Perl提取网页信息

  
  
  
  
#!/usr/bin/perl
# Gist: https://gist.github.com/2928006
#
# Usage: <this script> <page-url> <output.html>
#
# Fetches <page-url>, extracts the <div id="enText"> block (the listening
# transcript), substitutes it for the ==TEXT_CONTENT== placeholder in
# template.html (looked up in the current directory) and writes the result
# to <output.html>.  It then follows the page's "download listening" link
# and reports whether the resource page mentions "downloadurl1".
use strict;
use warnings;
use utf8;    # the patterns and messages below contain UTF-8 literals

use LWP::Simple;

# NOTE(review): this assumes LWP::Simple::get hands back decoded character
# data (so the UTF-8 literals above can match) and that the terminal,
# template.html and the output file are all UTF-8 -- confirm for your setup.
binmode STDOUT, ':encoding(UTF-8)';

die "Usage: $0 <url> <output-file>\n" unless @ARGV == 2;
my ($url, $filename) = @ARGV;

my $content = get($url)
    or die "Couldn't get $url";

# Earlier attempt, kept for reference:
#$content =~ s#^.*?(<div.*?</div>).*$##m;

if ($content =~ m{(<div id="enText" style="display:block">.*?</div>)}s) {
    my $text = $1;

    # Open and read the template.  Reading stops right after "</html>",
    # so anything following the closing tag is ignored.
    open my $template_fh, '<:encoding(UTF-8)', 'template.html'
        or die "Couldn't open template.html for reading: $!";
    my $page = do {
        local $/ = "</html>";    # localized so later reads are unaffected
        <$template_fh>;
    };
    close $template_fh;
    die "template.html is empty" unless defined $page;

    # Insert the transcript into the template.
    $page =~ s/==TEXT_CONTENT==/$text/gi;
    #print $page;

    # Write the resulting HTML file.
    open my $out_fh, '>:encoding(UTF-8)', $filename
        or die "Couldn't open $filename for writing: $!";
    print {$out_fh} $page;
    close $out_fh
        or die "Couldn't close $filename: $!";

    # Locate the audio download page.  The base URL is the page URL with
    # its trailing "/<name>.html" component removed.
    my $base_url = $url;
    $base_url =~ s{(.*)/.*\.html}{$1};

    # Match instead of substitute: if the anchor is missing we must fail
    # loudly rather than treat the whole page body as a link.
    my ($res_path) =
        $content =~ m{<a href="([^"]*)" title="进入下载资料页面">下载听力</a>}
        or die "Couldn't find the download-page link in $url";
    # NOTE(review): assumes the href is relative to $base_url -- an absolute
    # href would produce a broken URL here.
    my $res_link = $base_url . "/" . $res_path;

    print "\nreslink:", $res_link, "\n";
    my $res_page = get($res_link)
        or die "Couldn't get $res_link";

    # Debug dump of the download page.
    print $res_page;

    # Earlier attempts at extracting the mp3 URL, kept for reference:
    #$res_page =~ s#.*<a href="(.*?)" target="_blank"><img src="/images/downloadurl1\.jpg"></a>.*#$1#sg;
    #if ($res_page =~ m/.*<a href="(.*?)" target="_blank"><img src="\/images\/downloadurl1\.jpg"><\/a>.*/sg) {

    # Open question from the author: without being logged in the download
    # link is not present in the page -- what to do about it?
    if ($res_page =~ /downloadurl1/) {
        print "匹配\n";
    }
    else {
        print "不匹配\n";
    }
    # Markup being looked for:
    #<a href="(.*?)" target="_blank"><img src="/images/downloadurl1.jpg"></a>

    print "\ndownload:" . $res_link . "\n";
}
else {
    print "不匹配\n";
}

你可能感兴趣的:(html,perl,url,div)