百度收录信息抓取

#!/usr/bin/perl
=pod

前段时间朋友让我帮他写个小工具,用来从 http://www.baidu.com/s?wd=site%3A 这个网站上抓取一些信息,比如:域名,www.example.com,百度权重,0,站内链接,域名IP,x.x.x.x,同IP网站,0,域名年龄,2月19天,索引量-百度收录,7 等。抽时间就写了一下,我主要是通过 LWP::Simple 和正则表达式来捕捉关键字。

=cut
#!/usr/bin/perl
use strict;

use utf8;
use Encode;
use Encode::CN;
use File::Find;
use LWP::Simple;
use POSIX qw(strftime);
use warnings;

# Batch SEO scraper.
#
# Reads one host name per line from D:\seo_web\url.txt, fetches the
# seo.chinaz.com report page and the Baidu "site:" result page for each
# host, extracts a handful of fields with regular expressions, and writes
# them as one comma-separated line per host to D:\seo_web\<date>.csv.txt.
# A shortened version of every line (values only, no labels) is echoed to
# the console.

binmode(STDOUT, ":utf8");

# Working directory and the files this tool reads and writes.
my $work_dir = 'D:\\seo_web';
my $url_list = "$work_dir\\url.txt";

# Date stamp used in the report name; month and day are not zero-padded.
my ($day, $mon, $year) = (localtime(time()))[3, 4, 5];
my $date   = join '-', $year + 1900, $mon + 1, $day;
my $report = "$work_dir\\$date.csv.txt";

if (!-e $work_dir) {
    mkdir($work_dir, 0775) or die "Can't create '$work_dir':$!\n";
}

# Show the usage hint and give the user time to read it; the open() below
# then fails with the actual system error.
if (!-e $url_list) {
    print "Please create '$url_list' file\n and write info to '$url_list'\n example:www.example.com in url.txt\n";
    sleep(15);
    print "\n";
}

# Open the input first so that a missing url.txt does not leave an empty
# report file behind.
open(my $urls_fh, '<', $url_list)
    or die "Can't open '$url_list':$!\n";

# The encoding layer turns the character strings printed below into UTF-8,
# so no per-field encode() call is needed.
open(my $report_fh, '>:encoding(UTF-8)', $report)
    or die "Cant write '$report':$!\n";

my $count = 0;

HOST:
while (my $line = <$urls_fh>) {
    # Strip the line ending and surrounding blanks; otherwise the newline
    # ends up inside both request URLs.
    (my $host = $line) =~ s/\A\s+|\s+\z//g;
    next HOST if $host eq '';

    my $seo_url = "http://seo.chinaz.com/?host=$host/";
    my $page    = get(encode("utf-8", $seo_url));
    die "Couldn't get $seo_url" unless defined $page;

    # Running row number.
    $count++;
    print {$report_fh} "$count,";
    print "$count,";

    # Domain name, taken from the search box markup.
    if ($page =~ /(请输入网站地址.*\/")/) {
        my $segment = $1;
        if ($segment =~ /(\w+\.\w+\.\w+)/) {
            print {$report_fh} "域名,$1,";
            print "$1,";
        }
    }

    # Baidu weight: the label and the value come from two separate places
    # (the value is encoded in the name of the rank image).
    if ($page =~ /(?:<span\s+style=\"\s+margin-left\:\d+px\;\">)(百度权重)/) {
        print {$report_fh} "$1,";
    }
    if ($page =~ /(?:images\/baiduapp\/)(\d+)(?:\.gif)/) {
        print {$report_fh} "$1,";
        print "$1,";
    }

    # Outbound links; "none" when the segment carries no number.
    if ($page =~ /(出站链接.*<\/a>)/) {
        my $segment   = $1;
        my $out_links = $segment =~ /(\d+)/ ? $1 : 'none';
        print {$report_fh} "出站链接,$out_links,";
        print "$out_links,";
    }

    # Internal links. The label is written whenever the segment is found;
    # the value only when a number is present.
    if ($page =~ /(站内链接.*<\/span>)/) {
        my $segment = $1;
        print {$report_fh} "站内链接,";
        if ($segment =~ /(\d+)/) {
            # $1 is read-only, so the fallback is applied to a copy.
            my $in_links = $1 || 'none';
            print {$report_fh} "$in_links,";
            print "$in_links,";
        }
    }

    # IP address of the domain.
    if ($page =~ /(域名IP.*&nbsp;&nbsp;)/) {
        my $segment = $1;
        if ($segment =~ /(\d+\.\d+\.\d+\.\d+)/) {
            print {$report_fh} "域名IP,$1,";
            print "$1,";
        }
    }

    # Number of sites sharing the same IP.
    if ($page =~ /(同IP网站.*个)/) {
        my $segment = $1;
        if ($segment =~ /(\d+)/) {
            print {$report_fh} "同IP网站,$1,";
            print "$1,";
        }
    }

    # Domain age. The report keeps the original text; the console gets an
    # ASCII rendering ("2 Months 19 Days").
    if ($page =~ /(域名年龄.*<\/font>)/) {
        my $segment = $1;
        if ($segment =~ /(\d+月\d+天)/) {
            my $age = $1;
            print {$report_fh} "域名年龄,$age,";
            (my $age_ascii = $age) =~ s/月/ Months /;
            $age_ascii =~ s/天/ Days/;
            print "$age_ascii,";
        }
    }

    # Number of pages Baidu has indexed for the host. A failed request just
    # leaves the field out instead of matching against undef.
    my $baidu_page = get("http://www.baidu.com/s?wd=site%3A$host");
    if (defined $baidu_page && $baidu_page =~ /(该网站共有.*<\/b>)/) {
        my $segment = $1;
        if ($segment =~ /(\d+)(?:<\/b>)/) {
            my $indexed = $1 || 'none';
            print {$report_fh} "索引量-百度收录,$indexed";
            print "$indexed";
        }
    }

    print {$report_fh} "\n";
    print "\n";
}

close($urls_fh);
# Buffered write errors only surface at close time.
close($report_fh) or die "Can't close '$report':$!\n";

print "\n";
print " File path is:'$report'\n";
print " Author\@Laomeng\nEmail:18682093512\@163.com\n";
# Keep the console window open long enough to read the summary.
sleep(60);

你可能感兴趣的:(百度,LWP-Simple)