My first silly-dummy-crappy Perl web-crawler program

自己写的第一个玩具Perl 爬虫程序,主要是体验Perl语言的方便(和强大?)

#!/usr/bin/perl

use warnings;
use strict;
use 5.010;
use LWP::UserAgent;
use HTML::LinkExtor;
use Thread;
use threads;
use threads::shared;

#my $feedLink = "http://www.google.com.hk/webhp?rls=ig";
# Seed URL the crawl starts from (an alternative Google seed is kept above).
my $feedLink = "http://www.163.com";

# Build the callback that HTML::LinkExtor invokes for every tag it parses.
# The callback records each new absolute http:// link found in an
# <a href="..."> tag and pushes it onto the shared work queue.
#
# $crawledURLs     - ref to a shared hash used as a "seen URLs" dedup set
# $urlsToBeCrawled - ref to a shared array used as the work queue
#
# Returns a coderef with the ($tag, %attrs) signature LinkExtor expects.
sub linkHandler
{
  my ($crawledURLs, $urlsToBeCrawled) = @_;
  # Return the closure explicitly; the original relied on the anonymous
  # sub being the last evaluated expression, which is easy to break.
  return sub
  {
    my ($tag, %values) = @_;
    return unless $tag eq 'a' && defined $values{href};
    my $link = $values{href};
    # Only absolute http:// URLs are crawlable by this toy crawler.
    return unless $link =~ m/^http:\/\//;
    lock($crawledURLs);
    return if exists $crawledURLs->{$link};
    $crawledURLs->{$link} = 1;
    lock($urlsToBeCrawled);
    push @{$urlsToBeCrawled}, $link;
    cond_signal($urlsToBeCrawled);   # wake one worker waiting for work
    return;
  };
}

# Worker loop: repeatedly pull a URL off the shared queue, fetch it, save
# the page as "<n>.html", and queue any new links discovered on it.
#
# $crawledURLs     - ref to shared hash of URLs already seen
# $urlsToBeCrawled - ref to shared array used as the work queue
# $pageCnt         - REF to the shared page counter. (The original passed
#                    the counter by value, so every thread got a private
#                    copy and the global page budget never took effect.)
# $maxPages        - optional crawl budget; defaults to 10000
sub doCrawlWebs
{
  my ($crawledURLs, $urlsToBeCrawled, $pageCnt, $maxPages) = @_;
  $maxPages //= 10000;

  my $crawler = LWP::UserAgent->new;
  $crawler->agent("Mazilla/5.0");

  # The callback closure is loop-invariant, so build one extractor per
  # thread instead of one per page (was rebuilt on every iteration).
  my $linkExtractor = HTML::LinkExtor->new(linkHandler($crawledURLs, $urlsToBeCrawled));

  CRAWL: while(1)
  {
    my ($url, $pageNo);
    {
      lock($urlsToBeCrawled);
      while(@{$urlsToBeCrawled} == 0)
      {
        # Once another worker has exhausted the budget no more signals
        # will arrive; leave instead of sleeping in cond_wait forever.
        last CRAWL if ${$pageCnt} >= $maxPages;
        cond_wait($urlsToBeCrawled);
      }
      $url = pop @{$urlsToBeCrawled};
      # Reserve a unique file number while the lock is still held so two
      # threads can never write to the same "<n>.html" (the original read
      # an unshared counter outside the lock).
      $pageNo = ++${$pageCnt};
      if($pageNo >= $maxPages)
      {
        # Budget exhausted: wake every sleeping worker so it can exit too.
        # (The original used 'continue;' here, which is a runtime error
        # outside a 'when' block under the 5.010 feature bundle.)
        cond_broadcast($urlsToBeCrawled);
        last CRAWL;
      }
    }

    print "Thr ", threads->tid(), " is crawling $url\n";
    my $response = $crawler->get($url);

    open my $fileHandle, '>', "$pageNo.html"
      or do { print "failed to open $pageNo.html: $!\n"; next CRAWL; };
    print {$fileHandle} $response->content();
    close($fileHandle)
      or print "failed to close $pageNo.html: $!\n";

    $linkExtractor->parse($response->content);
  }
  return;
}

# Seed the shared work queue with the given URLs and crawl them with a
# pool of 30 worker threads, blocking until every worker has exited.
sub crawlWebs
{
  my @urlsToBeCrawled :shared = (@_);
  my %crawledURLs :shared;
  my $pageCnt :shared = 0;

  # Mark the seeds as already seen so pages linking back to a seed do not
  # re-queue it (the original left seeds out of the dedup set).
  $crawledURLs{$_} = 1 for @urlsToBeCrawled;

  # Pass the counter BY REFERENCE so all threads share one page budget.
  my @args = (\%crawledURLs, \@urlsToBeCrawled, \$pageCnt);

  # Thread->new is the obsolete 5.005 wrapper; use the ithreads API.
  my @crawlerThrs;
  push @crawlerThrs, threads->create(\&doCrawlWebs, @args) for 1 .. 30;

  $_->join for @crawlerThrs;
  return;
}

# Entry point: start the crawl from the single seed URL.
crawlWebs($feedLink);


你可能感兴趣的:(html,perl,url,语言,Signal)