【Python爬虫】添加高级功能

 解析robots.txt文件

解析robots.txt文件,以避免下载禁止爬取的URL。使用Python自带的robotparser模块,在crawl循环中添加该检查:
添加模块:

import robotparser

def link_crawler4(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    crawl_queue = [seed_url]
    # 跟踪哪个url之前见过 用set避免重复
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download3(url)
        for link in get_links(html):
            print '网址', link, '表达式', link_regex, '是否通过', re.search(link_regex, link)
            # re.match只匹配字符串的开始,如果字符串开始不符合正则表达式,则匹配失败,函数返回None;
            # 而re.search匹配整个字符串,直到找到一个匹配。
            if re.search(link_regex, link):
                # 地址连接 获得正确的地址
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    # 把已爬取的地址添加进set集合
                    seen.add(link)
                    crawl_queue.append(link)

添加 代理支持

有时我们需要使用代理访问某个网站。比如,Netflix屏蔽了美国以外的大多数国家。

# 下载网页 version:0.3
# num_retries:重试次数
# user_agent:用户代理
# proxy
def download4(url, user_agent='daimx', proxy=None, num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)

    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # 4xx错误发生在请求存在问题时
            # 5xx错误发生在服务端存在问题时
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # 递归 重试
                html = download4(url, user_agent, proxy, num_retries - 1)
                return html
    return html

 下载限速

如果我们爬取网站的速度过快,就会面临被封禁或造成服务器过载的风险。为了降低风险,我们可以在两次下载之间添加延时,从而对爬虫限速:

    #!/usr/bin/env python
# -*- coding:utf-8 -*-
import urlparse
import time

import datetime


class Throttle:
    """Add a delay between consecutive downloads to the same domain.

    delay   -- minimum number of seconds between two requests to one domain
    domains -- maps a domain (the URL's netloc) to the datetime of its
               most recent access
    """
    def __init__(self, delay):
        # Minimum gap, in seconds, between downloads from the same domain.
        self.delay = delay
        # Timestamp of the last access, keyed by domain.
        self.domains = {}

    def wait(self, url):
        """Sleep when the domain of ``url`` was accessed too recently.

        If the domain was last accessed less than ``delay`` seconds ago,
        sleep for the remaining time; then record the current time as the
        domain's last access. Returns None.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            # Convert the timedelta to seconds first: subtracting a
            # timedelta from a number raises TypeError.
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # The domain was visited recently, so pause.
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()

避免爬虫陷阱

目前我们的爬虫会跟踪所有之前没有访问过的链接。但一些网站会动态生成页面内容,这样就会出现无限多的网页,页面会无止境地链接下去。这种情况被称为爬虫陷阱。
想要避免这种情况,一个简单的方法是记录到达当前网页经过了多少个链接,也就是深度。要实现这一功能,我们需要修改seen变量。该变量原先只记录访问过的网页链接,现在修改为一个字典,增加页面深度的记录:

def link_crawler5(seed_url, link_regex, max_depth):
    rp = robotparser.RobotFileParser()
    crawl_queue = [seed_url]
    # 记录爬取过的网址及其深度
    seen = {}
    while crawl_queue:
        url = crawl_queue.pop()
        html = download3(url)
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                print '网址', link, '表达式', link_regex, '是否通过', re.search(link_regex, link)
                # re.match只匹配字符串的开始,如果字符串开始不符合正则表达式,则匹配失败,函数返回None;
                # 而re.search匹配整个字符串,直到找到一个匹配。
                if re.search(link_regex, link):
                    # 地址连接 获得正确的地址
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth+1
                        crawl_queue.append(link)

终极版本

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2

import re

import itertools
import urlparse

import time

import datetime

"""
"""


# 首先定义download函数
def download(url, num_retries=2, user_agent='daimx', proxy=None):
    print 'Downloading:', url
    # 设置代理
    headers = {'User-agent':user_agent}
    # 设置请求头
    request = urllib2.Request(url, headers=headers)

    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme:proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download Error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 < e.code < 600:
                # 遇到5XX错误,重试下载
                return download(url,num_retries-1,proxy=proxy)
    return html


# 地图爬虫--解析网站地图
def crawl_sitemap(url):
    """Download the sitemap at ``url`` and then every page it lists.

    Page addresses are taken from the sitemap's ``<loc>`` elements.
    Returns None; nothing is downloaded when the sitemap cannot be fetched.
    """
    sitemap = download(url)
    if sitemap is None:
        # download() returns None on failure; there is nothing to parse.
        return
    # A bare '(.*?)' only matches empty strings, so the pattern must be
    # anchored on the <loc> tags that wrap each URL in a sitemap.
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        download(link)


# ID遍历爬虫
def iter_ID():
    """Download pages by counting IDs upward from 1.

    Stops once five downloads in a row have returned None; any successful
    download resets that count. Only works for sites whose page IDs are
    sequential. Returns None.
    """
    # Number of consecutive failed downloads that ends the crawl.
    max_errors = 5
    # Failed downloads seen since the last success.
    consecutive_failures = 0
    url_template = 'http://example.webscraping.com/view/-%d'
    for page in itertools.count(1):
        if download(url_template % page) is not None:
            consecutive_failures = 0
            continue
        consecutive_failures += 1
        if consecutive_failures == max_errors:
            break


# 链接爬虫
def link_crawler(seed_url, link_regex, max_depth=2, delay=5):
    """Crawl from ``seed_url`` following links that match ``link_regex``.

    seed_url   -- URL the crawl starts from (depth 0); also the base used
                  to turn relative links into absolute ones
    link_regex -- regular expression; a link is followed when
                  ``re.search(link_regex, link)`` finds a match
    max_depth  -- pages at exactly this depth are downloaded but their links
                  are not followed; depths start at 0 and the comparison is
                  ``!=``, so a negative value never stops the crawl
    delay      -- seconds the Throttle keeps between requests to one domain

    Returns None; the function is run for its download side effects.
    """
    crawl_queue = [seed_url]
    # Maps every queued URL to its depth. The seed must be registered at
    # depth 0, otherwise the first seen[url] lookup raises KeyError.
    seen = {seed_url: 0}
    throttle = Throttle(delay=delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        if html is None:
            # download() returns None on failure; there is nothing to parse.
            continue
        depth = seen[url]
        # Depth check: do not expand pages that sit at the depth limit.
        if depth == max_depth:
            continue
        for link in get_links(html):
            if not re.search(link_regex, link):
                continue
            # Build the absolute link.
            link = urlparse.urljoin(seed_url, link)
            if link not in seen:
                seen[link] = depth + 1
                crawl_queue.append(link)


# 返回链接列表
def get_links(html):
    """Return the href values of the <a> tags found in ``html``.

    html -- page source as a string; None or an empty string yields []

    Returns a list of the raw href strings in document order (relative
    links are not resolved here).
    """
    if not html:
        return []
    # Match an opening <a ...> tag and capture its quoted href value.
    # The pattern has to start at '<a': a leading ']' would require a literal
    # bracket before href and match no ordinary anchor.
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


class Throttle:
    """Add a delay between consecutive downloads to the same domain.

    delay   -- minimum number of seconds between two requests to one domain
    domains -- maps a domain (the URL's netloc) to the datetime of its
               most recent access
    """

    def __init__(self, delay):
        # Minimum gap, in seconds, between downloads from the same domain.
        self.delay = delay
        # Timestamp of the last access, keyed by domain.
        self.domains = {}

    def wait(self, url):
        """Sleep when the domain of ``url`` was accessed too recently.

        If the domain was last accessed less than ``delay`` seconds ago,
        sleep for the remaining time; then record the current time as the
        domain's last access. Returns None.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            # total_seconds() covers the whole interval; the .seconds
            # attribute drops the days part and the fractional second, which
            # can trigger a sleep even though the delay has long passed.
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # The domain was visited recently, so pause.
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


# Script entry point: crawl the example site, following only links that
# match the regular expression /(index|view).
if __name__=='__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)')

你可能感兴趣的:(python爬虫)