解析robots.txt文件,以避免下载禁止爬取的URL。使用Python自带的robotparser模块,在crawl循环中添加该检查:
添加模块:
import robotparser
def link_crawler4(seed_url, link_regex):
rp = robotparser.RobotFileParser()
crawl_queue = [seed_url]
# 跟踪哪个url之前见过 用set避免重复
seen = set(crawl_queue)
while crawl_queue:
url = crawl_queue.pop()
html = download3(url)
for link in get_links(html):
print '网址', link, '表达式', link_regex, '是否通过', re.search(link_regex, link)
# re.match只匹配字符串的开始,如果字符串开始不符合正则表达式,则匹配失败,函数返回None;
# 而re.search匹配整个字符串,直到找到一个匹配。
if re.search(link_regex, link):
# 地址连接 获得正确的地址
link = urlparse.urljoin(seed_url, link)
if link not in seen:
# 把已爬取的地址添加进set集合
seen.add(link)
crawl_queue.append(link)
有时我们需要使用代理访问某个网站。比如,Netflix屏蔽了美国以外的大多数国家。
# 下载网页 version:0.3
# num_retries:重试次数
# user_agent:用户代理
# proxy
def download4(url, user_agent='daimx', proxy=None, num_retries=2):
print 'Downloading:', url
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
if proxy:
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
try:
html = urllib2.urlopen(request).read()
except urllib2.URLError as e:
print 'Download error:', e.reason
html = None
if num_retries > 0:
# 4xx错误发生在请求存在问题时
# 5xx错误发生在服务端存在问题时
if hasattr(e, 'code') and 500 <= e.code < 600:
# 递归 重试
html = download4(url, user_agent, proxy, num_retries - 1)
return html
return html
如果我们爬取网站的速度过快,就会面临被封禁或造成服务器过载的风险。为了降低风险,我们可以在两次下载之间添加延时,从而对爬虫限速:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urlparse
import time
import datetime
class Throttle:
    """Add a delay between downloads to the same domain."""

    def __init__(self, delay):
        # Minimum number of seconds between two downloads from one domain
        self.delay = delay
        # Maps each domain to the datetime it was last accessed
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of url was accessed less than delay seconds ago.

        The access time recorded for the domain is updated on every call.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # Convert the timedelta to seconds before subtracting it from
            # the numeric delay; a number minus a timedelta is a TypeError
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # The domain was accessed recently, so sleep for the rest
                # of the delay
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
目前我们的爬虫会跟踪所有之前没有访问过的链接。但一些网站会动态生成页面内容,这样就会出现无限多的网页,页面就会无止境地链接下去。这种情况被称为爬虫陷阱。
想要避免这种情况,一个简单的方法是记录到达当前网页经过了多少个链接,也就是深度。当到达最大深度时,爬虫就不再向队列中添加该网页中的链接。要实现这一功能,我们需要修改seen变量。该变量原先只记录访问过的网页链接,现在修改为一个字典,增加页面深度的记录:
def link_crawler5(seed_url, link_regex, max_depth):
rp = robotparser.RobotFileParser()
crawl_queue = [seed_url]
# 记录爬取过的网址及其深度
seen = {}
while crawl_queue:
url = crawl_queue.pop()
html = download3(url)
depth = seen[url]
if depth != max_depth:
for link in get_links(html):
print '网址', link, '表达式', link_regex, '是否通过', re.search(link_regex, link)
# re.match只匹配字符串的开始,如果字符串开始不符合正则表达式,则匹配失败,函数返回None;
# 而re.search匹配整个字符串,直到找到一个匹配。
if re.search(link_regex, link):
# 地址连接 获得正确的地址
link = urlparse.urljoin(seed_url, link)
if link not in seen:
seen[link] = depth+1
crawl_queue.append(link)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import re
import itertools
import urlparse
import time
import datetime
"""
"""
# 首先定义download函数
def download(url, num_retries=2, user_agent='daimx', proxy=None):
print 'Downloading:', url
# 设置代理
headers = {'User-agent':user_agent}
# 设置请求头
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
if proxy:
proxy_params = {urlparse.urlparse(url).scheme:proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
try:
html = opener.open(request).read()
except urllib2.URLError as e:
print 'Download Error:', e.reason
html = None
if num_retries > 0:
if hasattr(e, 'code') and 500 < e.code < 600:
# 遇到5XX错误,重试下载
return download(url,num_retries-1,proxy=proxy)
return html
def crawl_sitemap(url):
    """Sitemap crawler: download every page listed in the sitemap at url.

    The downloaded pages are discarded; this function only fetches them.
    """
    sitemap = download(url)
    if sitemap is None:
        # The sitemap could not be downloaded, so there is nothing to crawl
        return
    # Each page address in a sitemap is wrapped in a <loc> element
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        download(link)
def iter_ID():
    """ID iteration crawler: download pages by counting IDs up from 1.

    This only works when the page IDs follow a regular numeric pattern.
    The loop ends after five downloads in a row have failed.
    """
    # Number of consecutive failed downloads that ends the crawl
    error_limit = 5
    # Failed downloads seen since the last successful one
    consecutive_failures = 0
    page_id = 1
    while consecutive_failures < error_limit:
        page_html = download('http://example.webscraping.com/view/-%d' % page_id)
        page_id += 1
        if page_html is not None:
            # A successful download resets the failure streak
            consecutive_failures = 0
        else:
            consecutive_failures += 1
def link_crawler(seed_url, link_regex, max_depth=2, delay=5):
    """Link crawler: follow links matching link_regex, starting at seed_url.

    Args:
        seed_url: URL the crawl starts from (depth 0); extracted links are
            resolved against it.
        link_regex: Regular expression a link must match (re.search) to be
            followed.
        max_depth: Links found on a page at this depth are not queued.
        delay: Seconds to wait between two downloads from the same domain.
    """
    crawl_queue = [seed_url]
    # Maps every queued URL to its depth so nothing is crawled twice; the
    # seed must be registered up front, otherwise the depth lookup below
    # raises KeyError
    seen = {seed_url: 0}
    throttle = Throttle(delay=delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        depth = seen[url]
        if html is None or depth == max_depth:
            # Either nothing was downloaded or the depth limit was reached
            continue
        for link in get_links(html):
            if re.search(link_regex, link):
                # Build an absolute URL
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen[link] = depth + 1
                    crawl_queue.append(link)
def get_links(html):
    """Return the list of href values of all anchor tags found in html.

    An empty list is returned when html is None or empty.
    """
    if not html:
        return []
    # Matches the href attribute of <a ...> tags, with either quote style
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
class Throttle:
    """Add a delay between downloads to the same domain."""

    def __init__(self, delay):
        # Minimum number of seconds between two downloads from one domain
        self.delay = delay
        # Maps each domain to the datetime it was last accessed
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of url was accessed less than delay seconds ago.

        The access time recorded for the domain is updated on every call.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() keeps fractions of a second and whole days,
            # both of which the .seconds attribute leaves out
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # The domain was accessed recently, so sleep for the rest
                # of the delay
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
if __name__ == '__main__':
    # Crawl the example site, following only index and view links
    link_crawler(seed_url='http://example.webscraping.com',
                 link_regex='/(index|view)')