Crawling is a very effective way to obtain data from the internet, and a distributed crawler, which coordinates many machines, is the way to speed data collection up. A distributed crawler starts from a set of seed URLs; after fetching their contents, it extracts the next round of URLs according to certain rules, and it repeats this process until every link has been visited or some configured limit is reached. We pick one reasonably powerful machine as the master and a group of ordinary machines as workers: the master keeps track of which URLs have been crawled, de-duplicates them, and hands them out, while the workers request different pages in parallel to remove the throughput bottleneck of a single machine. The implementation below addresses each of these concerns in turn.
Husky is an efficient distributed computing system for big data, and its Python frontend, PyHusky, is well suited to implementing the distributed crawler designed here. If you are not yet comfortable with crawler basics or with using PyHusky, it may help to read the earlier posts in this series first.
First, create a CrawlerConfig class that holds the crawler's initial settings and matching rules:
# crawler_config.py
import re

class CrawlerConfig:
    def __init__(self):
        # Time out after 10 seconds
        self.timeout = 10
        # Seed set of urls
        self.urls_init = []
        # List of (scheme, address) proxy pairs; download_content picks one at random
        self.proxies = [('http', '127.0.0.1:8118')]
        # urls that match these regex will be processed
        self.rules_include = []
        # after processing rules_include, urls that match
        # the following regex will be dropped
        self.rules_exclude = []
        self.history = None
        self.hdfspath_output = '/datasets/crawl/openrice/'
        self.parse_handlers = dict()
        # whether to use selenium
        self.is_selenium = False
        # maximum number of crawl rounds; a negative value means no limit
        self.num_iters = -1

    def set_rules_include(self, rules):
        self.rules_include = [re.compile(rule) for rule in rules]

    def set_rules_exclude(self, rules):
        self.rules_exclude = [re.compile(rule) for rule in rules]

    def add_parse_handler(self, pattern, handler):
        self.parse_handlers[re.compile(pattern)] = handler
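To make the configuration concrete, here is a minimal sketch of how such an object might be filled in; the seed url, regular expressions, and output path below are placeholders for illustration only, not values from a real crawl:

# Hypothetical configuration example; urls, regexes and paths are placeholders
from crawler_config import CrawlerConfig

config = CrawlerConfig()
config.urls_init = ['http://www.example.com/']
# Follow links under example.com, but skip everything below /login
config.set_rules_include([r'http://www\.example\.com/.*'])
config.set_rules_exclude([r'http://www\.example\.com/login.*'])
config.hdfspath_output = '/datasets/crawl/example/'
config.num_iters = 5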
Next, create a utility module, named crawler_utils.py, that implements the low-level work of requesting pages and extracting links:
# crawler_utils.py: import the modules we need
import requests
import grequests
import urllib2
import sys
import time
import random
import urlparse
from lxml import html
from selenium.webdriver.common.proxy import *
from selenium import webdriver
Add the functions that fetch page sources to crawler_utils.py:
# Fetch a static page over HTTP
def download_content(url, to_secs, proxies):
    retries = 0
    timeout = 10
    while True:
        try:
            # Rotate proxies and user agents to reduce the chance of being blocked
            (proto, addr) = random.choice(proxies)
            agent = random.choice(['Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
                                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36'])
            headers = {'User-Agent': agent}
            response = requests.get(url, timeout=timeout, proxies={proto: addr}, headers=headers)
            sys.stderr.write('\033[0;32mdownloaded: \033[0m'+url+'\033[0;31m'+'---retries: '+str(retries)+' times\033[0m\n')
            return response.content
        except requests.exceptions.RequestException as e:
            retries += 1
            timeout = min(to_secs, timeout+10)
            if retries == 10:
                break
            sys.stderr.write('request failed: '+url+" "+str(e)+"\n")
            time.sleep(random.choice([5, 8]))
            continue
    # Give up after 10 retries; the caller treats None as a failed download
    return None
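As a quick sanity check, download_content can be tried on its own in an interpreter; the proxy address is just the placeholder from the config above, and example.com stands in for a real target site:

# Hypothetical standalone test; the proxy address and url are placeholders
proxies = [('http', '127.0.0.1:8118')]
page = download_content('http://www.example.com/', 60, proxies)
if page is not None:
    print 'fetched %d bytes' % len(page)
else:
    print 'gave up after repeated failures'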
# Fetch a dynamic (JavaScript-rendered) page with selenium and phantomjs
def download_content_selenium(url, to_secs):
    retries = 0
    timeout = 20
    while True:
        try:
            phantomjs_path = '/data/opt/brew/lib/node_modules/phantomjs/lib/phantom/bin/phantomjs'
            service_args = [
                '--proxy=http://xxxx:8000/',
                '--proxy-type=http',
            ]
            driver = webdriver.PhantomJS(executable_path=phantomjs_path, service_args=service_args)
            driver.set_page_load_timeout(timeout)
            driver.set_script_timeout(timeout)
            driver.get(url)
            html_source = driver.page_source.encode('utf8')
            sys.stderr.write('\033[0;32mThis is selenium-----downloaded: \033[0m'+url+'\033[0;31m'+'---retries: '+str(retries)+' times\033[0m\n')
            driver.quit()
            return html_source
        except:
            # Make sure the phantomjs process is shut down before retrying
            try:
                driver.quit()
                sys.stderr.write('\033[0;33mDriver quit success '+str(retries)+'\033[0m')
            except:
                sys.stderr.write('\033[0;31mQuit fail \033[0m')
            retries += 1
            if retries == 4:
                break
            timeout = min(to_secs, timeout+10)
            sys.stderr.write('request failed: '+url+" "+"\n")
            time.sleep(random.choice([5, 8]))
            continue
    # Give up after 4 retries; the caller treats None as a failed download
    return None
Add the functions that extract the links contained in a page source to crawler_utils.py:
def urlprocess(base_url, url):
    # Resolve relative links against the base url and strip fragment identifiers
    return urlparse.urljoin(base_url, url.split('#')[0])

# Return the matched url if it passes the include/exclude rules, otherwise None
def urlpredicate(url, rules_include, rules_exclude):
    if not url.startswith('http'):
        return None
    included = False
    matched_url = ""
    for pattern in rules_include:
        if pattern.match(url) != None:
            included = True
            matched_url = pattern.match(url).group()
            break
    if not included:
        return None
    for pattern in rules_exclude:
        if pattern.match(url) != None:
            return None
    return matched_url

# Return the links in a page that satisfy the rules
def parse_browser_removetag_userules(_url, response, rules_include, rules_exclude):
    try:
        parsed_body = html.fromstring(response)
        joined_link = []
        for url in parsed_body.xpath('//a/@href'):
            url = urlpredicate(unicode(urlprocess(_url, url)), rules_include, rules_exclude)
            if url == None:
                continue
            joined_link.append(unicode(urlprocess(_url, url)))
        return joined_link
    except:
        print 'parse failed: '+_url
        return []
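To see the rule matching in isolation, the helpers above can be exercised against a tiny hand-written page; the html snippet and the regular expressions are invented for illustration:

# Hypothetical test of the link extraction; html snippet and regexes are made up
import re
rules_include = [re.compile(r'http://www\.example\.com/.*')]
rules_exclude = [re.compile(r'http://www\.example\.com/login.*')]
sample_html = '<a href="/shop/1">shop</a><a href="/login">login</a>'
links = parse_browser_removetag_userules('http://www.example.com/', sample_html,
                                         rules_include, rules_exclude)
print links   # expected: [u'http://www.example.com/shop/1']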
The configuration class and the utilities for fetching pages and extracting links are now in place. Next I will describe in detail how PyHusky calls these tools to implement the distributed crawler; the file is named crawler_dist.py:
# crawler_dist.py
import timeit
import re
import json
# PyHusky bindings
import bindings.frontend as fe
import bindings.frontend.env
# crawler_utils.py lives in the hcrawl package
from hcrawl import crawler_utils as ct

# Used by map(): choose the static or the selenium-based downloader; is_selenium defaults to False
def mapper_download(url, timeout, proxies, is_selenium):
    if is_selenium:
        response = ct.download_content_selenium(url, timeout)
    else:
        response = ct.download_content(url, timeout, proxies)
    return (url, response)

# Used by flat_map(): return the list of links in a page that satisfy the rules
def mapper_parse(htmltuple, rules_include, rules_exclude):
    if htmltuple[1] is not None:
        res = ct.parse_browser_removetag_userules(htmltuple[0], htmltuple[1], rules_include, rules_exclude)
        return res
    else:
        return []

# Used by map(): extract the useful fields from each page; how to extract them is defined in config
def parse_html(config, url_html_pair):
    parse_handlers = config.parse_handlers
    def mapper((url, html)):
        for pattern in parse_handlers:
            if pattern.match(url) != None:
                return parse_handlers[pattern](pattern.match(url).group(), html)
        return json.dumps({"type": "handlers_fail", "url": url})
    return url_html_pair.map(mapper)

def log_msg(msg):
    print "[pyhusky-log]:"+msg
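parse_html dispatches each (url, page source) pair to the handler whose pattern matches the url, so every handler registered through add_parse_handler should accept the matched url and the page source and return one serialized record. A minimal hypothetical handler that only pulls out the page title could look like this (the pattern and the field names are placeholders of my own):

# Hypothetical parse handler: emit the page title as one json line
import json
from lxml import html as lxml_html

def handle_example_page(url, page_source):
    try:
        titles = lxml_html.fromstring(page_source).xpath('//title/text()')
        return json.dumps({"type": "example_page", "url": url,
                           "title": titles[0] if titles else ''})
    except:
        return json.dumps({"type": "parse_fail", "url": url})

# Registered on the config so that parse_html() can dispatch on the url pattern:
# config.add_parse_handler(r'http://www\.example\.com/shop/.*', handle_example_page)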
Building on the above, the crawler_run function below is where PyHusky comes in, and it is the key to the distributed crawler:
def crawler_run(config, existing_progress=None):
    # Read the seed urls; if they are given as a list, parallelize() distributes them across the machines
    if type(config.urls_init) is list:
        this_round_urls = fe.env.parallelize(config.urls_init).cache()
        history = fe.env.parallelize(config.urls_init).cache()
    # Otherwise treat urls_init as an HDFS path and load() the data from there
    if type(config.urls_init) is str:
        this_round_urls = fe.env.load(config.urls_init).cache()
        history = fe.env.load(config.history).cache()
    iter = 0
    start = timeit.default_timer()
    next_len = 1
    # Keep crawling until the number of urls for the next round, next_len, drops to zero
    while next_len:
        iter += 1
        log_msg("downloading")
        # map() runs mapper_download on every machine and returns (url, page source) pairs
        this_round_htmltuple = this_round_urls.map(lambda url: mapper_download(url, config.timeout, config.proxies, config.is_selenium)).cache()
        log_msg("urls: "+str(this_round_htmltuple.count()))
        log_msg("writing to hdfs")
        # parse_html() extracts the wanted fields from the page sources and writes them to HDFS
        parse_html(config, this_round_htmltuple).write_to_hdfs(config.hdfspath_output+str(iter))
        # flat_map() collects the next round of urls returned by mapper_parse;
        # the map()/reduce_by_key()/map() chain that follows removes duplicates
        next_round_urls = this_round_htmltuple.flat_map(lambda htmltuple: mapper_parse(htmltuple, config.rules_include, config.rules_exclude)) \
            .map(lambda url: (url, 1)) \
            .reduce_by_key(lambda a, b: a+b) \
            .map(lambda (url, cnt): url) \
            .cache()
        log_msg("next round urls count: " + str(next_round_urls.count()))
        # uncache() releases the memory held by this round's pages
        this_round_htmltuple.uncache()
        log_msg("calculating difference with history")
        history = history.map(lambda x: x.decode('utf8')).cache()
        # difference() against the history drops urls that have already been crawled
        url_diff = next_round_urls.difference(history).cache()
        # Release this_round_urls and let url_diff become the new this_round_urls
        this_round_urls.uncache()
        this_round_urls = url_diff.cache()
        log_msg("create new history")
        # Append the new urls to the history so that no url is crawled twice across rounds
        next_round_history = history.concat(url_diff).cache()
        history.uncache()
        history = next_round_history
        print 'husky_crawler_info: round '+str(iter)+' speed: '+str(next_len)+' pages within '+str((timeit.default_timer()-start))+' seconds'
        # Number of new urls to crawl in the next round
        next_len = url_diff.count()
        url_diff.uncache()
        hlen = history.count()
        start = timeit.default_timer()
        print 'husky_crawler_info: history size: ', hlen-next_len, ', next round size: ', next_len
        # Round limit; setting num_iters to a negative value disables it
        if iter == config.num_iters:
            break
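Finally, a driver script that ties everything together might look roughly like the sketch below. The urls, rules and output path are placeholders, the import paths assume crawler_config.py sits next to crawler_utils.py inside hcrawl, and the PyHusky session is assumed to have been initialized beforehand as described in the earlier posts:

# run_crawler.py -- hypothetical driver; urls, rules and paths are placeholders
# PyHusky session setup (see the earlier posts) is assumed to have been done already
from hcrawl.crawler_config import CrawlerConfig   # assumed location of crawler_config.py
from crawler_dist import crawler_run

config = CrawlerConfig()
config.urls_init = ['http://www.example.com/']
config.set_rules_include([r'http://www\.example\.com/.*'])
config.set_rules_exclude([r'http://www\.example\.com/login.*'])
config.hdfspath_output = '/datasets/crawl/example/'
config.is_selenium = False
config.num_iters = 3
crawler_run(config)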
So far, relying on PyHusky as the distributed computing engine, we have achieved the original goal of fetching internet data quickly on many machines at once. In the upcoming posts, I will walk through concrete crawls of a static website and of a dynamic one.