Day 20: Asynchronous Crawler (1)
Today's plan is to start building an asynchronous crawler framework in Python to feed content into the blog. The learning project and practice source code are here:
GitHub source code
Framework design ideas
The framework supports different spiders
Since the content structure of almost every target site is different, building a separate spider for each site is unavoidable in practice, so the crawler framework should automatically discover and load the user-defined spider tasks. Each standalone spider should follow a fixed pattern so that the framework knows how to drive it.
The framework lets a spider define handler functions for the fields it scrapes
A spider task should include:
- The URLs to crawl, or URL rules; a task can crawl level by level according to those rules
- The methods that parse the fetched content into a unified template (for storage in the database), e.g. parsing the title, parsing the body text, and so on (a sketch of such a task object follows this list)
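The engine code later in this post unpacks such a task with self.fetch(**task) and then awaits task.callback(data), so the task object has to behave like a mapping and carry its own parse callback. A minimal sketch under those assumptions (the actual Seed class in the repo may look different):

class Seed(dict):
    """Hypothetical sketch of a crawl task: a dict whose keys match fetch()'s
    keyword arguments, plus the coroutine that will handle the response."""
    def __init__(self, url, callback, headers=None, data_type='normal', proxy=None):
        super().__init__(url=url, headers=headers, data_type=data_type, proxy=proxy)
        self.callback = callback  # coroutine called with the fetched data

With this shape, the spider example near the end of the post can build seeds like Seed(url, self.next_parse, headers=headers_) and push them onto the queue unchanged.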
The framework provides an async database interface that spiders use to write data
In this project the framework sits alongside the web project, so the two share the same MySQL helper functions.
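As a rough illustration of what that shared async interface could look like, here is a sketch based on aiomysql; the helper names are hypothetical and the repo's actual helpers may differ:

import aiomysql

async def create_pool(loop, **kw):
    # Hypothetical shared helper: one connection pool reused by the web app and the spiders.
    return await aiomysql.create_pool(host=kw.get('host', '127.0.0.1'), port=kw.get('port', 3306),
                                      user=kw['user'], password=kw['password'], db=kw['db'],
                                      charset='utf8', autocommit=True, loop=loop)

async def execute(pool, sql, args=()):
    # Hypothetical write helper a spider could await once it has parsed a page.
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute(sql, args)
            return cur.rowcount

A spider's final handler could then do something like execute(pool, 'insert into article (title, content, category) values (%s, %s, %s)', (title, body, category)); the article table and its columns here are assumptions, not schema from the repo.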
The framework can be extended with some common helper functions
The base packages are asyncio and aiohttp.
Example code for fetching a page with aiohttp:
async def fetch(self, url, headers=None, data_type='normal', proxy=None, **kw):
    try:
        print("url is {} proxy {}".format(url, proxy))
        async with self.session.get(url, headers=headers, proxy=proxy) as r:
            print("get {} status_code is {}".format(url, r.status))
            if r.status == 200:
                if data_type == 'image':
                    data = await r.read()
                else:
                    data = await r.text()
                return data
            else:
                print("get {} is err: {}".format(url, r.status))
                return None
    except Exception as e:
        print("err is {}".format(e))
        return None
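To try the coroutine above outside the framework, a quick standalone variant looks like this (a sketch only; here the session is passed in explicitly instead of living on self, which is not how the framework organises it):

import asyncio
import aiohttp

async def fetch_once(session, url, data_type='normal', headers=None, proxy=None):
    # Same logic as above, with the session as an explicit argument.
    try:
        async with session.get(url, headers=headers, proxy=proxy) as r:
            if r.status == 200:
                return await (r.read() if data_type == 'image' else r.text())
            print("get {} is err: {}".format(url, r.status))
    except Exception as e:
        print("err is {}".format(e))
    return None

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch_once(session, 'http://www.jianshu.com')
        print(len(html) if html else 'fetch failed')

asyncio.get_event_loop().run_until_complete(main())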
Framework core
The main idea is to use an asyncio.Queue to coordinate the work across many tasks:
- The engine automatically discovers the spiders defined in the spider directory
- Spiders push the URLs they want to fetch onto the queue
import asyncio
import os
import re
import socket
import sys
from datetime import datetime

import aiohttp

# get_logger, Spider and Seed come from the project's own modules (not shown here).

MAX_TASKS = 100

class Engine:
    def __init__(self, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.max_tasks = MAX_TASKS
        self.conn = aiohttp.TCPConnector(family=socket.AF_INET,
                                         verify_ssl=False,
                                         use_dns_cache=True)
        self.session = aiohttp.ClientSession(loop=self.loop, connector=self.conn)
        self.q = asyncio.Queue(loop=self.loop, maxsize=MAX_TASKS)
        self.logger = get_logger('engine')

    def run(self):
        self.logger.info('Spider Engine started!')
        start_time = datetime.now()
        loop = self.loop
        try:
            loop.run_until_complete(self.crawler())
            # self.session.close()
        except KeyboardInterrupt:
            for task in asyncio.Task.all_tasks():
                task.cancel()
            loop.run_forever()
        finally:
            end_time = datetime.now()
            self.logger.info('Time usage: {}'.format(end_time - start_time))
    async def crawler(self):
        await self.init_spiders()
        workers = [asyncio.Task(self.worker(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        await self.q.join()
        for w in workers:
            w.cancel()
    async def worker(self):
        try:
            while True:
                task = await self.q.get()
                if isinstance(task, Spider):
                    await task.start()
                elif isinstance(task, Seed):
                    data = await self.fetch(**task)
                    await task.callback(data)
                # tell the queue this item has been processed
                self.q.task_done()
        except asyncio.CancelledError:
            pass
    async def init_spiders(self):
        old_path = os.path.dirname(os.path.abspath(__file__))
        print('old_path', old_path)
        for dirName, subdirList, fileList in os.walk('/Users/lynn/xcode/LearnPython/src/webapp/kospider/spiders'):
            sys.path.append(dirName)
            for fname in fileList:
                if re.match(r'[_a-zA-Z]+\.py$', fname):
                    mod_name = fname.split('.')[0]
                    mod = __import__(mod_name, globals(), locals())
                    for attr in dir(mod):
                        print(attr)
                        if attr != mod_name:  # the file that defines a spider must share its name with the class
                            continue
                        fn = getattr(mod, attr)
                        # How do we check that fn really is a class in Python? (see the note after this class)
                        spider = fn(self.q)
                        self.q.put_nowait(spider)
            if len(subdirList) > 0:
                # note: os.walk only honours changes made to subdirList in place
                subdirList[:] = subdirList[1:]
        sys.path.append(old_path)
    async def fetch(self, url, headers=None, data_type='normal', proxy=None, **kw):
        try:
            print("url is {} proxy {}".format(url, proxy))
            async with self.session.get(url, headers=headers, proxy=proxy) as r:
                print("get {} status_code is {}".format(url, r.status))
                if r.status == 200:
                    if data_type == 'image':
                        data = await r.read()
                    else:
                        data = await r.text()
                    return data
                else:
                    print("get {} is err: {}".format(url, r.status))
                    return None
        except Exception as e:
            print("err is {}".format(e))
            return None
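Two small pieces the engine assumes but this post does not show: the Spider base class (each spider receives the shared queue in its constructor and exposes an async start()), and an answer to the question raised in init_spiders about telling whether fn is a class. A hedged sketch of both (the real base class in the repo may differ):

import inspect

class Spider:
    """Minimal sketch of the base class: the engine instantiates every spider
    with the shared queue (fn(self.q)) and later awaits its start() coroutine."""
    def __init__(self, queue):
        self.q = queue

    async def start(self):
        raise NotImplementedError

def looks_like_spider(obj):
    # inspect.isclass answers "is this a class?"; issubclass then narrows it down
    # to Spider subclasses, so other module-level names are skipped.
    return inspect.isclass(obj) and issubclass(obj, Spider) and obj is not Spider

init_spiders could call looks_like_spider(fn) before instantiating, rather than relying only on the file-name convention.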
Spider code example
Spider code can live anywhere; the location is specified when the framework is initialised.
import asyncio

# Spider and Seed are imported from the crawler framework (module path omitted here).

dict_filter = {
    # Travel: on the road
    # '5AUzod': "电影",
    # Movies
    '1hjajt': "电影",
    # Beauty, skincare, outfits
    '025246642a19': "时尚",
    # Food
    'qqfxgN': "美食",
    # E-sports, gaming
    '0856231c8e98': "游戏",
    # Pets
    '88b891fe2acb': "体育",
    # Design
    '3063e24c8622': "时尚",
    # Sports & fitness
    'snqjhw': "体育",
    # Photography
    '7b2be866f564': "旅游"
}

headers_ = {
    "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +",
    "Referer": "http://www.jianshu.com"
}
class JianShu(Spider):
    def __init__(self, queue):
        self.q = queue

    async def start(self):
        print('JianShu Spider start')
        await asyncio.sleep(0.3)
        for url in dict_filter:
            seed = Seed('http://www.jianshu.com/c/{}'.format(url), self.next_parse, headers=headers_)
            self.q.put_nowait(seed)
        print('JianShu Spider end')

    async def next_parse(self, data):
        print(data)
This still needs more work. The main idea is that this is where a spider declares its URLs, the rules for generating new URLs, and their handler functions; the final handler is what eventually writes the data out. A sketch of that next step follows below.
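For example, next_parse could pull article links out of the listing page and queue a new Seed for each of them, ending in a parse_article handler. This is only a sketch: parse_article and the article-URL regex are assumptions, not code from the repo:

import re

class JianShu(Spider):
    def __init__(self, queue):
        self.q = queue

    async def start(self):
        for slug in dict_filter:
            # Queue every collection listing page; next_parse handles the response.
            self.q.put_nowait(Seed('http://www.jianshu.com/c/{}'.format(slug),
                                   self.next_parse, headers=headers_))

    async def next_parse(self, data):
        # Guessed rule: article links on a listing page look like href="/p/<12 chars>".
        if not data:
            return
        for article in set(re.findall(r'href="/p/(\w{12})"', data)):
            self.q.put_nowait(Seed('http://www.jianshu.com/p/{}'.format(article),
                                   self.parse_article, headers=headers_))

    async def parse_article(self, data):
        # Final handler: this is where the title/body would be parsed and handed
        # to the async database layer (still on the to-do list below).
        print(len(data) if data else 'empty article')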
Next steps
- Make the spider engine configurable (e.g. the spiders directory; see the sketch below)
- Provide some common methods in the spider base class
- Write the data out
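On the first item: the design note earlier says the spider location should be specified when the framework is initialised, while init_spiders currently hardcodes an absolute path. A minimal sketch of that change (the spider_dir parameter name is my own, not from the repo):

import os

class Engine:
    def __init__(self, loop=None, spider_dir=None):
        # Default to a spiders/ directory next to the engine module when none is given.
        self.spider_dir = spider_dir or os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'spiders')

init_spiders would then walk self.spider_dir instead of the hardcoded path.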