I have recently been learning Python web scraping, and with some guidance from a colleague at work I was introduced to coroutines, so I started learning to write an asynchronous crawler with them.
Reference for the Python coroutine series: https://blog.csdn.net/qq_27825451/article/details/86218230
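As a warm-up, here is a minimal sketch of the async/await pattern the crawler below relies on (the coroutine names say_after and main and the 1-second delay are just placeholders for illustration):

import asyncio

async def say_after(delay, message):
    # await suspends this coroutine without blocking the event loop
    await asyncio.sleep(delay)
    print(message)

async def main():
    # run two coroutines concurrently and wait for both to finish
    await asyncio.gather(say_after(1, 'hello'), say_after(1, 'world'))

asyncio.run(main())  # Python 3.7+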
Main site URL:
url = 'http://www.quanshuwang.com/book/9/9055'
First, fetch every chapter url under the site's index page and store them in a list.
Main coroutine: split the url list into batches of 20; each url gets its own task (getContent(url, session)) that scrapes the chapter content.
The results are collected into the allList list; once the crawl finishes, the chapters are written out to txt files. A simplified sketch of the batching pattern comes first, followed by the full program.
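A minimal sketch of that batching idea (fetch here is only a placeholder standing in for the real getContent coroutine, and batch_size plays the role of the 20-url batch):

import asyncio
import aiohttp

async def fetch(url, session):
    # placeholder for the real getContent coroutine
    async with session.get(url) as resp:
        return await resp.text()

async def crawl(urls, batch_size=20):
    results = []
    async with aiohttp.ClientSession() as session:
        for i in range(0, len(urls), batch_size):
            # turn each batch of urls into tasks and wait for the whole batch
            tasks = [asyncio.ensure_future(fetch(u, session)) for u in urls[i:i + batch_size]]
            done, _ = await asyncio.wait(tasks)
            results.extend(task.result() for task in done)
            await asyncio.sleep(1)  # be polite between batches
    return results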
import os
import time
import requests
from lxml import etree
import aiohttp
import asyncio
# Crawl one complete novel
def getEbookUrlList():
    """
    Fetch the chapter url list from the site's index page.
    :return: list of chapter urls
    """
    url = 'http://www.quanshuwang.com/book/9/9055'  # index page of the novel
    headers = {
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'x-client-data': 'CKW1yQEIkbbJAQiktskBCMS2yQEIqZ3KAQi0n8oBCKijygEIyqXKAQixp8oBCOKoygEI8anKAQjLrsoB',
    }
    s = requests.session()
    response = s.get(url=url, headers=headers)
    response.encoding = 'gbk'  # watch out for the encoding when the page is Chinese
    response = response.text
    tree = etree.HTML(response)
    myList = tree.xpath('//*[@id="chapter"]/div[3]/div[3]/ul/div[2]//li/a/@href')
    return myList
async def run():
    """
    Main coroutine: create one getContent coroutine per chapter url.
    :return:
    """
    urls = getEbookUrlList()
    headers = {
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'x-client-data': 'CKW1yQEIkbbJAQiktskBCMS2yQEIqZ3KAQi0n8oBCKijygEIyqXKAQixp8oBCOKoygEI8anKAQjLrsoB',
    }
    print('Start!')
    allList = []
    async with aiohttp.ClientSession(headers=headers) as session:
        total = len(urls)
        num = 20  # process the urls in batches of 20
        times = (total - 1) // num
        for i in range(times + 1):  # handle the tasks batch by batch
            num1 = i * num
            num2 = (i + 1) * num
            urlList = urls[num1:num2]
            tasks = [asyncio.ensure_future(getContent(url, session)) for url in urlList]
            finished, pending = await asyncio.wait(tasks)
            for task in finished:
                # [title, contentList] = task.result()
                allList.append(task.result())  # allList: [[chapter 1 title, [para 1, para 2, ...]], [chapter 2 title, [para 1, para 2, ...]], ...]
            await asyncio.sleep(1)  # sleep 1 second after each batch so we do not put too much pressure on the server
    print(f"Crawl finished, {len(urls)} pages in total, writing txt files")
    filedir = 'Ebook'
    os.makedirs(filedir, exist_ok=True)  # make sure the output directory exists
    for chapter in allList:
        try:
            title = chapter[0]
            contentList = chapter[1]
            contents = title + '\n'
        except Exception as e:
            print("Error:", e)  # a failed request returned None, skip it
            continue
        for content in contentList:
            contents += '\t' + content + '\n'
        with open(f"{filedir}\\{title}.txt", 'w', encoding='utf-8') as f:
            f.write(contents)
async def getContent(url, session):
    """
    Fetch one chapter page and return [title, [paragraph 1, paragraph 2, ...]].
    """
    try:
        async with session.get(url, timeout=10) as response:
            response = await response.read()
            print(f"Fetching page {url}")
            tree = etree.HTML(response)
            title = tree.xpath('//*[@id="directs"]/div[1]/h1/strong/text()')[0]
            contentList = tree.xpath('//*[@id="content"]/text()')
            for i in range(len(contentList)):
                contentList[i] = "".join(contentList[i].split())  # strip whitespace such as \xa0
            # print(f'{title},{content}')
            return [title, contentList]  # result for one chapter: [title, [paragraph 1, paragraph 2, ...]]
    except Exception as e:
        print('Error:', url, e)
        await asyncio.sleep(1)  # request failed, sleep 1 second (the coroutine then returns None)
start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(run())
end = time.time()
print('Total time:', end - start)
# Without the sleeps the whole run takes about 13 s; with batching and a 1 s sleep per batch it takes about 50 s
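As a side note, on Python 3.7+ the manual event-loop calls above can be replaced with asyncio.run, which creates and closes the loop for you; a minimal equivalent:

import asyncio
import time

start = time.time()
asyncio.run(run())  # Python 3.7+: creates, runs and closes the event loop in one call
print('Total time:', time.time() - start)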
1. The encoding problem you will run into whenever a page is in Chinese: response.encoding = 'gbk'  # pay attention to the encoding of Chinese pages
2. Wrap the scraping in try/except to handle errors such as empty pages or failed requests; otherwise a single small error forces you to rerun the whole program.
3. Handling the "\xa0" character: contentList[i] = "".join(contentList[i].split()) (a short standalone example follows below)
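To illustrate note 3, here is a small standalone example of stripping \xa0 and other whitespace (the sample string is made up):

paragraph = '\xa0\xa0\u3000\u3000这是一段示例正文\xa0'
cleaned = "".join(paragraph.split())  # str.split() treats \xa0 and \u3000 as whitespace too
print(cleaned)  # -> 这是一段示例正文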