Code:
import re, requests, aiohttp, time, random, threading, asyncio
url = r'http://www.xbiquge.la/13/13959/'
# User-Agent pool; one entry is picked at random for each request
headers = [
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
]
# Outline-page patterns. The HTML tags inside the original patterns were
# stripped when the post was rendered; the tags below are reconstructions
# assuming xbiquge.la's markup at the time.
patternOutLine = (r"<h1>(.*?)</h1>",
                  r"<dd><a href='(.*?)' >.*?</a></dd>")
# Chapter-page patterns (same caveat as above)
patternBodySet = (r"<h1>(.*?)</h1>",
                  r'<div id="content">(.*?)</div>')
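# Hedged example of how the reconstructed patterns are meant to be used
# (the sample snippet is an assumption about the site's markup, not a capture):
#   sample = "<h1>某小说</h1><dd><a href='/13/13959/5939025.html' >第一章</a></dd>"
#   re.findall(patternOutLine[0], sample, re.S)  ->  ['某小说']
#   re.findall(patternOutLine[1], sample, re.S)  ->  ['/13/13959/5939025.html']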
tkList = []  # handles of the parser threads spawned per chapter
# 2064
# Fetch the outline page; return the novel title and an iterator of chapter URLs
def getOutline(url, patternOutLine):
    res = requests.get(url, headers=random.sample(headers, 1)[0])
    res.encoding = res.apparent_encoding  # let requests detect the real encoding
    novelName = re.findall(patternOutLine[0], res.text, re.S | re.M | re.I)[0]
    links = [url + link.split("/")[-1] for link in re.findall(patternOutLine[1], res.text, re.S | re.M | re.I)]
    print(len(links))  # number of chapter links found
    return novelName, iter(links)
# Initialise MongoDB; returns the 'biquge' database handle
def initDB():
    import pymongo
    client = pymongo.MongoClient("localhost:27017")
    cursor = client['biquge']  # a Database: cursor[novelName] is the per-novel collection
    return cursor
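# Usage sketch (assumes MongoDB is running locally and a crawl has finished):
#   db = initDB()
#   for doc in db[novelName].find().limit(3):
#       print(doc["chapter"])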
# Parse one chapter page and store it; runs inside a worker thread
def chapterDeal(patternBodySet, html, novelName, cursor, url):
    start = time.time()
    chapter = re.findall(patternBodySet[0], html, re.S | re.M | re.I)[0]
    body = re.findall(patternBodySet[1], html, re.S | re.M)[0]
    cursor[novelName].insert_one({"link": url, "chapter": chapter, "body": body})
    print(chapter + ' saved!', time.time() - start)
# Download one chapter page, then hand parsing/storage off to a thread
async def chapterDownload(url, patternBodySet, novelName, cursor):
    async with aiohttp.ClientSession() as session:
        try:
            start = time.time()
            async with session.get(url, headers=random.sample(headers, 1)[0], timeout=15) as resp:
                print('fetch time:', time.time() - start)
                tk = threading.Thread(target=chapterDeal,
                                      args=(patternBodySet, await resp.text(encoding="utf-8"), novelName, cursor, url))
                tk.start()
                tkList.append(tk)
            print('async time:', time.time() - start)
        except Exception as e:
            print('download failed:', url, e)  # was a bare "except: pass", which hid every error
# Main driver: schedule all chapter downloads, then wait for the worker threads
def run(urls, patternBodySet, novelName, cursor):
    loop = asyncio.get_event_loop()
    tasks = [chapterDownload(url, patternBodySet, novelName, cursor) for url in urls]
    loop.run_until_complete(asyncio.wait(tasks))
    for tk in tkList:
        tk.join()
    print("Run finished!")
if __name__ == '__main__':
    start = time.time()
    cursor = initDB()
    novelName, novelLink = getOutline(url, patternOutLine)
    print("Outline done, elapsed: {}".format(time.time() - start))
    run(novelLink, patternBodySet, novelName, cursor)
    print("All done, elapsed: {}".format(time.time() - start))
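A side note: passing bare coroutines to asyncio.wait was deprecated in Python 3.8 and is rejected from 3.11 on, so on newer interpreters run() can be written with asyncio.run and asyncio.gather instead. A minimal sketch, behavior otherwise unchanged:
def run(urls, patternBodySet, novelName, cursor):
    async def main():
        # gather schedules every download coroutine concurrently
        await asyncio.gather(*(chapterDownload(u, patternBodySet, novelName, cursor) for u in urls))
    asyncio.run(main())  # creates and closes the event loop itself
    for tk in tkList:
        tk.join()
    print("Run finished!")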
Output:
Outline done, elapsed: 1.6277616024017334
fetch time: 1.2178490161895752
async time: 1.2237696647644043
...
fetch time: 15.794588804244995
第1257章 神话大圣决战 saved! 0.001337289810180664
fetch time: 15.234058618545532
第四百三十三章 大坑货 saved! 0.0011713504791259766
Run finished!
All done, elapsed: 18.238994121551514
Notes:
First, a caveat: my machine has an i7 CPU, but perhaps because of its generation the performance seems unimpressive.
In the code below:
# Download one chapter page, then hand parsing/storage off to a thread
async def chapterDownload(url, patternBodySet, novelName, cursor):
    async with aiohttp.ClientSession() as session:
        try:
            start = time.time()
            async with session.get(url, headers=random.sample(headers, 1)[0], timeout=15) as resp:
                print('fetch time:', time.time() - start)
                tk = threading.Thread(target=chapterDeal,
                                      args=(patternBodySet, await resp.text(encoding="utf-8"), novelName, cursor, url))
                tk.start()
                tkList.append(tk)
the coroutines run essentially concurrently up to the `async with session.get(...)` call; after it, however, the hand-off into the threading part takes noticeably too long.
So the average per-page cost for this run (268 pages downloaded) can be estimated as:
time = (18.238994121551514 - 1.6277616024017334) / 268 ≈ 0.061 s
That is rather long, and it drags the crawler's throughput straight down. At this rate, crawling the whole novel (about 1400 chapters) would take roughly:
time = 1.6277616024017334 + 1400 × 0.061 ≈ 87.03 s
In practice, after raising the timeout in `async with session.get(url, headers=random.sample(headers, 1)[0], timeout=15) as resp:` from 15 s to 90 s, the crawl retrieved 1415 chapters,
against the novel's actual 1418, which largely confirms the estimate above. It also shows that the thread hand-off code still needs optimizing; its synchronization is inefficient somewhere.
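One plausible way to attack that bottleneck is to stop creating a fresh Thread per chapter and run chapterDeal on a shared thread pool instead, so the thread startup cost is paid only once. A minimal sketch, assuming thread startup (not parsing) is the slow part; this is not the original code, and the pool size is a guess:
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=8)  # pool size is an assumption; tune it

async def chapterDownload(url, patternBodySet, novelName, cursor):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=random.sample(headers, 1)[0], timeout=15) as resp:
            html = await resp.text(encoding="utf-8")
    # run the blocking parse/insert on a pooled thread instead of a new Thread()
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(pool, chapterDeal, patternBodySet, html, novelName, cursor, url)
With this variant, tkList and the join loop in run() become unnecessary, since each coroutine awaits its own run_in_executor call.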