Python novel scraper with requests + pyquery + multithreading
import requests
from pyquery import PyQuery as pq
from concurrent.futures import ThreadPoolExecutor
def download(url, encoding="utf-8"):
    # Fetch a page and return its text; fall back to an empty string on any failure.
    try:
        response = requests.get(url)
        response.encoding = encoding
        if response:  # a requests Response is falsy for 4xx/5xx status codes
            return response.text
    except Exception as e:
        print(e)
    return ""
def getContent(url, chapterName):
    # Download one chapter page, extract the #content block, and strip extra blank lines.
    doc = pq(download(url))
    content = doc("#content")
    print(chapterName + " downloaded")
    return chapterName + "\n" + str(content.text()).replace("\n\n", "\n").replace("\n。", "")
def getNovel(chapters_list_url):
    # Parse the book's index page: title and author from #info, chapter links from #list.
    soup = pq(download(chapters_list_url))
    info = soup("#info")
    novel = info.find("h1").text() + "\n" + info.find("p").eq(0).text() + "\n"
    chapters = soup("#list")
    executor = ThreadPoolExecutor(max_workers=5)
    # Drop the first 12 links in #list, which are not part of the chapter list proper.
    items = list(chapters.items("a"))[12:]
    content_list = []
    for a in items:
        title = a.text()
        url = "https://www.biquge.tw" + a.attr("href")
        work = executor.submit(getContent, url, title)  # fetch this chapter in a worker thread
        content_list.append(work)
    # Collect results in submission order so the chapters stay in sequence.
    for work in content_list:
        novel += work.result() + "\n"
    executor.shutdown()
    return novel
def saveNovel(content, path):
    # The with statement flushes and closes the file on exit, so no explicit flush/close is needed.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
if __name__ == '__main__':
    novel = getNovel("https://www.biquge.tw/509_509388/")
    # Use the first line of the text (the book title) as the file name.
    saveNovel(novel, novel[:novel.find("\n")] + ".txt")
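
The thread-pool pattern in getNovel — submit every chapter, then call result() on the futures in submission order — is what keeps the chapters in the right order even though downloads finish out of order. Below is a minimal standalone sketch of the same idea; fake_fetch is a made-up stand-in for the real network call.

from concurrent.futures import ThreadPoolExecutor
import random
import time

def fake_fetch(i):
    # Simulate a download that takes a variable amount of time.
    time.sleep(random.random())
    return "chapter %d" % i

with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(fake_fetch, i) for i in range(10)]
    # result() blocks until each future finishes; iterating the futures in
    # submission order yields the chapters in order, no matter which
    # download completed first.
    ordered = [f.result() for f in futures]

print(ordered)

Using the executor as a context manager here is equivalent to the explicit executor.shutdown() call in getNovel: both wait for all submitted tasks to finish before releasing the worker threads.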