增加代码:
with ThreadPoolExecutor(max_workers=10) as thread_pool:
content_chapter_list = thread_pool.map(
self.__get_content_chapter, link_chapter)
使用 map() 方法把任务提交到线程池并发执行,最大线程数限制为 10 个。map() 按参数列表的顺序返回各任务的结果,所以章节的顺序不会乱。map() 的第一个参数是线程要执行的方法,第二个参数是可迭代对象,其中的每个元素依次作为该方法的参数。建议线程数不要设置太多,以免给服务器造成过大压力。
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import requests
import time
import sys
import re
import threading
class fiction():
    '''Download a novel chapter by chapter and save it as one text file.

    The table-of-contents page is fetched first to collect the chapter
    links; the chapter pages are then downloaded in a thread pool and
    written to disk in table-of-contents order.
    '''

    __chapter_download = 0  # number of chapters downloaded so far
    __chapter_total = 0  # total number of chapters
    __RETRY_LIMIT = 10  # attempts made for every HTTP request
    __RETRY_DELAY = 1  # seconds to wait between two attempts
    __REQUEST_TIMEOUT = 10  # timeout in seconds passed to requests.get

    def __init__(self, name, url_ws, url_lp, encode,
                 attrs_div_lp=None, attrs_div_ct=None):
        '''Create a downloader.

        :param name: title of the novel, also used as the output file name
        :param url_ws: url of the web site, prepended to every chapter href
        :param url_lp: url of the link (table-of-contents) page
        :param encode: encoding used to decode chapter pages and to write
            the output file
        :param attrs_div_lp: attributes of the div tags holding the chapter
            links on the table-of-contents page (None means no filter)
        :param attrs_div_ct: attributes of the div tag holding the text on a
            chapter page (None means no filter)
        '''
        # Chapters are fetched from several pool threads at once, so the
        # progress counter needs a lock.
        self.__progress_lock = threading.Lock()
        self.Update(name, url_ws, url_lp, encode, attrs_div_lp, attrs_div_ct)

    def Update(self, name, url_ws, url_lp, encode,
               attrs_div_lp=None, attrs_div_ct=None):
        '''Reset the parameters.

        All parameters must be reset together, otherwise errors may occur.
        The parameters have the same meaning as in the constructor.
        '''
        self.__name = name  # title
        self.__url_ws = url_ws  # url of the web site
        self.__url_lp = url_lp  # url of the link (table-of-contents) page
        # A fresh dict per call: a `{}` default would be shared by every
        # instance created without the argument.
        self.__attrs_div_lp = {} if attrs_div_lp is None else attrs_div_lp
        self.__attrs_div_ct = {} if attrs_div_ct is None else attrs_div_ct
        self.__encode = encode  # encoding to use

    def __request(self, url, message):
        '''Fetch url, retrying when the request fails.

        :param url: address to request
        :param message: %-template with a %s (error kind) and a %d
            (attempt number) slot, printed after every failed attempt
        :return: the response object, or None when every attempt failed
        '''
        for attempt in range(1, self.__RETRY_LIMIT + 1):
            try:
                return requests.get(url, timeout=self.__REQUEST_TIMEOUT)
            # requests raises its own exception classes; the builtin
            # ConnectionError / TimeoutError never match them.  Timeout is
            # tested first because ConnectTimeout derives from both.
            except requests.exceptions.Timeout:
                kind = 'TimeoutError'
            except requests.exceptions.ConnectionError:
                kind = 'ConnectionError'
            except Exception:
                # Still best-effort, but Ctrl-C is no longer swallowed.
                kind = 'OtherError'
            print(message % (kind, attempt))
            if attempt < self.__RETRY_LIMIT:
                time.sleep(self.__RETRY_DELAY)
        return None

    def __get_Link_chapter(self):
        '''Collect the chapter links from the table-of-contents page.

        The page is parsed, the div tags matching the configured attributes
        are located, and the a tags inside them are gathered.

        :return: list of a tags in page order, or None when the page could
            not be fetched or parsed
        '''
        req_lp = self.__request(self.__url_lp, '尝试获取目录页%s:%d')
        if req_lp is None:
            print('获取目录页失败')
            return None
        try:
            req_lp.encoding = req_lp.apparent_encoding
            # Build the BeautifulSoup object with the lxml parser.
            bs_lp = BeautifulSoup(req_lp.text, 'lxml')
            link_chapter = []
            for div in bs_lp.find_all('div', attrs=self.__attrs_div_lp):
                # Anchors without an href cannot be downloaded; skipping
                # them avoids a KeyError in the middle of the download.
                link_chapter.extend(div.find_all('a', href=True))
            return link_chapter
        except TypeError:
            print('目录页解析异常:TypeError')
            return None

    def __get_content_chapter(self, link):
        '''Download one chapter and return its text.

        :param link: a tag taken from the table-of-contents page; it holds
            the chapter name and url
        :return: the chapter name followed by the chapter text, or the
            chapter name alone when the page could not be fetched or parsed
        '''
        # link.string is None when the a tag contains nested tags.
        name_chapter = link.string or link.get_text()
        # Join site url and href to get the url of the chapter page.
        url_chapter = self.__url_ws + link['href']

        content = None
        req_ct = self.__request(url_chapter, '尝试获取章节链接:%s%d')
        if req_ct is None:
            print('获取链接失败:' + name_chapter)
        else:
            try:
                req_ct.encoding = self.__encode
                bs_ct = BeautifulSoup(req_ct.text, 'lxml')
                div_ct = bs_ct.find('div', attrs=self.__attrs_div_ct)
                if div_ct is None:
                    # Without this check the literal text "None" would be
                    # written as the chapter body.
                    print('章节页解析异常:OtherError ' + name_chapter)
                else:
                    # NOTE(review): this literal was garbled into a raw line
                    # break in the original; '<br/>' (the way BeautifulSoup
                    # serializes line-break tags) is presumed - confirm.
                    markup = str(div_ct).replace(
                        '<br/>', '\n').replace('\xa0', ' ')
                    content = BeautifulSoup(markup, 'lxml').get_text()
            except TypeError:
                print('章节页解析异常:TypeError ' + name_chapter)
            except Exception:
                print('章节页解析异常:OtherError ' + name_chapter)

        if content is None:
            content_chapter = name_chapter + '\n\n'
        else:
            content_chapter = name_chapter + '\n\n' + content + '\n\n'

        # Count the finished chapter and refresh the progress display.
        with self.__progress_lock:
            self.__chapter_download += 1
            finished = self.__chapter_download
        if self.__chapter_total:
            sys.stdout.write('下载进度:%.1f%%' % (
                finished / self.__chapter_total * 100) + '\r')
        return content_chapter

    def write(self, path_save):
        '''Download every chapter and write the novel to a text file.

        Nothing is written when the table of contents cannot be obtained.

        :param path_save: directory in which "<name>.txt" is created
        '''
        import os

        link_chapter = self.__get_Link_chapter()
        if link_chapter is None:
            # The failure has already been reported by the fetch step.
            return

        path_save = os.path.join(path_save, self.__name + '.txt')
        self.__chapter_total = len(link_chapter)
        self.__chapter_download = 0  # restart the progress for this run

        # Open the thread pool; map() yields the results in the order of
        # link_chapter, so the chapters stay in order.
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            content_chapter_list = list(thread_pool.map(
                self.__get_content_chapter, link_chapter))

        with open(path_save, 'w', encoding=self.__encode) as file:
            file.writelines(content_chapter_list)
        print('<<' + self.__name + '>>下载完成')
if __name__ == '__main__':
    # Script entry point: download one novel and report the elapsed time.
    began = time.time()
    novel = fiction(
        name='雪中悍刀行',
        url_ws='http://www.xbiquge.la',
        url_lp='http://www.xbiquge.la/0/745/',
        encode='utf-8',
        attrs_div_lp={'id': 'list'},
        attrs_div_ct={'id': 'content'},
    )
    novel.write(r'C:\Users\HP\Desktop\pytxt')
    elapsed = time.time() - began
    # %d truncates the elapsed time to whole seconds.
    print('用时:%ds' % elapsed)