A classmate's code, recorded here for reference.
# -*- coding: utf-8 -*-
"""
本脚本用于抓取笔趣阁网站的整本小说,只需要传入书本的序号,然后等待即可!
1,自动读取小说名称
2,自动重排版与去广告
3,可自定义线程数量,线程越多越快,但会消耗更多的系统资源,并且有被网站Ban的可能
4,书本号可以从1开始遍历传入,实现爬取整个网站的小说!
5,可自定义爬取的超时时间
6,可自定义页面打开失败时的重试次数
7,爬取失败的章节与对应网址会打印到控制台与最终小说的文件中,方便用户手动添加
8,爬取已经封装成函数,只要传入需要的参数并运行即可,只要遍历传入书本序号,即可爬取整个网站的小说
爬取单个小说的调用方法如下所示:
if __name__ == '__main__':
is_homework = False # 是否把爬取的最终结果打印到控制台
book_number = 6685 # 被捉取的书本的序号
worker_num = 31 # 抓取线程数,实测21比较稳定,大于50会被ban
max_times = 3 # 每次打开页面的最大重试次数
timeout = 20 # 单次抓取的超时时间(秒)
charset = "gbk" # 页面编码
try:
my_main(_is_homework=is_homework, _book_number=book_number, _worker_num=worker_num, _max_times=max_times,
_timeout=timeout, _charset=charset)
except KeyboardInterrupt:
Utility.del_rubbish()
"""
import atexit
import os
import re
import ssl
import threading
import time
from functools import wraps
from urllib.request import urlopen, Request
from lxml import etree
# Timing decorator: prints how long the wrapped function takes to run
def fn_timer(function):
@wraps(function)
def function_timer(*args, **kwargs):
t0 = time.time()
result = function(*args, **kwargs)
t1 = time.time()
print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=t1 - t0))
return result
return function_timer
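# A minimal sketch of how fn_timer is used, with a hypothetical slow_func (not part of
# this script) that sleeps for one second:
#
#     @fn_timer
#     def slow_func():
#         time.sleep(1)
#
#     slow_func()  # prints something like: [finished slow_func in 1.00s]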
@atexit.register
def before_exit():
    # Clean up temporary part files whenever the interpreter exits.
    Utility.del_rubbish()
class Utility:
@staticmethod
def my_list_cutter(start: int, end: int, *_lists: list) -> list:
"""
自己的列表截取函数,可传入多个列表,最后以一个列表
的格式按顺序把截取结果输出
:param start: 开始的位置
:param end: 结束的位置
:param _lists: 被截取的列表,可传入多个
:return: 截取结果,按照传入的顺序进行排列,包括开始位置,不包括结束位置
"""
        res = []
        for x in _lists:
            # Slicing already includes the start index and excludes the end index
            res.append(x[start:end])
        return res
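    # A small worked example of my_list_cutter (for illustration only): slicing two
    # lists at once from index 1 up to, but not including, index 3.
    #
    #     Utility.my_list_cutter(1, 3, [0, 1, 2, 3], ['a', 'b', 'c', 'd'])
    #     # -> [[1, 2], ['b', 'c']]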
@staticmethod
def get_data(_path: str, _headers: dict, _char_set: str, _max_times: int, _timeout: int):
"""
连接网络,获取数据
:param _path: URL
:param _headers: 请求头
:param _char_set: 文字编码格式
:param _max_times: 最大重试次数,如果为3,则最多尝试4次
:param _timeout: 单次抓取的超时时间(秒)
:return: 获取到的内容
"""
times = 0
req = Request(_path, headers=_headers)
        while times <= _max_times:
            times += 1
            try:
                conn = urlopen(req, timeout=_timeout)
                if conn.code == 200:
                    # Success: read and decode the page
                    data = conn.read()
                    data = data.decode(encoding=_char_set, errors='ignore')
                    return data
                else:
                    print('Attempt %d to fetch "%s" failed with status code %d' % (times, _path, conn.code))
            except Exception:
                print('Attempt %d to fetch "%s" timed out or raised an error' % (times, _path))
return None
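    # A minimal sketch of a direct get_data call; the URL and headers here are
    # placeholders, not ones the scraper actually uses:
    #
    #     page = Utility.get_data("https://example.com/", {"User-Agent": "Mozilla/5.0"}, "utf-8", 3, 20)
    #     if page is not None:
    #         print(page[:100])  # first 100 characters of the fetched page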
@staticmethod
def my_xpath(data: str, tag: str) -> list:
"""
自己的数据解析函数
:param data: 被解析的数据
:param tag: 锚点
:return: 列表
"""
html = etree.HTML(data)
_list = html.xpath(tag)
return _list
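    # For illustration, extracting the link texts from a throwaway HTML snippet:
    #
    #     Utility.my_xpath("<div><a href='/1.html'>ch1</a><a href='/2.html'>ch2</a></div>", "//a/text()")
    #     # -> ['ch1', 'ch2']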
@staticmethod
def del_rubbish() -> None:
"""
删除程序产生的临时文件
:return: 无
"""
for root, dirs, files in os.walk("./"):
[os.remove("./" + arg) for arg in files if re.match("part.*\\.txt", arg)]
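    # For example, after a run with 21 workers this removes part0.txt through part20.txt
    # plus part-1.txt (the "latest chapters" part) from the working directory; the merged
    # novel file is kept, since its name does not start with "part".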
class MyThread(threading.Thread):
    thread_counter = 0  # total number of threads created so far; used to number each thread
    working_thread = 0  # number of threads currently running
def __init__(self, path_main: str, headers: dict, _charset, _href, name_charter: list, _filename: str,
_max_times: int, _timeout: int):
threading.Thread.__init__(self)
self.path_main = path_main
self.headers = headers
self.href = _href
self.charset = _charset
self.name_charter = name_charter
self.fileName = _filename
self.number = MyThread.thread_counter
self._max_times = _max_times
self._timeout = _timeout
MyThread.thread_counter += 1
    @fn_timer
    def run(self) -> None:
        MyThread.working_thread += 1
        print("Thread %d started; live threads: %d" % (self.number, MyThread.working_thread))
        self.__task()
        MyThread.working_thread -= 1
        print("Thread %d finished; live threads: %d" % (self.number, MyThread.working_thread))
    def start(self):
        # Return self so a worker can be created, started, and collected in one expression
        super().start()
        return self
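    # Because start() returns self, my_main() below creates, starts, and collects a
    # worker in a single expression:
    #
    #     worker_list.append(MyThread(...).start())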
    def __task(self) -> None:
        with open("./" + self.fileName + ".txt", mode="w", encoding="utf-8") as file:
            for index, href_item in enumerate(self.href):
                data_single = Utility.get_data(self.path_main + href_item, self.headers, self.charset,
                                               self._max_times, self._timeout)
                if data_single is None:
                    file.write('Failed to scrape "%s"\n\n' % self.name_charter[index])
                    file.write('Its URL is "%s"\n\n' % href_item)
                    continue
                html = etree.HTML(data_single)
                texts = html.xpath("//div[@id='content']/text()")
                text = "\n".join(texts)
                # Strip promotional URLs
                text = re.sub("http.*\\.html", "", text)
                # Strip the ad "请记住本书首发域名:biqiuge8.com。笔趣阁手机版阅读网址:wap.biqiuge8.com"
                text = re.sub("请记住本书首发域名.*biqiuge.*\\.com", "", text)
                # Strip the ad "手机用户请浏览m..阅读,更优质的阅读体验。"
                text = re.sub("手机用户.*阅读体验。", "", text)
                file.write(self.name_charter[index] + "\n")
                file.flush()
                file.write(text + "\n")
                file.flush()
def my_main(_book_number, _is_homework=False, _worker_num=21, _max_times=3, _timeout=20, _charset="gbk") -> None:
"""
:param _is_homework: 是否把爬取的最终结果打印到控制台
:param _book_number: 被捉取的书本的序号
:param _worker_num: 抓取线程数
:param _max_times: 每次打开页面的最大重试次数
:param _timeout: 单次抓取的超时时间(秒)
:param _charset: 页面编码
:return: 无
"""
Utility.del_rubbish()
    if _book_number is None:
        print("The book ID is empty, so there is nothing to scrape.")
        return
    if _worker_num >= 50:
        print("In practice, more than 50 threads gets you banned. Continue?\n"
              "Type anything to continue, or q to quit.")
        temp = input()
        if temp == 'q' or temp == 'Q':
            exit(0xFC)
    # Skip HTTPS certificate verification; the site's certificate chain may not validate
    ssl._create_default_https_context = ssl._create_unverified_context
path_main = "https://www.biqiuge8.com"
    # Full chapter-index page of the target book
    path_entity = "%s/book/%d/" % (path_main, _book_number)
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0",
    }
    # Fetch the chapter index
    data_main = Utility.get_data(path_entity, headers, _charset, _max_times, _timeout)
    if data_main is None:
        print('Failed to open the chapter index "%s"' % path_entity)
        exit(0xFE)
list_main = Utility.my_xpath(data_main, "//div[@class='listmain']/dl/dd/a")
name = [item_a.xpath("./text()")[0] for item_a in list_main]
href = [item_a.xpath("./@href")[0] for item_a in list_main]
book_name = Utility.my_xpath(data_main, "//div[@class='book']/div[@class='info']/h2/text()")[0]
    if len(name) != len(href):
        print("The chapter count and the URL count differ; please fix the code.")
        exit(0xFF)
    # List that holds the worker threads
    worker_list = []
    # Slice off the "latest chapters" block (the first six index entries, listed
    # newest first, hence the reversal below)
    cut_res = Utility.my_list_cutter(0, 6, name, href)
    new_name = [x for x in reversed(cut_res[0])]
    new_href = [x for x in reversed(cut_res[1])]
spec_worker = MyThread(path_main=path_main, headers=headers, _charset=_charset, _filename="part-1", _href=new_href,
name_charter=new_name, _max_times=_max_times,
_timeout=_timeout)
worker_list.append(spec_worker.start())
    # Drop the "latest chapters" block from the main chapter list
    cut_res = Utility.my_list_cutter(6, len(name), name, href)
    name = cut_res[0]
    href = cut_res[1]
    name_len = len(name)
    part_work_task = name_len // _worker_num
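    # A worked example of the split: with 206 remaining chapters and 21 workers,
    # part_work_task = 206 // 21 = 9, so the first 20 workers take 9 chapters each
    # (180 in total) and the last worker picks up the remaining 26 below.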
"""
为每一个线程分配任务
"""
for i in range(_worker_num - 1):
cut_res = Utility.my_list_cutter(i * part_work_task, (i + 1) * part_work_task, name, href)
name_part = cut_res[0]
href_part = cut_res[1]
worker = MyThread(path_main=path_main, headers=headers, _charset=_charset, _filename="part%d" % i,
_href=href_part, name_charter=name_part,
_max_times=_max_times,
_timeout=_timeout)
worker_list.append(worker.start())
"""
为最后一个线程分配任务
"""
if (_worker_num - 1) * part_work_task < name_len:
cut_res = Utility.my_list_cutter((_worker_num - 1) * part_work_task, name_len, name, href)
name_part = cut_res[0]
href_part = cut_res[1]
worker = MyThread(path_main=path_main, headers=headers, _charset=_charset,
_filename="part%d" % (_worker_num - 1),
_href=href_part, name_charter=name_part,
_max_times=_max_times,
_timeout=_timeout)
worker_list.append(worker.start())
"""
等待线程完成
"""
[x.join() for x in worker_list]
"""
以下对文件进行合成
"""
print("开始对文件进行合成")
with open("./" + book_name + ".txt", mode="w") as resFile:
my_range = [x for x in range(0, _worker_num - 1)]
# 最后添加上最新章节
my_range.append(-1)
for i in my_range:
try:
temp_file_name = "./part" + str(i) + ".txt"
temp_file = open(temp_file_name, mode="r")
temp_content = temp_file.read()
temp_content = re.sub("\n\n", "\n", temp_content)
resFile.write(temp_content)
temp_file.close()
except FileNotFoundError:
print("文件" + temp_file_name + "不存在")
continue
print("文件合成结束")
"""
移除缓存文件
"""
print("删除临时文件")
Utility.del_rubbish()
print("爬虫程序结束")
    if _is_homework:  # if this is homework, read the final result back and print it to the console
        print("Printing the full text, as the client demands")
        with open("./" + book_name + ".txt", mode="r", encoding="utf-8") as resFile:
            for line in resFile:
                print(line, end="")
        print("End of the full text, as the client demands")
if __name__ == '__main__':
    is_homework = False  # whether to print the final result to the console
    book_number = 6685  # ID of the book to scrape
    worker_num = 31  # number of scraping threads; 21 is stable in practice, above 50 gets banned
    max_times = 3  # maximum number of retries per page
    timeout = 20  # timeout for a single fetch, in seconds
    charset = "gbk"  # page encoding
try:
my_main(_is_homework=is_homework, _book_number=book_number, _worker_num=worker_num, _max_times=max_times,
_timeout=timeout, _charset=charset)
except KeyboardInterrupt:
Utility.del_rubbish()