Scraping a novel with Python multithreading

Code written by a classmate, recorded here for reference.

# coding=utf-8
"""
本脚本用于抓取笔趣阁网站的整本小说,只需要传入书本的序号,然后等待即可!
1,自动读取小说名称
2,自动重排版与去广告
3,可自定义线程数量,线程越多越快,但会消耗更多的系统资源,并且有被网站Ban的可能
4,书本号可以从1开始遍历传入,实现爬取整个网站的小说!
5,可自定义爬取的超时时间
6,可自定义页面打开失败时的重试次数
7,爬取失败的章节与对应网址会打印到控制台与最终小说的文件中,方便用户手动添加
8,爬取已经封装成函数,只要传入需要的参数并运行即可,只要遍历传入书本序号,即可爬取整个网站的小说
    爬取单个小说的调用方法如下所示:
if __name__ == '__main__':
    is_homework = False  # 是否把爬取的最终结果打印到控制台
    book_number = 6685  # 被捉取的书本的序号
    worker_num = 31  # 抓取线程数,实测21比较稳定,大于50会被ban
    max_times = 3  # 每次打开页面的最大重试次数
    timeout = 20  # 单次抓取的超时时间(秒)
    charset = "gbk"  # 页面编码
    try:
        my_main(_is_homework=is_homework, _book_number=book_number, _worker_num=worker_num, _max_times=max_times,
                _timeout=timeout, _charset=charset)
    except KeyboardInterrupt:
        Utility.del_rubbish()
"""
import atexit
import math
import os
import re
import ssl
import threading
import time
from functools import wraps
from urllib.request import urlopen, Request

from lxml import etree


# Timing decorator: reports how long the decorated function ran
def fn_timer(function):
    @wraps(function)
    def function_timer(*args, **kwargs):
        t0 = time.time()
        result = function(*args, **kwargs)
        t1 = time.time()
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=t1 - t0))
        return result

    return function_timer


@atexit.register
def before_exit():
    # Registered with atexit: clean up temporary part files when the program exits
    Utility.del_rubbish()


class Utility:
    @staticmethod
    def my_list_cutter(start: int, end: int, *_lists: list) -> list:
        """
        自己的列表截取函数,可传入多个列表,最后以一个列表
        的格式按顺序把截取结果输出
        :param start: 开始的位置
        :param end: 结束的位置
        :param _lists: 被截取的列表,可传入多个
        :return: 截取结果,按照传入的顺序进行排列,包括开始位置,不包括结束位置
        """
        return [x[start:end] for x in _lists]

    @staticmethod
    def get_data(_path: str, _headers: dict, _char_set: str, _max_times: int, _timeout: int):
        """
        连接网络,获取数据
        :param _path: URL
        :param _headers: 请求头
        :param _char_set: 文字编码格式
        :param _max_times: 最大重试次数,如果为3,则最多尝试4次
        :param _timeout: 单次抓取的超时时间(秒)
        :return: 获取到的内容
        """
        times = 0
        req = Request(_path, headers=_headers)
        while times <= _max_times:
            times += 1
            try:
                conn = urlopen(req, timeout=_timeout)
                if conn.code == 200:
                    # Success: read and decode the page body
                    data = conn.read()
                    data = data.decode(encoding=_char_set, errors='ignore')
                    return data
                else:
                    print("Attempt %d to fetch \"%s\" failed with status code %d" % (times, _path, conn.code))
            except Exception:
                print("Attempt %d to fetch \"%s\" failed (timeout or connection error)" % (times, _path))
        return None

    @staticmethod
    def my_xpath(data: str, tag: str) -> list:
        """
        自己的数据解析函数
        :param data: 被解析的数据
        :param tag: 锚点
        :return: 列表
        """
        html = etree.HTML(data)
        _list = html.xpath(tag)
        return _list

    @staticmethod
    def del_rubbish() -> None:
        """
        删除程序产生的临时文件
        :return: 无
        """
        for root, dirs, files in os.walk("./"):
            [os.remove("./" + arg) for arg in files if re.match("part.*\\.txt", arg)]


class MyThread(threading.Thread):
    thread_counter = 0  # total threads created so far; used to number each thread
    working_thread = 0  # number of threads currently running

    def __init__(self, path_main: str, headers: dict, _charset, _href, name_charter: list, _filename: str,
                 _max_times: int, _timeout: int):
        threading.Thread.__init__(self)
        self.path_main = path_main
        self.headers = headers
        self.href = _href
        self.charset = _charset
        self.name_charter = name_charter
        self.fileName = _filename
        self.number = MyThread.thread_counter
        self._max_times = _max_times
        self._timeout = _timeout
        MyThread.thread_counter += 1

    @fn_timer
    def run(self) -> None:
        MyThread.working_thread += 1
        print("Thread %d started, live threads: %d" % (self.number, MyThread.working_thread))
        self.__task()
        MyThread.working_thread -= 1
        print("Thread %d finished, live threads: %d" % (self.number, MyThread.working_thread))

    def start(self):
        # Return self so creation and start can be chained: worker_list.append(worker.start())
        super().start()
        return self

    def __task(self) -> None:
        with open("./" + self.fileName + ".txt", mode="w") as file:
            for index, href_item in enumerate(self.href, 0):
                data_single = Utility.get_data(self.path_main + href_item, self.headers, self.charset, self._max_times,
                                               self._timeout)
                if data_single is None:
                    file.write("\"%s\"爬取失败\n\n" % self.name_charter[index])
                    file.write("这个王八蛋的地址是" + "\"" + href_item + "\"" + "\n\n")
                    continue
                html = etree.HTML(data_single)
                texts = html.xpath("//div[@id='content']/text()")
                text = "\n".join(texts)
                # 移除推广网址
                text = re.sub("http.*\\.html", "", text)
                # 移除广告:"请记住本书首发域名:biqiuge8.com。笔趣阁手机版阅读网址:wap.biqiuge8.com"
                text = re.sub("请记住本书首发域名.*biqiuge.*\\.com", "", text)
                # 移除广告:"手机用户请浏览m..阅读,更优质的阅读体验。"
                text = re.sub("手机用户.*阅读体验。", "", text)
                file.write(self.name_charter[index] + "\n")
                file.flush()
                file.write(text + "\n")
                file.flush()


def my_main(_book_number, _is_homework=False, _worker_num=21, _max_times=3, _timeout=20, _charset="gbk") -> None:
    """
    :param _is_homework: whether to print the final result to the console
    :param _book_number: index number of the book to scrape
    :param _worker_num: number of scraping threads
    :param _max_times: maximum retries per page
    :param _timeout: timeout per fetch (seconds)
    :param _charset: page encoding
    :return: None
    """
    Utility.del_rubbish()

    if _book_number is None:
        print("书本序号为空,我不知道要爬哪本书。")
        return

    if _worker_num >= 50:
        print("实测线程数超过50会被ban,是否继续?\n"
              "输入任意内容继续,输入q退出")
        temp = input()
        if temp is 'q' or temp is 'Q':
            exit(0xFC)
    # Disable HTTPS certificate verification globally so urlopen does not choke on the site's certificate
    ssl._create_default_https_context = ssl._create_unverified_context

    path_main = "https://www.biqiuge8.com"
    # Full chapter-index page of the book being scraped
    path_entity = "%s/book/%d/" % (path_main, _book_number)
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0",
    }
    # Fetch the chapter index page
    data_main = Utility.get_data(path_entity, headers, _charset, _max_times, _timeout)
    if data_main is None:
        print("Failed to open the chapter index URL \"%s\"" % path_entity)
        exit(0xFE)
    # Each <a> under div.listmain carries a chapter title and its relative URL
    list_main = Utility.my_xpath(data_main, "//div[@class='listmain']/dl/dd/a")
    name = [item_a.xpath("./text()")[0] for item_a in list_main]
    href = [item_a.xpath("./@href")[0] for item_a in list_main]
    book_name = Utility.my_xpath(data_main, "//div[@class='book']/div[@class='info']/h2/text()")[0]

    if len(name) != len(href):
        print("章节数与地址数不一致,请修正代码")
        exit(0xFF)

    # List that holds the worker threads
    worker_list = []

    # The index page lists the 6 newest chapters first; pull them out and reverse
    # them into chronological order. Their thread writes part-1.txt, which the
    # merge step appends after all the regular parts.
    cut_res = Utility.my_list_cutter(0, 6, name, href)
    new_name = list(reversed(cut_res[0]))
    new_href = list(reversed(cut_res[1]))
    spec_worker = MyThread(path_main=path_main, headers=headers, _charset=_charset, _filename="part-1", _href=new_href,
                           name_charter=new_name, _max_times=_max_times,
                           _timeout=_timeout)
    worker_list.append(spec_worker.start())

    # Drop the latest chapters from the main lists
    cut_res = Utility.my_list_cutter(6, len(name), name, href)
    name = cut_res[0]
    href = cut_res[1]

    name_len = len(name)
    part_work_task = math.floor(name_len / _worker_num)
    """
    为每一个线程分配任务
    """
    for i in range(_worker_num - 1):
        cut_res = Utility.my_list_cutter(i * part_work_task, (i + 1) * part_work_task, name, href)
        name_part = cut_res[0]
        href_part = cut_res[1]
        worker = MyThread(path_main=path_main, headers=headers, _charset=_charset, _filename="part%d" % i,
                          _href=href_part, name_charter=name_part,
                          _max_times=_max_times,
                          _timeout=_timeout)
        worker_list.append(worker.start())
    """
    为最后一个线程分配任务
    """
    if (_worker_num - 1) * part_work_task < name_len:
        cut_res = Utility.my_list_cutter((_worker_num - 1) * part_work_task, name_len, name, href)
        name_part = cut_res[0]
        href_part = cut_res[1]
        worker = MyThread(path_main=path_main, headers=headers, _charset=_charset,
                          _filename="part%d" % (_worker_num - 1),
                          _href=href_part, name_charter=name_part,
                          _max_times=_max_times,
                          _timeout=_timeout)
        worker_list.append(worker.start())
    """
    等待线程完成
    """
    for x in worker_list:
        x.join()
    """
    以下对文件进行合成
    """
    print("开始对文件进行合成")
    with open("./" + book_name + ".txt", mode="w") as resFile:
        my_range = [x for x in range(0, _worker_num - 1)]
        # 最后添加上最新章节
        my_range.append(-1)
        for i in my_range:
            try:
                temp_file_name = "./part" + str(i) + ".txt"
                temp_file = open(temp_file_name, mode="r")
                temp_content = temp_file.read()
                temp_content = re.sub("\n\n", "\n", temp_content)
                resFile.write(temp_content)
                temp_file.close()
            except FileNotFoundError:
                print("文件" + temp_file_name + "不存在")
                continue
    print("文件合成结束")
    """
    移除缓存文件
    """
    print("删除临时文件")
    Utility.del_rubbish()
    print("爬虫程序结束")
    if _is_homework:  # if this is homework, read the final result back and print it
        print("Printing the full text, as the client demands")
        with open("./" + book_name + ".txt", mode="r", encoding="utf-8") as resFile:
            for line in resFile:
                print(line, end="")
        print("End of the full text, as the client demands")


if __name__ == '__main__':
    is_homework = False  # whether to print the final result to the console
    book_number = 6685  # index number of the book to scrape
    worker_num = 31  # number of scraping threads; 21 is stable in testing, above 50 gets banned
    max_times = 3  # maximum retries per page
    timeout = 20  # timeout per fetch (seconds)
    charset = "gbk"  # page encoding
    try:
        my_main(_is_homework=is_homework, _book_number=book_number, _worker_num=worker_num, _max_times=max_times,
                _timeout=timeout, _charset=charset)
    except KeyboardInterrupt:
        Utility.del_rubbish()
