Python多线程抓取笔趣阁小说

使用requests、xpath抓取

具体代码:

import os
import time
import random
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
import pymysql

# HTTP request headers carrying a desktop Firefox User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) '
        'Gecko/20100101 Firefox/68.0'
    ),
}


def NovelSpider(url, timeout=10):
    """
    Scrape one novel: its metadata plus the text of every listed chapter.

    :param url: URL of the novel's index page; each chapter href found on
        that page is appended to this URL to build the chapter URL.
    :param timeout: seconds to wait on each HTTP request before giving up,
        so a stalled connection cannot block the calling thread forever.
    :return: dict with the keys 'title', 'author', 'info', 'synopsis',
        'type' and 'sections'; 'sections' is a list of dicts with the keys
        'sectionsTitle', 'sectionsUrl' and 'sectionsContent'.
    :raises IndexError: if an expected element is missing from the index page.
    :raises requests.RequestException: on network failure or timeout.
    """
    novel = {}

    # Send the module-level browser-like headers with every request.
    html = requests.get(url, headers=headers, timeout=timeout).content
    tree = etree.HTML(html)
    print("抓取基本信息...")
    # Novel title
    novel['title'] = tree.xpath('//*[@id="info"]/h1/text()')[0]
    # Author
    novel['author'] = tree.xpath('//*[@id="info"]/p[1]/a/text()')[0]
    # Basic info line
    novel['info'] = textHandler(tree.xpath('//*[@id="info"]/p[3]/text()')[0])
    # Synopsis
    novel['synopsis'] = textHandler(tree.xpath('//*[@id="intro"]/text()')[0])
    # Category: taken from the breadcrumb text, the part after the first '>'
    novel['type'] = tree.xpath('/html/body/div[2]/div[1]/text()')[2].split(">")[1].replace(" ", "")
    print("基本信息抓取完成:", novel)

    print("抓取章节信息...")
    novel['sections'] = []
    # Select only anchors that carry an href and read title and href from the
    # same element, so the two can never get out of step.
    for link in tree.xpath('//*[@id="list"]/dl/dd/a[@href]'):
        section_title = link.text
        section_url = url + link.get('href')
        print("正在抓取:", section_title, section_url)
        novel_content = requests.get(section_url, headers=headers, timeout=timeout).content
        novel_tree = etree.HTML(novel_content)
        # The chapter body is split over several text nodes; join them once.
        content_all = "".join(novel_tree.xpath('//*[@id="content"]/text()'))

        # Each chapter record holds its title, its URL and its full text.
        novel['sections'].append({"sectionsTitle": section_title, "sectionsUrl": section_url, "sectionsContent": content_all})
        print("Successful:", section_title)
        # Pause 0.5s between chapter requests to avoid tripping anti-scraping limits.
        time.sleep(0.5)

    return novel

def textHandler(text):
    """
    Clean up a scraped string.

    The substitutions run in this order: CRLF pairs are dropped, tabs are
    dropped, ordinary spaces are dropped, and finally each non-breaking
    space (U+00A0) becomes one ordinary space.

    :param text: the raw string to clean.
    :return: the cleaned string.
    """
    substitutions = (
        ("\r\n", ""),
        ("\t", ""),
        (" ", ""),
        ("\xa0", " "),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

def main(i):
    """
    Scrape novel number ``i`` and store it, with its chapters, in MySQL.

    The novel URL is built from the module-level ``base_url`` (assigned in the
    ``__main__`` block) followed by ``i`` and a trailing slash.

    :param i: numeric id of the novel on the site.
    :return: None
    :raises Exception: anything raised by the scrape or by the database calls
        propagates to the caller; in that case nothing is committed.
    """
    url = base_url + str(i) + '/'
    print("正在抓取, 第", i, "本小说, URL=", url)
    # Scrape before connecting: the scrape takes a long time, and an idle
    # connection held per worker thread can exhaust the server's limit.
    novel = NovelSpider(url)  # dict of novel metadata plus a 'sections' list

    # Keyword arguments only: PyMySQL 1.0+ rejects a positional host, and
    # 'passwd'/'db' are deprecated aliases of 'password'/'database'.
    conn = pymysql.connect(host='localhost', user="root", password="root", database="readminiprogram")
    try:
        with conn.cursor() as cursor:
            # Store the novel's metadata.
            sql_novel = "INSERT INTO novel_test (novelTitle, author, baseInfo, novelSynopsis, novelType) VALUES (%s, %s, %s, %s, %s)"
            val_novel = (novel['title'], novel['author'], novel['info'], novel['synopsis'], novel['type'])
            cursor.execute(sql_novel, val_novel)

            # Read the row back to get the novel id (first column).
            # NOTE(review): this matches by title, so with duplicate titles it
            # may return an older row - confirm whether titles are unique.
            cursor.execute("SELECT * FROM novel_test WHERE novelTitle = %s", (novel['title'],))
            db_novel = cursor.fetchone()
            print("已经存入,", db_novel)

            # Store every chapter, linked to the novel row.
            sql_novel_chapter = "INSERT INTO chapter_test (novelid_test, chapterTitle, chapterUrl, chapterContent) VALUES (%s, %s, %s, %s)"
            for j in novel['sections']:
                val_novel_chapter = (db_novel[0], j['sectionsTitle'], j['sectionsUrl'], j['sectionsContent'])
                cursor.execute(sql_novel_chapter, val_novel_chapter)
                print(j['sectionsTitle'], ", 已保存...")
        # Commit only after the novel and all its chapters were written.
        conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()

if __name__ == "__main__":

    # Prefix of every novel URL; main() reads this module-level name.
    base_url = 'https://www.52bqg.com/book_'

    # Scraping is I/O-bound, so threads help, but hundreds of concurrent
    # workers would defeat the per-request pause meant to avoid anti-scraping
    # limits. Keep the pool modest.
    MAX_WORKERS = 16

    # The context manager waits for all submitted work and shuts the pool down.
    with ThreadPoolExecutor(MAX_WORKERS) as thread_executor:
        futures = {i: thread_executor.submit(main, i) for i in range(4, 120000)}

        # submit() itself does not raise worker errors: an exception raised
        # inside main() is stored on the Future, so read it back explicitly
        # or the failure is silently lost.
        for i, future in futures.items():
            error = future.exception()
            if error is not None:
                print("抓取失败, 第", i, "本小说:", error)

数据库sql:
Python多线程抓取笔趣阁小说_第1张图片
Python多线程抓取笔趣阁小说_第2张图片

抓取具体结果:
Python多线程抓取笔趣阁小说_第3张图片
Python多线程抓取笔趣阁小说_第4张图片

你可能感兴趣的:(【20】Python)