多线程爬取小说网站——全站架构完全爬取

多线程爬取小说网站“全书网”
1、可爬取所有分类的文章
2、包括小说的封面、作者、介绍以及每章的内容,就是说这个小说网站架构都爬下来了。
3、本来打算把全站内容都爬下来,后来发现自己还是太年轻:一本书的内容就要占 3~4 MB 的数据库空间,爬到 300 多本以后才恍然大悟——全站至少有十几万本书,按每本 3~4 MB 估算就需要几百 GB,我这台小服务器根本承担不起。于是改成先只爬所有小说的封面、作者、书名、介绍以及链接;省略章节具体内容后,总共爬了 15 万 3 千多本。
 

具体代码如下:

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import requests
import pymysql
from bs4 import BeautifulSoup
import _thread
import time
import threading

# Fetch a novel's synopsis and hand the record update to a worker thread
def getIntroduce(novel_href,id):
    """Fetch a novel's landing page and start a thread that stores the novel.

    Downloads ``novel_href`` (decoded as GBK), reads the synopsis from the
    ``#waa`` element and the chapter-index link from the first ``.reader``
    element, builds the ``UPDATE novel_info`` statement for the synopsis and
    passes it, with the chapter-index URL, to ``getChapterList`` on a new
    thread.  Pages that lack the synopsis or the chapter-index link are
    skipped silently.

    Args:
        novel_href: URL of the novel's landing page; also used as the key in
            the ``WHERE novel_href=...`` clause.
        id: Novel id forwarded unchanged to ``getChapterList``.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    header = {
        'Host':'www.quanshuwang.com',
        'Upgrade-Insecure-Requests':'1',
        'Connection':'keep-alive',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    # Brief pause before every request (presumably to throttle the crawl).
    time.sleep(0.2)
    page = requests.get(novel_href, headers=header, timeout=20)
    page.encoding = 'gbk'
    soup = BeautifulSoup(page.text, 'lxml')

    intro_nodes = soup.select("#waa")
    reader_links = soup.select(".reader")
    # Both elements are needed; indexing an empty result would raise IndexError.
    if not intro_nodes or not reader_links:
        return

    chapterHref = reader_links[0].get("href")
    if not chapterHref:
        return

    # Synopsis text
    introduce = intro_nodes[0].get_text()
    print(introduce)

    # The synopsis is scraped text and may contain quotes or backslashes;
    # escape it so it cannot break (or inject into) the statement.
    escape = pymysql.converters.escape_string
    sql = "UPDATE novel_info SET novel_introduce='%s' WHERE novel_href='%s'" % (
        escape(introduce), escape(novel_href))

    te = threading.Thread(target=getChapterList, args=(chapterHref, id, sql))
    te.start()

# Fetch the chapter index and store every chapter of one novel
def getChapterList(h,id,sql):
    """Run ``sql`` and then insert every chapter listed on the index page.

    Opens its own database connection, executes the caller-supplied
    statement, downloads each chapter linked from ``.dirconone > li`` and
    inserts one row into ``novel_chapter`` and one into
    ``novel_chapter_info`` per chapter.  Everything is committed once at the
    end; the connection is closed whether or not an error occurred.

    Args:
        h: URL of the chapter-index page.  Also stored as the chapter content
            when a chapter page has no ``.mainContenr`` element.
        id: Integer novel id written to both chapter tables.
        sql: Complete SQL statement executed before the chapters are
            inserted (``getIntroduce`` passes the synopsis UPDATE).
    """
    prefix = 'style5();'
    suffix = 'style6();'

    # Keyword arguments: PyMySQL 1.0+ no longer accepts these positionally.
    db = pymysql.connect(host="localhost", user="root", password="123456", database="wx_app")
    try:
        db.ping(True)
        time.sleep(0.2)
        index_page = requests.get(h, timeout=20)
        index_page.encoding = 'gbk'
        chapters = BeautifulSoup(index_page.text, 'lxml').select(".dirconone > li")
        print("开始输入-> 书ID:%d " % id)
        insertNovelInfo(sql,db)

        chapter_no = 0
        for chapter in chapters:
            links = chapter.select("a")
            if not links:
                # List item without a link: nothing to download.
                continue
            chapter_no += 1
            contHref = links[0].get("href")
            # Chapter title
            contTitle = links[0].get_text()

            chapter_page = requests.get(contHref, timeout=20)
            chapter_page.encoding = 'gbk'
            bodies = BeautifulSoup(chapter_page.text, 'lxml').select(".mainContenr")

            if bodies:
                content = bodies[0].get_text()
                # Remove the exact marker strings only.  str.lstrip/rstrip
                # take a *set of characters* and would also eat chapter text
                # that happens to begin or end with any of those characters.
                if content.startswith(prefix):
                    content = content[len(prefix):]
                if content.endswith(suffix):
                    content = content[:-len(suffix)]
            else:
                content = h

            print("章节:%s" % (contTitle))

            # Scraped text may contain quotes; escape before embedding in SQL.
            safe_title = db.escape_string(contTitle)
            safe_content = db.escape_string(content)

            # Both tables must receive the same chapter_id for a chapter.
            sql1 = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%d,%d,'%s')" % (id,chapter_no,safe_title)
            sql2 = "INSERT INTO `novel_chapter_info`(chapter_id,chapter_name,chapter_content,novel_id) VALUES(%d,'%s','%s',%d)" % (chapter_no,safe_title,safe_content,id)

            insertNovelInfo(sql1,db)
            insertNovelInfo(sql2,db)

        print("文件%s输入完成" % id)
        db.commit()
    finally:
        # Release the connection even when a download or insert failed.
        db.close()
def getContents(h):
    """Download a chapter page and return its text.

    The page is decoded as GBK and the text of the first ``.mainContenr``
    element is returned with a leading ``style5();`` and a trailing
    ``style6();`` marker removed.

    Args:
        h: URL of the chapter page.

    Returns:
        The chapter text, or ``h`` itself when the page has no
        ``.mainContenr`` element.
    """
    prefix = 'style5();'
    suffix = 'style6();'

    res = requests.get(h, timeout=20)
    res.encoding = 'gbk'
    bodies = BeautifulSoup(res.text, 'lxml').select(".mainContenr")

    if not bodies:
        return h

    content = bodies[0].get_text()
    # Remove the exact marker strings only.  str.lstrip/rstrip take a *set of
    # characters* and would also eat chapter text that happens to begin or
    # end with any of those characters.
    if content.startswith(prefix):
        content = content[len(prefix):]
    if content.endswith(suffix):
        content = content[:-len(suffix)]
    return content

def insertNovelInfo(sql,db):
    """Execute one SQL statement on ``db``; roll back and re-raise on failure.

    The statement is not committed here; committing is left to the caller.

    Args:
        sql: Complete SQL statement to execute.
        db: Open database connection providing ``cursor()`` and
            ``rollback()``.

    Raises:
        Exception: whatever ``cursor.execute`` raised, after the transaction
            has been rolled back and the failing statement printed.
    """
    cursor = db.cursor()
    try:
        cursor.execute(sql)
    except Exception:
        # Roll back, report the failing statement, then surface the real
        # database error to the caller (the previous ``exec(0)`` raised an
        # unrelated TypeError that hid it).
        db.rollback()
        print("mysql错误:",sql)
        raise
    finally:
        cursor.close()

# getIntroduce('http://www.quanshuwang.com/book_135083.html')

def test(i):
    """Debug helper: print ``i`` once."""
    # NOTE: a second ``test`` defined further down this file rebinds the
    # name, so this version is only reachable before that definition runs.
    print(i)

def init(count,num):
    """Process the novels at zero-based positions ``count`` .. ``num - 1``.

    For each position, a fresh connection looks up the ``novel_href`` and
    ``novel_id`` rows of the novel at that offset (ordered by the
    ``GROUP BY novel_id`` sub-query) and ``getIntroduce`` is called for every
    row returned.  A failed query is reported and that position is skipped.

    Args:
        count: First offset to process (inclusive).
        num: Offset at which to stop (exclusive).
    """
    query = ("select a.novel_href,a.novel_id from novel_info a inner join "
             "(select novel_id from novel_info GROUP BY novel_id  limit %s,1) b "
             "on a.novel_id=b.novel_id ")

    while count < num:
        # Default to "no rows" so a failed query skips this offset instead of
        # raising UnboundLocalError in the loop below.
        results = ()

        # Open the database connection (keyword arguments: PyMySQL 1.0+ no
        # longer accepts these positionally).
        db = pymysql.connect(host="localhost", user="root", password="123456", database="wx_app")
        try:
            db.ping(True)
            with db.cursor() as cursor:
                # Run the query, letting the driver substitute the offset.
                cursor.execute(query, (count,))
                # Fetch every matching row.
                results = cursor.fetchall()
        except pymysql.MySQLError:
            print("Error: unable to fecth data")
        finally:
            # Close the connection on success and on failure alike.
            db.close()

        for row in results:
            getIntroduce(row[0],row[1])
            print(row[0],row[1])
        count = count + 1

def test(res):
    """Debug helper: print ``res`` ten times, one per line."""
    for _ in range(10):
        print(res)

# Launch one worker thread per novel offset and wait for all of them.
threads = []
try:
    # Start the threads in a loop
    for start in range(0, 100):
        # Each thread handles the half-open offset range [start, start + 1),
        # i.e. exactly one novel.
        t = threading.Thread(target=init, args=(start, start + 1))
        t.start()
        # Record only threads that actually started, so join() is valid.
        threads.append(t)
except RuntimeError:
    # Thread.start() raises RuntimeError when no new thread can be created.
    # Other failures (including Ctrl-C) are no longer misreported as this.
    print("Error: 无法启动线程")
else:
    for t in threads:
        t.join()
    print("end")

里面的函数具体用起来要根据自己需求来组合,如有问题或者需要其它数据库资料可关注我个人微信公众号:yyjuan,发送本文链接获取相关资料

多线程爬取小说网站——全站架构完全爬取_第1张图片

你可能感兴趣的:(爬虫系列)