Novel Scraping (2)

This crawler scrapes novels under multiple categories of the Biquge (笔趣阁) style site duquanben.com. The crawl runs in four layers (category list → book list per category → book detail page → chapter list and chapter content) and stores everything in a MySQL database, which then serves as the data source for a website.

The code is as follows:

import re
import urllib.request
# database operations
import pymysql


class Sql(object):
    db = pymysql.connect(host="localhost", port=3306, db="novel", user="root", password="root", charset="utf8")
    print('Connected to the database!')

    def addnovel(self, sort_id, sort_name, bookname, imgurl, description, status, author):
        cur = self.db.cursor()
        # parameterised placeholders let pymysql escape quotes inside the scraped text
        cur.execute(
            'insert into novel(booktype,sortname,name,imgurl,description,status,author) values(%s,%s,%s,%s,%s,%s,%s)',
            (sort_id, sort_name, bookname, imgurl, description, status, author))
        lastrowid = cur.lastrowid
        cur.close()
        self.db.commit()
        return lastrowid

    def addchapter(self, lastrowid, chaptname, content):
        cur = self.db.cursor()
        cur.execute('insert into chapter(novelid,title,content) values(%s,%s,%s)',
                    (lastrowid, chaptname, content))
        cur.close()
        self.db.commit()
#
#
mysql = Sql()


def type():  # layer 1: get the novel categories from the front page
    html = urllib.request.urlopen("https://www.duquanben.com/").read()
    html = html.decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    reg = r'fn-left(.*?)subnav'
    html = re.findall(reg, html)
    for i in html:
        html = re.findall(r'book(.*?)/0/1/">(.*?)</a>', i)
        for sort_id, sort_name in html:
            getList(sort_id, sort_name)


def getList(sort_id, sort_name):  # layer 2: collect the book links for one category
    html = urllib.request.urlopen('https://www.duquanben.com/book%s/0/1/' % sort_id).read().decode('gbk')
    # print(html)
    reg = r'<li>.*?href="(.*?)" target=".*?">.*?</li>'
    urlList = re.findall(reg, html)
    for url in urlList:
        # print(urlList)
        Novel(url, sort_id, sort_name)


def Novel(url, sort_id, sort_name):  # layer 3: scrape one book's detail page
    html = urllib.request.urlopen(url).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '').replace(
        '<br />', '').replace('&nbsp;', '')
    # print(html)
    # the field patterns below are keyed to the duquanben.com book-page markup
    chapturl, bookname = re.findall(r'投票推荐.*?<a href="(.*?)" title="(.*?)">开始阅读</a>', html)[0]
    description = re.findall(r'内容简介.*?intro.*?>(.*?)</p>', html)[0]
    imgurl = re.findall(r'<img src="(.*?)".*?>', html)[0]
    status = re.findall(r'float:right.*?>(.*?)</span>', html)[0]
    author = re.findall(r'作者:(.*?) 最新章节', html)[0]
    # print(chapturl, bookname, description, imgurl, status, author)
    lastrowid = mysql.addnovel(sort_id, sort_name, bookname, imgurl, description, status, author)
    # lastrowid = 1  # debug override (would point every chapter at novel id 1)
    print(chapturl, bookname, status, author)
    print("*" * 100)
    chaptList(chapturl, sort_id, sort_name, lastrowid, bookname)


def chaptList(chapturl, sort_id, sort_name, lastrowid, bookname):  # layer 4: chapter links and titles
    html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    # print(html)
    reg = r'mulu_list(.*?)show_index3'
    chapthtml = re.findall(reg, html)
    for chapt in chapthtml:
        chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
        # print(chaptlist)
        for url1, chaptname in chaptlist:
            # print(bookname + "中的" + chaptname + "爬取结束")
            chaptcontent(url1, chapturl, lastrowid, chaptname)


def chaptcontent(url1, chapturl, lastrowid, chaptname):  # fetch one chapter body and store it
    url = chapturl + url1
    # print(url)
    html = urllib.request.urlopen(url).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    reg = r'class="contentbox">(.*?)</div>'
    content = re.findall(reg, html)[0].replace('<br />', '').replace('&nbsp;', '').replace('>', '').replace('<', '').replace(
        '[..]', '').replace('-a', '').replace('/a ', '')
    # print(content)
    mysql.addchapter(lastrowid, chaptname, content)


type()
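For a first run it is easier to crawl a single category rather than the whole site: temporarily comment out the final type() call above and call getList directly. A minimal sketch; the sort id '1' and the name '玄幻' are assumed example values, not read from the site (real ones come from the /book<sort_id>/0/1/ links that type() extracts):

# hedged test run: crawl just one category
# '1' and '玄幻' are assumed example values, not taken from the live page
getList('1', '玄幻')

Testing one category first makes it much easier to spot a regex that no longer matches the page markup before thousands of chapter rows are written.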

Two tables were created in a MySQL GUI tool:


    CREATE TABLE `chapter` (
      `id` int(20) NOT NULL AUTO_INCREMENT,
      `novelid` varchar(50) DEFAULT NULL,
      `title` varchar(50) DEFAULT NULL,
      `content` longtext,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=103 DEFAULT CHARSET=utf8;
    
    
    CREATE TABLE `novel` (
      `id` int(50) NOT NULL AUTO_INCREMENT,
      `booktype` varchar(20) DEFAULT NULL,
      `sortname` varchar(50) DEFAULT NULL,
      `name` varchar(50) DEFAULT NULL,
      `imgurl` varchar(50) DEFAULT NULL,
      `description` varchar(200) DEFAULT NULL,
      `status` varchar(50) DEFAULT NULL,
      `author` varchar(20) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8;
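
Since the scraped data is meant to feed a website, the site side mostly just reads these two tables back. A minimal read-path sketch, assuming the same local connection settings as the Sql class above and an example novel id of 1:

import pymysql

# hedged sketch: list one novel's chapters by joining the two tables above;
# connection settings mirror the Sql class, novel id 1 is an example value
db = pymysql.connect(host="localhost", port=3306, db="novel",
                     user="root", password="root", charset="utf8")
cur = db.cursor()
cur.execute(
    "select n.name, c.title from novel n "
    "join chapter c on c.novelid = n.id "
    "where n.id = %s order by c.id", (1,))
for name, title in cur.fetchall():
    print(name, title)
cur.close()
db.close()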
    
    
    

Learning resource: Bilibili (b站)

The crawl results are as follows:

![crawl results](https://upload-images.jianshu.io/upload_images/11616627-2e189951152a475d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/540)
