爬虫和django结合开发

            Python爬虫与django框架开发小说网站

第三方包:mysqldb,django1.10.8

 

Python版本2.7

 

先写python爬虫爬取全书网所有小说的书名,章节,内容等到mysql数据库中,然后再将python爬虫爬取的小说内容在django框架中显示。

 

建库novel,再建表

 

class Novelcopy(models.Model):
    """Schema of one novel record written by the crawler."""

    novelid = models.AutoField(primary_key=True)
    # Category of the novel.
    sort = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=100, blank=True, null=True)
    novelintroduction = models.TextField(blank=True, null=True)
    # Serialization status text (ongoing / finished), despite the field name.
    toppicture = models.CharField(max_length=50, blank=True, null=True)
    author = models.CharField(max_length=50, blank=True, null=True)
    # Cover image reference of the novel.
    novelimge = models.CharField(max_length=255, blank=True, null=True)

 

 

class Chaptercopy(models.Model):
    """Schema of one chapter record written by the crawler."""

    chapterid = models.AutoField(primary_key=True)
    # Foreign key to Novelcopy.novelid; stored in the ``novelid`` column.
    novelid = models.ForeignKey('Novelcopy', models.DO_NOTHING, db_column='novelid', blank=True, null=True)
    # Serialization status text (ongoing / finished), despite the field name.
    toppicture = models.CharField(max_length=100, blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    chaptername = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=200, blank=True, null=True)

 

1. 先写python爬虫爬取全书网所有小说的书名,章节,内容等到mysql数据库中。

import urllib2
import re
import MySQLdb
class Sql(object):
    """Helper that writes crawled novels and chapters into the ``novel`` database.

    The connection is opened once, when the class body is executed, and is
    shared by every instance through the ``conn`` class attribute.
    """

    # NOTE(review): credentials are hard-coded, and no host is passed (the
    # original had it commented out, misspelled as ``hots``), so the driver's
    # default host is used.
    conn = MySQLdb.connect(
        port=3306,        # server port
        user='root',      # user name
        passwd='123456',  # password
        db='novel',       # database name
        charset='utf8',   # connection character set
    )

    def AddNovelcopy(self, sort, novelintroduction, novelname, author, toppicture, novelimge):
        """Insert one novel into ``novelcopy`` and return the new row id.

        Args:
            sort: category of the novel.
            novelintroduction: introduction text.
            novelname: title.
            author: author name.
            toppicture: serialization status text.
            novelimge: cover image reference.

        Returns:
            The cursor's ``lastrowid`` after the insert.
        """
        cur = self.conn.cursor()
        try:
            # Let the driver quote the values: crawled text routinely contains
            # quote characters, which broke (and could inject into) the old
            # string-formatted statement.
            cur.execute(
                "insert into novelcopy(sort,novelintroduction,novelname,author,toppicture,novelimge) "
                "values(%s,%s,%s,%s,%s,%s)",
                (sort, novelintroduction, novelname, author, toppicture, novelimge)
            )
            lastrowid = cur.lastrowid
        finally:
            cur.close()
        self.conn.commit()
        return lastrowid

    def AddChaptername(self, novelid, toppicture, content, chaptername, novelname):
        """Insert one chapter into ``chaptercopy``.

        Args:
            novelid: id of the novel the chapter belongs to.
            toppicture: serialization status text.
            content: chapter body text.
            chaptername: chapter title.
            novelname: title of the novel.
        """
        cur = self.conn.cursor()
        try:
            # Parameterized for the same reason as in AddNovelcopy.
            cur.execute(
                "insert into chaptercopy(novelid,toppicture,content,chaptername,novelname) "
                "values(%s,%s,%s,%s,%s)",
                (novelid, toppicture, content, chaptername, novelname)
            )
        finally:
            cur.close()
        self.conn.commit()

 

 

# HTTP request headers used when fetching the novel site's pages.
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}

 

 

# NOTE(review): every regex literal in this section appears to have lost its
# HTML markup (tags and attributes) when the code was published; only the
# fragments that survived are kept below. Restore the real patterns from the
# target pages before running the crawler.

# Listing page of one category; ``%s`` is the page/category number ``pn``.
_LIST_PAGE_URL = 'http://www.quanshuwang.com/list/%s_1.html'


def _fetch_html(url):
    """Download *url* with the shared ``headers`` and return it UTF-8 encoded.

    The response body is decoded as GBK and re-encoded as UTF-8, exactly as
    each function used to do inline.
    """
    req = urllib2.Request(url)
    req.headers = headers  # replaces all request headers at once
    res = urllib2.urlopen(req)
    try:
        return res.read().decode('gbk').encode('utf8')
    finally:
        res.close()  # the original never closed the response


def GetNovelname(pn=1):
    """Return the novel-link matches found on listing page *pn* as a list.

    The main loop unpacks each match as ``(novelhref, novelname)``, so the
    real pattern presumably has two groups.
    """
    reg = re.compile(r'(.*?)')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def GetImg(pn=1):
    """Return the cover-image matches found on listing page *pn* as a list."""
    reg = re.compile(r'.*?')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def GetSort(pn=1):
    """Return the category matches found on listing page *pn* as a list."""
    reg = re.compile(r'(.*?)')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def Gettoppicture(pn=1):
    """Return the status-image matches found on listing page *pn* as a list.

    The main loop compares each match with ``'/kukuku/images/only2.png'``.
    """
    reg = re.compile(r'')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def Get_Author(pn=1):
    """Return the author matches found on listing page *pn* as a list."""
    reg = re.compile(r'(.*?)')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def GetNovelIntroduction(pn=1):
    """Return the introduction matches found on listing page *pn* as a list."""
    reg = re.compile(r'(.*?).*?')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(_LIST_PAGE_URL % pn))


def Get_Nothing(url):
    """Return the matches found on the novel page *url* as a list.

    The main loop passes each match to ``Get_Chaptername_Chapterhref`` as a
    URL, so this presumably extracts the link to the chapter index.
    """
    reg = re.compile(r'.*?')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(url))


def Get_Chaptername_Chapterhref(url):
    """Return the chapter-link matches found on the chapter index *url*.

    The main loop unpacks each match as ``(chapterhref, chaptername)``, so the
    real pattern presumably has two groups.
    """
    # The published literal was split over several lines around bullet
    # characters (apparently rendered list items), which is a syntax error;
    # only the surviving group is kept.
    reg = re.compile(r'(.*?)')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(url))


def Get_Content(url):
    """Return the chapter-text matches found on the chapter page *url*."""
    # The published literal was split across two lines (a syntax error); only
    # the surviving group is kept.
    reg = re.compile(r'(.*?)')  # TODO(review): restore stripped markup
    return reg.findall(_fetch_html(url))

     

    mysql = Sql()  # database helper used by the insert calls below

    # The insert calls are commented out so the crawl can be tried without
    # touching the database; uncomment them for a real run.
    if __name__ == "__main__":
        for i in range(5, 12):
            novel_rows = zip(GetNovelname(i), Get_Author(i), GetNovelIntroduction(i), GetImg(i))
            for (novelhref, novelname), author, novelintroduction, novelimge in novel_rows:
                # NOTE(review): these two loops keep only the last value of
                # ``sort`` / ``toppicture`` found on the page and re-fetch the
                # page for every novel — confirm this is intended.
                for sort in GetSort(i):
                    for toppicture in Gettoppicture(i):
                        if toppicture == '/kukuku/images/only2.png':
                            toppicture = '连载中'
                        else:
                            toppicture = '完结'

                # lastrowid = mysql.AddNovelcopy(novelname=novelname, novelintroduction=novelintroduction, sort=sort, author=author, toppicture=toppicture, novelimge=str(novelimge))

                for url in Get_Nothing(novelhref):
                    for chapterhref, chaptername in Get_Chaptername_Chapterhref(url):
                        # Join all matched fragments of the chapter in one pass.
                        con = ''.join(Get_Content(chapterhref))
                        # mysql.AddChaptername(novelid=lastrowid, toppicture=toppicture, content=con, chaptername=chaptername, novelname=novelname)

     

     

    如果程序运行没有报错则成功

     

    2. 接下来将写django框架,由于django不适合全部贴出来,所以只捡主要的东西贴出来。

    这是django框架所有的文件

     

     

     

    (1)settings.py 中的主要配置

     


    # Django's bundled apps first, followed by this project's own app.
    INSTALLED_APPS = [
        'django.contrib.admin',
        'django.contrib.auth',
        'django.contrib.contenttypes',
        'django.contrib.sessions',
        'django.contrib.messages',
        'django.contrib.staticfiles',
    ] + [
        'novel.apps.NovelConfig',  # the app added for the novel site
    ]

     

    # Database configuration: a single MySQL connection named 'default'.
    DATABASES = {
        'default': dict(
            ENGINE='django.db.backends.mysql',  # MySQL backend
            NAME='novel',        # database name
            USER='root',         # user name
            PASSWORD='123456',   # password
            HOST='127.0.0.1',    # local machine
            PORT='3306',         # port number
        ),
    }

     

     

    Urls.py中的主要配置

     

    from django.conf.urls import url
    from django.contrib import admin
    from novel import views

    # NOTE(review): the published patterns read ``(?P\d+)``, which is not a
    # valid regex; the ``<name>`` parts were evidently stripped as markup.
    # The names below are reconstructed from the view signatures
    # (``novelid`` / ``chapterid``) — confirm against the templates' links.
    urlpatterns = [
        # Django's built-in admin site.
        url(r'^admin/', admin.site.urls),
        # Home page of the novel site.
        url(r'^$', views.index),
        # Novel detail page; ``novelid`` comes from the clicked link.
        url(r'^book_(?P<novelid>\d+)/$', views.content, name='content'),
        # Chapter list of a novel. Only the named group is passed to the view;
        # Django ignores unnamed groups when a named group is present.
        url(r'^book_(\d+)/kan_(?P<novelid>\d+)/$', views.contents),
        # Single chapter page; passes both ``novelid`` and ``chapterid``.
        url(r'^book_(\d+)/kan_(?P<novelid>\d+)/xsc_(?P<chapterid>\d+)/$', views.chaptercontent),
    ]

     

     

    Models.py中的代码:

     

     

    打开cmd,cd到你的工程目录,执行下面的命令(python2 不行的话换成 python):

     

    python2 manage.py inspectdb > novel/models.py。该命令会把mysql中novel库的表结构导出到models.py文件

     

     

    models.py 中会自动生成以下类(如果启动服务报错,建议将下面类的 Meta 中的 managed = False 改成 True)

     

     

    from __future__ import unicode_literals

    from django.db import models


    class Chaptercopy(models.Model):
        """One chapter of a novel, mapped onto the ``chaptercopy`` table."""

        chapterid = models.AutoField(primary_key=True)
        # Link to the owning novel, stored in the ``novelid`` column.
        novelid = models.ForeignKey(
            'Novelcopy',
            on_delete=models.DO_NOTHING,
            db_column='novelid',
            blank=True,
            null=True,
        )
        toppicture = models.CharField(max_length=100, blank=True, null=True)
        content = models.TextField(blank=True, null=True)
        chaptername = models.CharField(max_length=100, blank=True, null=True)
        novelname = models.CharField(max_length=200, blank=True, null=True)

        class Meta:
            managed = True
            db_table = 'chaptercopy'


    class DjangoMigrations(models.Model):
        """Mapping of the ``django_migrations`` table; not managed by this app."""

        app = models.CharField(max_length=255)
        name = models.CharField(max_length=255)
        applied = models.DateTimeField()

        class Meta:
            # managed = False: Django will not create or alter this table.
            managed = False
            db_table = 'django_migrations'


    class Novelcopy(models.Model):
        """One novel, mapped onto the ``novelcopy`` table."""

        novelid = models.AutoField(primary_key=True)
        sort = models.CharField(max_length=100, blank=True, null=True)
        novelname = models.CharField(max_length=100, blank=True, null=True)
        novelintroduction = models.TextField(blank=True, null=True)
        toppicture = models.CharField(max_length=50, blank=True, null=True)
        author = models.CharField(max_length=50, blank=True, null=True)
        novelimge = models.CharField(max_length=255, blank=True, null=True)

        class Meta:
            managed = True
            db_table = 'novelcopy'

     

     

    Views.py中的配置

     

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    from django.shortcuts import render
    from models import Novelcopy
    from django.http import HttpResponse
    from models import Chaptercopy

    def index(request):
        """Render the home page with up to twelve randomly chosen novels.

        The novels are handed to ``index.html`` in two groups: ``novels``
        (the first six) and ``novelss`` (the rest).
        """
        # Evaluate the random query exactly once. Slicing the still-lazy
        # queryset twice would run two independent randomly-ordered queries,
        # so the same novel could show up in both groups.
        novels = list(Novelcopy.objects.order_by('?')[:12])
        content = {
            'novels': novels[:6],
            'novelss': novels[6:],
        }
        return render(request, 'index.html', context=content)


    def content(request, novelid):
        """Render ``content.html`` for one novel with up to eight of its chapters."""
        selected_novel = Novelcopy.objects.filter(novelid=novelid).first()
        preview_chapters = Chaptercopy.objects.filter(novelid=novelid)[:8]
        return render(
            request,
            'content.html',
            context={
                'novel': selected_novel,
                # Key spelling kept exactly as-is; presumably the template
                # reads it under this name — verify before renaming.
                'chpaterss': preview_chapters,
            },
        )



    def contents(request, novelid):
        """Render ``chapter.html`` with one novel and all of its chapters."""
        selected_novel = Novelcopy.objects.filter(novelid=novelid).first()
        novel_chapters = Chaptercopy.objects.filter(novelid=novelid)
        template_context = dict(chapters=novel_chapters, novels=selected_novel)
        return render(request, 'chapter.html', context=template_context)

    def chaptercontent(request, chapterid, novelid):
        """Render ``chaptercontent.html`` for one chapter and the novel it is shown under."""
        chapter = Chaptercopy.objects.filter(chapterid=chapterid).first()
        parent_novel = Novelcopy.objects.filter(novelid=novelid).first()
        return render(
            request,
            'chaptercontent.html',
            context={'chaptercontent': chapter, 'novels': parent_novel},
        )

     

    大致就是这样了。当然,在做的过程中还有很多不方便展示的小问题,欢迎各位博友一起探讨。

     

     

    你可能感兴趣的:(django,python)