Building a Novel Website with a Python Crawler and the Django Framework
Third-party packages: MySQLdb, Django 1.10.8
Python version: 2.7
First, a Python crawler scrapes every novel's title, chapters, content, and so on from Quanshuwang (全书网, www.quanshuwang.com) into a MySQL database; then the scraped novels are displayed through the Django framework.
Create a database named novel, then create two tables with the following fields:
class Novelcopy(models.Model):
    novelid = models.AutoField(primary_key=True)
    sort = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=100, blank=True, null=True)
    novelintroduction = models.TextField(blank=True, null=True)
    toppicture = models.CharField(max_length=50, blank=True, null=True)
    author = models.CharField(max_length=50, blank=True, null=True)
    novelimge = models.CharField(max_length=255, blank=True, null=True)
class Chaptercopy(models.Model):
    chapterid = models.AutoField(primary_key=True)
    novelid = models.ForeignKey('Novelcopy', models.DO_NOTHING, db_column='novelid', blank=True, null=True)  # foreign key linked to novelid in 'Novelcopy'
    toppicture = models.CharField(max_length=100, blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    chaptername = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=200, blank=True, null=True)
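The two tables can be created directly in MySQL before the crawler runs. Below is a minimal sketch that creates them with MySQLdb; the column types simply mirror the fields listed above, and the credentials match the ones used later in the crawler, so adjust both to your own setup.

import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='root', passwd='123456', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS novel DEFAULT CHARACTER SET utf8")
cur.execute("USE novel")
cur.execute("""
    CREATE TABLE IF NOT EXISTS novelcopy (
        novelid INT AUTO_INCREMENT PRIMARY KEY,
        sort VARCHAR(100),
        novelname VARCHAR(100),
        novelintroduction TEXT,
        toppicture VARCHAR(50),
        author VARCHAR(50),
        novelimge VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
cur.execute("""
    CREATE TABLE IF NOT EXISTS chaptercopy (
        chapterid INT AUTO_INCREMENT PRIMARY KEY,
        novelid INT,
        toppicture VARCHAR(100),
        content TEXT,
        chaptername VARCHAR(100),
        novelname VARCHAR(200),
        FOREIGN KEY (novelid) REFERENCES novelcopy (novelid)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()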
1. First, write the Python crawler that scrapes every novel's title, chapters, and content from Quanshuwang into the MySQL database.
# -*- coding: utf-8 -*-
import urllib2
import re
import MySQLdb

class Sql(object):
    '''Database connection helper'''
    conn = MySQLdb.connect(
        # If the error you hit is encoding-related, try deleting the comments and rerunning
        # host="127.0.0.1",
        port=3306,            # port
        user='root',          # user name
        passwd='123456',      # password
        db='novel',           # database name
        charset='utf8')       # connection character set

    # Insert a novel's category, introduction, name, author, serialization status and cover image into the novelcopy table
    def AddNovelcopy(self, sort, novelintroduction, novelname, author, toppicture, novelimge):
        cur = self.conn.cursor()
        cur.execute("insert into novelcopy(sort,novelintroduction,novelname,author,toppicture,novelimge) values('%s','%s','%s','%s','%s','%s')" % (sort, novelintroduction, novelname, author, toppicture, novelimge))
        lastrowid = cur.lastrowid
        cur.close()
        self.conn.commit()
        return lastrowid

    # Insert the novel id, serialization status, content, chapter name and novel name into the chaptercopy table
    def AddChaptername(self, novelid, toppicture, content, chaptername, novelname):
        cur = self.conn.cursor()
        cur.execute("insert into chaptercopy(novelid,toppicture,content,chaptername,novelname) values('%s','%s','%s','%s','%s')" % (novelid, toppicture, content, chaptername, novelname))
        # lastrowid = cur.lastrowid
        cur.close()
        self.conn.commit()
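A small caveat on the two insert methods: building the SQL with '%s' string formatting breaks as soon as a title, introduction, or chapter body contains a quote character, and it leaves the code open to SQL injection. MySQLdb can escape the values itself when they are passed as a separate tuple. A sketch of a safer variant follows (the method name AddNovelcopySafe is only for illustration; it would sit inside the Sql class next to AddNovelcopy):

    def AddNovelcopySafe(self, sort, novelintroduction, novelname, author, toppicture, novelimge):
        # same insert as AddNovelcopy, but lets MySQLdb escape the values
        cur = self.conn.cursor()
        cur.execute(
            "insert into novelcopy(sort, novelintroduction, novelname, author, toppicture, novelimge) "
            "values (%s, %s, %s, %s, %s, %s)",
            (sort, novelintroduction, novelname, author, toppicture, novelimge))
        lastrowid = cur.lastrowid
        cur.close()
        self.conn.commit()
        return lastrowid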
# Request headers sent with every page fetch
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
def GetNovelname(pn=1):
    '''Return (link, name) pairs for every novel on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)  # build the request object
    req.headers = headers  # replace all headers at once
    # req.add_header()  # or add a single header
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')  # decode GBK, re-encode as UTF-8
    # note: the HTML tags inside these regex patterns were stripped by the blog's code display, so fill them in from the page source before running
    reg = r'(.*?)'  # pattern for each novel's link and title
    reg = re.compile(reg)  # pre-compile for efficiency; findall returns a list
    return re.findall(reg, html)  # every match of the capture groups
def GetImg(pn=1):
    '''Return the cover-image URLs for the novels on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r''  # pattern for the cover image src
    reg = re.compile(reg)
    return re.findall(reg, html)
def GetSort(pn=1):
    '''Return the category of each novel on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r'(.*?)'  # pattern for the category text
    reg = re.compile(reg)
    return re.findall(reg, html)
def Gettoppicture(pn=1):
    '''Return the serialization-status image path of each novel on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r''  # pattern for the status image src
    reg = re.compile(reg)
    return re.findall(reg, html)
def Get_Author(pn=1):
    '''Return the author of each novel on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r'(.*?)'  # pattern for the author name
    reg = re.compile(reg)
    return re.findall(reg, html)
def GetNovelIntroduction(pn=1):
    '''Return the introduction text of each novel on list page pn'''
    req = urllib2.Request('http://www.quanshuwang.com/list/%s_1.html' % pn)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r'(.*?).*?'  # pattern for the introduction text
    reg = re.compile(reg)
    return re.findall(reg, html)
def Get_Nothing(url):
    '''From a novel's detail page, return the link to its chapter-list page'''
    req = urllib2.Request(url)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r'.*?'  # pattern for the chapter-list link
    reg = re.compile(reg)
    return re.findall(reg, html)
def Get_Chaptername_Chapterhref(url):
    '''From the chapter-list page, return (chapter link, chapter name) pairs'''
    req = urllib2.Request(url)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r''  # pattern for each chapter's link and name
    reg = re.compile(reg)
    return re.findall(reg, html)
def Get_Content(url):
    '''Return the text of a single chapter page'''
    req = urllib2.Request(url)
    req.headers = headers
    res = urllib2.urlopen(req)
    html = res.read().decode('gbk').encode('utf8')
    reg = r'(.*?)'  # pattern for the chapter body
    reg = re.compile(reg)
    return re.findall(reg, html)
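All of the Get* functions above repeat the same request-and-decode boilerplate. A small helper can absorb that; the names fetch_html and find_all below are just illustrative, and the patterns passed in would be the same regular expressions used above.

def fetch_html(url):
    """Fetch a page, decode it from GBK and return UTF-8 bytes."""
    req = urllib2.Request(url)
    req.headers = headers
    res = urllib2.urlopen(req)
    return res.read().decode('gbk').encode('utf8')

def find_all(url, pattern):
    """Fetch url and return every match of pattern."""
    return re.findall(re.compile(pattern), fetch_html(url))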
mysql = Sql()  # instantiate the Sql helper class
# The commented-out lines in the main program are for testing; before a test run it is best to comment out the INSERT calls
if __name__ == "__main__":
    for i in range(5, 12):  # list pages 5 to 11
        for (novelhref, novelname), author, novelintroduction, novelimge in zip(GetNovelname(i), Get_Author(i), GetNovelIntroduction(i), GetImg(i)):
            for sort in GetSort(i):
                for toppicture in Gettoppicture(i):
                    if toppicture == '/kukuku/images/only2.png':
                        toppicture = '连载中'  # still being serialized
                    else:
                        toppicture = '完结'    # completed
                    # lastrowid = mysql.AddNovelcopy(novelname=novelname, novelintroduction=novelintroduction, sort=sort, author=author, toppicture=toppicture, novelimge=str(novelimge))
                    # print('-------------------------------------------------------------------------------------------')
                    # mysql.AddNovelcopy(novelname=novelname, novelintroduction=novelintroduction, sort=sort, author=author, toppicture=toppicture, novelimge=str(novelimge))
                    for url in Get_Nothing(novelhref):
                        # print('-------------------------------------url------------------------------------------------')
                        for chapterhref, chaptername in Get_Chaptername_Chapterhref(url):
                            con = ''
                            for content in Get_Content(chapterhref):
                                con = con + content
                            # print(con)
                            # mysql.AddChaptername(novelid=lastrowid, toppicture=toppicture, content=con, chaptername=chaptername, novelname=novelname)
If the program runs without errors, the crawl succeeded.
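One thing worth noting about the main program: the nested loops over GetSort(i) and Gettoppicture(i) repeat the per-novel work once for every category and status value on the page rather than once per novel. If the lists returned for a list page line up index for index (an assumption about the page layout), everything can be paired in a single zip, roughly like this:

if __name__ == "__main__":
    for i in range(5, 12):
        page = zip(GetNovelname(i), Get_Author(i), GetNovelIntroduction(i), GetImg(i),
                   GetSort(i), Gettoppicture(i))
        for (novelhref, novelname), author, novelintroduction, novelimge, sort, toppicture in page:
            toppicture = '连载中' if toppicture == '/kukuku/images/only2.png' else '完结'
            lastrowid = mysql.AddNovelcopy(novelname=novelname, novelintroduction=novelintroduction,
                                           sort=sort, author=author, toppicture=toppicture,
                                           novelimge=str(novelimge))
            for url in Get_Nothing(novelhref):
                for chapterhref, chaptername in Get_Chaptername_Chapterhref(url):
                    con = ''.join(Get_Content(chapterhref))
                    mysql.AddChaptername(novelid=lastrowid, toppicture=toppicture, content=con,
                                         chaptername=chaptername, novelname=novelname)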
2. Next comes the Django side. It is not practical to paste every file of the Django project, so only the main pieces are shown here.
(1) The main configuration in settings.py
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'novel.apps.NovelConfig',  # register the app you added
]
# Database configuration
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',  # MySQL backend
        'NAME': 'novel',        # database name
        'USER': 'root',         # user name
        'PASSWORD': '123456',   # password
        'HOST': '127.0.0.1',    # host
        'PORT': '3306',         # port
    }
}
(2) The main configuration in urls.py
from django.conf.urls import url
from django.contrib import admin
from novel import views
urlpatterns = [
    url(r'^admin/', admin.site.urls),  # Django's built-in admin interface
    url(r'^$', views.index),           # view for the novel site's home page
    url(r'^book_(?P<novelid>\d+)/$', views.content),                                      # a novel's detail page
    url(r'^book_(\d+)/kan_(?P<novelid>\d+)/$', views.contents),                           # a novel's full chapter list
    url(r'^book_(\d+)/kan_(?P<novelid>\d+)/(?P<chapterid>\d+)/$', views.chaptercontent),  # a single chapter's content
]
(3) The code in models.py
Open cmd, cd into your project directory, and run python2 manage.py inspectdb > novel/models.py (if python2 does not work, try python). This reads the novel database in MySQL and auto-generates the classes below in models.py. (If starting the server then throws an error, it is suggested to change False to True in the classes below.)
from __future__ import unicode_literals
from django.db import models
class Chaptercopy(models.Model):
    chapterid = models.AutoField(primary_key=True)
    novelid = models.ForeignKey('Novelcopy', models.DO_NOTHING, db_column='novelid', blank=True, null=True)
    toppicture = models.CharField(max_length=100, blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    chaptername = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=200, blank=True, null=True)

    class Meta:
        managed = True
        db_table = 'chaptercopy'

class DjangoMigrations(models.Model):
    app = models.CharField(max_length=255)
    name = models.CharField(max_length=255)
    applied = models.DateTimeField()

    class Meta:
        managed = False
        db_table = 'django_migrations'

class Novelcopy(models.Model):
    novelid = models.AutoField(primary_key=True)
    sort = models.CharField(max_length=100, blank=True, null=True)
    novelname = models.CharField(max_length=100, blank=True, null=True)
    novelintroduction = models.TextField(blank=True, null=True)
    toppicture = models.CharField(max_length=50, blank=True, null=True)
    author = models.CharField(max_length=50, blank=True, null=True)
    novelimge = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        managed = True
        db_table = 'novelcopy'
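Once these models are in place, a quick check from python manage.py shell confirms that Django can read the crawled data (chaptercopy_set is Django's default reverse accessor for the foreign key; this assumes the crawler has already filled the tables):

from novel.models import Novelcopy, Chaptercopy
print(Novelcopy.objects.count())            # number of novels crawled
novel = Novelcopy.objects.first()
print(novel.novelname)
print(novel.author)
print(novel.chaptercopy_set.count())        # chapters linked through the novelid foreign key
print(Chaptercopy.objects.filter(novelid=novel).count())  # the same query written explicitly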
(4) The code in views.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.shortcuts import render
from models import Novelcopy
from django.http import HttpResponse
from models import Chaptercopy
def index(request):  # view for the home page
    novels = Novelcopy.objects.order_by('?')[:12]  # pick 12 novels at random
    content = {
        'novels': novels[:6],
        'novelss': novels[6:]
    }
    return render(request, 'index.html', context=content)

def content(request, novelid):  # a novel's detail page with its first chapters
    novel = Novelcopy.objects.filter(novelid=novelid).first()
    chapters = Chaptercopy.objects.filter(novelid=novelid).all()
    chapterss = chapters[:8]
    content = {
        'novel': novel,
        'chpaterss': chapterss
    }
    return render(request, 'content.html', context=content)

def contents(request, novelid):  # a novel's full chapter list
    novelsss = Novelcopy.objects.filter(novelid=novelid).first()
    chapters = Chaptercopy.objects.filter(novelid=novelid).all()
    contents = {
        'chapters': chapters,
        'novels': novelsss
    }
    return render(request, 'chapter.html', context=contents)

def chaptercontent(request, chapterid, novelid):  # a single chapter's text
    chaptercontent = Chaptercopy.objects.filter(chapterid=chapterid).first()
    chapteraa = Novelcopy.objects.filter(novelid=novelid).first()
    content = {
        'chaptercontent': chaptercontent,
        'novels': chapteraa
    }
    return render(request, 'chaptercontent.html', context=content)
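With the views and URLconf in place, a quick smoke test from python manage.py shell can hit the pages without a browser (the /book_1/ path assumes the URL patterns shown earlier and that a novel with novelid 1 exists in the table):

from django.test import Client
c = Client()
print(c.get('/').status_code)          # home page; expect 200 once the templates exist
print(c.get('/book_1/').status_code)   # a novel's detail page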
That is roughly the whole thing. Of course, there were plenty of small issues along the way that are hard to show here, and fellow bloggers are welcome to discuss them together.