# -*- codeing = utf-8 -*-
# @Time : 2022/1/11 22:39
# @Author : lj
# @File : spider.PY
# @Software: 4{PRODUCT_NAME}
import bs4 # 网页解析,获取数据 对网页的数据进行拆分
import re #正则表达式,进行文字匹配 对数据进行提炼
import urllib.request,urllib.error #指定url 获取网页数据 怕网页
import xlwt #进行excel 操作 存再excel中
import sqlite3 #进行sqllite 数据库操作 存在数据库中
import time
# 主函数
def main():
# 调用函数
url = "https://movie.douban.com/top250?start="
datalist1 = allData(url)
# savepath = "豆瓣电影top250.xls"
# savedata(datalist1,savepath)
dbpath = "move.db"
savedatasql(datalist1,dbpath)
#匹配所需内容的正则表达式
linkpattern = re.compile(r'')
#匹配图片的正则表达式
imagepattern = re.compile(r'',re.S)#re.S 忽略换行符,.表示除了换行符以外的所有字符
#匹配影名的正则表达式
namepattern = re.compile(r'(.*)')
# 影片评分
gradepattern = re.compile(r' ')
# 评价人数
peoplepattern = re.compile(r'(\d*)人评价')#(\d*) 零个或多个
#概况
thinkpattern = re.compile(r'(.*)')
#影片的相关内容
contentpattern = re.compile(r'(.*?)
',re.S)#忽略换行符
#1、爬取一个网页
def getData(url1):
head = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"}
request = urllib.request.Request(url1,headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
# print(html)
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
return html #返回给调用的地方
# 2、爬取所有网页,匹配分析
def allData(url):
datalist = []
for i in range(0, 10): # 左闭右开 调用十次,每次二十五条信息
url1 = url + str(i * 25)
html = getData(url1) #保存获取到的网页源码
time.sleep(1)
# 逐页解析
soup = bs4.BeautifulSoup(html,"html.parser") #返回树型结构
for item in soup.find_all('div',class_="item"): #查找符合要求的字符串,返回列表,class加下划线
data = []
item = str(item)
#link 获取到影片的超链接
link = re.findall(linkpattern,item)[0]
data.append(link)
# 影片图片
image = re.findall(imagepattern,item)[0]
data.append(image)
# 影片名
name = re.findall(namepattern,item)
if(len(name)==2):
chinaname = name[0]
data.append(chinaname)
outername = name[1].replace("/","")#.replace("/","") 列表内置的方法,将/替换为空""
data.append(outername)
else:
data.append(name[0])
data.append(' ')#外文名空出来
# 影片评分
grade = re.findall(gradepattern,item)[0]
data.append(grade)
# 影片评价人数
people = re.findall(peoplepattern, item)[0]
data.append(people)
# 影片概况
think = re.findall(thinkpattern, item)
if len(think) != 0:
think = think[0].replace("。","")
data.append(think)
else:
data.append(" ")
# 影片内容
content = re.findall(contentpattern, item)[0]
content = re.sub('
(\s+)?'," ",content)#替换内容中多余的符号和内容
content = re.sub('/'," ",content)
data.append(content.strip())#去除列表中的空格
datalist.append(data)
return datalist
#3、保存数据到excel
# def savedata(datalist1,savepath):
# workplace = xlwt.Workbook(encoding="utf-8",style_compression=0)#style_compression=0·压缩样式
# worksheet = workplace.add_sheet("豆瓣电影top250",cell_overwrite_ok="true")#cell_overwrite_ok=true 是否可以覆盖
# col = ('电影详情链接','电影图片链接','影片中文名','影片外文名','评分','评价人数','概况','影片内容')
# for i in range(0,8):
# worksheet.write(0,i,col[i])
# for i in range(0,250):
# print("打印了%d条" %(i+1))
# databuffer = datalist1[i]
# for j in range(0,8):
# worksheet.write(i+1,j,databuffer[j])
# workplace.save(savepath) #保存
#3、保存数据到数据库
def savedatasql(datalist1,dbpath):#dbpath 数据库的路径位置
init_db(dbpath)
conn = sqlite3.connect(dbpath)
cur = conn.cursor()#获取一个游标,存放sql语句的执行结果
#cur 获得了游标可存放执行结果的对象
#将datalist1中的数据依次遍历写入数据库
for data in datalist1:
for index in range(len(data)):
if index == 4 or index == 5:
continue
# data[index] = "'" + data[index].replace(u'\xa0','') + "'"
data[index] = data[index].replace("'", "")
data[index] = "'"+data[index]+"'"
sql = '''
insert into move250(
move_link,img_link,move_chinaname,move_foriername,grade,numbers,introduction,content)
values (%s)'''%",".join(data)
#print(sql)
cur.execute(sql)
conn.commit()
cur.close()
conn.close()
print("write successful")
#创建数据库
def init_db(dbpath):
sql = '''
create table move250
(
id integer primary key autoincrement,
move_link text,
img_link text,
move_chinaname varchar,
move_foriername varchar,
grade numeric,
numbers numeric,
introduction text,
content text
)
'''
c = sqlite3.connect(dbpath)
buffer = c.cursor() # 获取游标
buffer.execute(sql) # 内置的方法执行sql语句
c.commit() # 数据提交,写入数据库
c.close() # 数据库关闭
print("database create successful")
if __name__ == "__main__":
#调用函数
main()
print("爬取完毕!")
import sqlite3
from flask import Flask,render_template
app = Flask(__name__)
@app.route('/')
def first(): # put application's code here
return render_template("index-1.html")
# 每一个函数对应一个路由解析
@app.route('/index')
def first1():
return render_template('index-1.html')
@app.route('/movie')
def mv():
datalist = []
con = sqlite3.Connection('move.db')
cursor = con.cursor()
sql = "select * from move250"
data = cursor.execute(sql)
for items in data:
datalist.append(items)
cursor.close()
con.close()
return render_template('about.html',mmm = datalist)
@app.route('/score')
def sc():
x = []
y = []
con = sqlite3.Connection('move.db')
cursor = con.cursor()
sql = "select grade,count(grade) from move250 group by grade"
data1 = cursor.execute(sql)
for ite in data1:
x.append(ite[0])
y.append(ite[1])
cursor.close()
con.close()
return render_template('services.html',score = x,number = y)
@app.route('/word')
def wd():
return render_template('projects-details.html')
@app.route('/team')
def te():
return render_template('team.html')
if __name__ == '__main__':
app.run()
DOCTYPE html>
<html lang="en">
<head>
<style>
#container{
border: 2px solid red;
height: 1000px;
width: 1200px;
top: 50px;
left: 100px;
}
style>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="Buskey - Corporate Business Template">
<title>Buskey - Corporate Business Templatetitle>
<link href="../static/css/plugins.min.css" rel="stylesheet">
<link href="../static/css/flaticon-business-set.css" rel="stylesheet">
<link href="../static/css/style.css" rel="stylesheet">
<link href="../static/css/responsive.css" rel="stylesheet">
<link href="../static/css/css.css" rel="stylesheet">
<link href="../static/css/css1.css" rel="stylesheet">
head>
<body>
<div class="se-pre-con">div>
<div class="top-bar-area bg-theme text-light">
<div class="container">
<div class="row">
<div class="col-md-9">
<div class="info box">
<ul>
<li>
<div class="icon">
<i class="fas fa-map-marker-alt">i>
div>
<div class="info">
<p>
china
p>
div>
li>
<li>
<div class="icon">
<i class="fas fa-envelope-open">i>
div>
<div class="info">
<p>
[email protected]
p>
div>
li>
<li>
<div class="icon">
<i class="fas fa-mobile-alt">i>
div>
<div class="info">
<p>
+123 456 7890
p>
div>
li>
ul>
div>
div>
<div class="topbar-social col-md-3">
div>
div>
div>
div>
<header>
<nav class="navbar navbar-default navbar-sticky bootsnav">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-menu">
<i class="fa fa-bars">i>
button>
<a class="navbar-brand" href="index.html">
<img src="../static/picture/logo-light.png" class="logo logo-display" alt="Logo">
a>
div>
<div class="collapse navbar-collapse" id="navbar-menu">
<ul class="nav navbar-nav navbar-right" data-in="#" data-out="#">
<li class="dropdown">
<a href="index-1.html" class="dropdown-toggle active" data-toggle="dropdown">首页a>
li>
<li>
<a href="about.html">电影列表a>
li>
<li>
<a href="">评分a>
li>
<li class="dropdown">
<a href="projects-details.html" class="dropdown-toggle" data-toggle="dropdown">词云a>
li>
<li class="dropdown">
<a href="team.html" class="dropdown-toggle" data-toggle="dropdown">团队a>
li>
ul>
div>
div>
nav>
header>
<div class="breadcrumb-area shadow dark bg-fixed text-center padding-xl text-light" style="background-image: url(../static/image/21.jpg);">
<div class="container">
<div class="row">
<div class="col-md-6 col-sm-6 text-left">
<h1>影线评分h1>
div>
<div class="col-md-6 col-sm-6 text-right">
div>
div>
div>
div>
<div class="fun-factor-area default-padding text-center bg-fixed shadow theme-hard parallax parralax-shadow" data-parallax="scroll" style="background-image: url(../static/image/12.jpg);">
<div class="container">
<div class="row">
<div class="col-md-12">
<a href="about.html">
<div class="col-md-3 col-sm-6 item">
<div class="fun-fact">
<i class="flaticon-world-map">i>
<div class="timer" data-to="250" data-speed="5000">div>
<span class="medium">经典电影span>
div>
div>
a>
<a href="services.html">
<div class="col-md-3 col-sm-6 item">
<div class="fun-fact">
<i class="flaticon-gears">i>
<div class="timer" data-to="1" data-speed="5000">div>
<span class="medium">评分统计span>
div>
div>
a>
<a href="projects-details.html">
<div class="col-md-3 col-sm-6 item">
<div class="fun-fact">
<i class="flaticon-id-card">i>
<div class="timer" data-to="5693" data-speed="5000">div>
<span class="medium">高频词汇span>
div>
div>
a>
<a href="team.html ">
<div class="col-md-3 col-sm-6 item">
<div class="fun-fact">
<i class="flaticon-id">i>
<div class="timer" data-to="5" data-speed="5000">div>
<span class="medium">专业团队span>
div>
div>
a>
div>
div>
div>
div>
<div class="carousel-services-area bg-gray">
<div class="container-box oh">
<div class="carousel-service-items owl-carousel owl-theme">
<div id="container" >div>
div>
div>
div>
<script src="../static/js/plugins.min.js">script>
<script src="../static/js/main.js">script>
<script type="text/javascript" src="echarts.min.js">script>
<script type="text/javascript">
var dom = document.getElementById("container");
var myChart = echarts.init(dom);
var app = {};
var option;
option = {
xAxis: {
type: 'category',
data: [8.3,8.4,8.5,8.6,8.7,8.8,8.9,9,9.1,9.2,9.3,9.4,9.5,9.6,9.7]
},
yAxis: {
type: 'value'
},
series: [
{
data: [1,3,9,23,40,40,39,15,28,17,20,7,4,3,1,],
type: 'bar',
showBackground: true,
backgroundStyle: {
color: 'rgba(180, 180, 180, 0.2)'
}
}
]
};
option && myChart.setOption(option);
script>
body>
html>
# -*- codeing = utf-8 -*-
# @Time : 2022/2/2 12:53
# @Author : lj
# @File : testcloud.PY
# @Software: 4{PRODUCT_NAME}
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import sqlite3
# 主要的流程,1.jieba提取文字,2.生成词云
#准备好数据
con = sqlite3.connect('move.db')
cur = con.cursor()
sql = 'select introduction from move250'
data = cur.execute(sql)
text = ""
for item in data:
text = text + item[0]
# print(text)
cur.close()
con.close()
#文本分词分句
cut = jieba.cut(text)
r = ' '.join(cut)
print(len(r))
# 处理图片的
img = Image.open(r'.\templates\TEST\img\img/ll.jpg')
img_arrays = np.array(img) #将图片转换为数组
#词云库封装一个对象
wc =WordCloud(
background_color='white', #输出图片的颜色
mask=img_arrays, #导入处理好的图片
font_path = 'STXINWEI.TTF' #字体的文件
)
#词云对象处理已经分好的词
wc.generate(r)
#绘制图片,plt matplotlib的库别名
fig = plt.figure(1) #matplotlib库figure方法可绘图
plt.imshow(wc) #按照wc的规则显示图片
plt.axis('off') #不显示x轴
# plt.show() #生成的词云图片
#输出词云图片到文件中,设置清晰度
plt.savefig(r'.\templates\TEST\img\img/wordcloud2.jpg',dpi=800)