爬虫实战:bilibili番剧排名爬取并数据可视化

爬取bilibili国创动漫按追番人数排行的番剧:先从排行接口获取番剧信息,再进入各番剧的播放页获取简介,然后将数据保存到Excel和SQLite数据库,最后通过网页将数据可视化。

1、爬取网页、数据解析、数据保存

from bs4 import BeautifulSoup
import re
import xlwt
import urllib.request,urllib.error
import sqlite3
from io import BytesIO
import gzip
import time
import json
import os
import xlrd
from xlutils.copy import copy
import string

# Crawler entry point: fetches the ranking data from the JSON API and stores it
def main():
    """Crawl the ranking API, then save the rows to Excel and to SQLite.

    The output locations are fixed: ``bili国漫.xls`` for the spreadsheet and
    ``cartoon.db`` for the database, both relative to the working directory.
    """
    # API URL of the first ranking page; GetData rewrites "page=1" per page.
    # The query parameter is "copyright": the text previously read "©right",
    # which is what "&copyright" turns into when "&copy" is decoded as an
    # HTML entity, and that corrupted the query string.
    baseurl = "https://api.bilibili.com/pgc/season/index/result?st=4&order=3&season_version=-1&is_finish=-1&copyright=-1&season_status=-1&year=-1&style_id=-1&sort=0&page=1&season_type=4&pagesize=20&type=1"
    # 1. Crawl and parse the pages
    datalist = GetData(baseurl)
    savepath = "bili国漫.xls"  # Excel output path
    dbpath = "cartoon.db"  # SQLite database path

    # 2. Persist the data
    SavaData(datalist,savepath)
    SaveDataDb(datalist,dbpath)


# Pattern for the synopsis: captures the inner content of a <span> element.
# re.S lets "." match newlines so a multi-line synopsis is captured whole.
# NOTE(review): the pattern used to be just "(.*?)", which always yields an
# empty first match; the surrounding tags were presumably lost. It is applied
# to str() of a <span class="absolute"> element, so the delimiters are
# reconstructed as an opening <span ...> tag and the closing </span> - confirm.
findintr = re.compile(r'<span[^>]*>(.*?)</span>', re.S)

# Crawl every ranking page and parse it into rows
def GetData(baseurl):
    """Fetch 35 ranking pages and return one row per series.

    Args:
        baseurl: API URL containing ``page=1``; that fragment is replaced with
            the page number being requested.

    Returns:
        A list of rows. Each row is a list of exactly six strings:
        title, follower count, update status, cover URL, series URL, synopsis.
        The synopsis is "" when the series page yields none, so every row
        matches the six columns the Excel/DB writers expect.
    """
    datalist = []
    fields = ("title", "order", "index_show", "cover", "link")
    for page in range(1, 36):  # pages 1..35
        url = baseurl.replace("page=1", "page=" + str(page))
        jsonbili = AskUrl(url)  # raw JSON text, or "" when the request failed

        # Cut the "list" array out of the JSON text
        datafind = re.findall(r"\"list\":(.+?),\"num\"",str(jsonbili))
        if not datafind:
            # Failed request or unexpected payload: skip this page instead of
            # crashing on datafind[0].
            print("第%s页未获取到数据,已跳过" % page)
            continue
        try:
            jsondata = json.loads(datafind[0])  # list of dicts, one per series
        except json.JSONDecodeError:
            print("第%s页数据解析失败,已跳过" % page)
            continue

        for item in jsondata:
            data = [str(item[key]).strip() for key in fields]

            # Follow the series link to pick up the synopsis
            html = AskLinkUrl(item["link"])
            soup = BeautifulSoup(html, "html.parser")
            introduction = ""
            for span in soup.find_all("span", class_="absolute"):
                found = re.findall(findintr, str(span))
                if found:
                    # Keep only the first match so the row length stays fixed
                    introduction = found[0].strip()
                    break
            data.append(introduction)

            datalist.append(data)
    print(datalist)
    return  datalist



# Fetch the content of a URL whose response body is not compressed
def AskUrl(url):
    """Return the body of ``url`` decoded as UTF-8.

    On ``urllib.error.URLError`` the error code and/or reason are printed and
    an empty string is returned.
    """
    # Send a desktop Chrome User-Agent with the request
    head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    request = urllib.request.Request(url,headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # HTTP status code, when present
            print(e.code)
        if hasattr(e, "reason"):  # failure reason
            print(e.reason)

    return html


# Fetch the content of a URL whose response body may be gzip-compressed
def AskLinkUrl(url):
    """Return the body of ``url`` decoded as UTF-8, gunzipping it if needed.

    The body is decompressed only when it starts with the gzip magic bytes, so
    an uncompressed response is decoded directly instead of raising.
    On ``urllib.error.URLError`` the error code and/or reason are printed and
    an empty string is returned.
    """
    # Send a desktop Chrome User-Agent with the request
    head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    request = urllib.request.Request(url,headers=head)
    html = ""
    try:
        # The context manager closes the response even if reading fails
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        if raw[:2] == b"\x1f\x8b":  # gzip magic number
            raw = gzip.decompress(raw)
        html = raw.decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # HTTP status code, when present
            print(e.code)
        if hasattr(e, "reason"):  # failure reason
            print(e.reason)

    return html


# Fit each column width to its widest cell
def Auto_Type(datalist,sheet):
    """Set ``sheet.col(i).width`` for every column of ``datalist``.

    The column count is taken from the first row. A cell's display width
    counts 2 for each character in U+4E00..U+9FFF and for each space, and 1
    for any other character. The column width is ``256 * (widest + 1)``,
    capped at 65535.

    Args:
        datalist: list of rows; cells are converted with ``str`` before
            measuring. Rows shorter than the first row are skipped for the
            columns they lack. An empty list is a no-op.
        sheet: object exposing ``col(index)`` whose result has a writable
            ``width`` attribute.
    """
    if not datalist:
        return

    for col_index in range(len(datalist[0])):
        widest = 0
        for row in datalist:
            if col_index >= len(row):
                continue  # ragged row: nothing to measure in this column
            cell_width = sum(
                2 if (0x4e00 <= ord(char) <= 0x9fff or ord(char) == 0x0020) else 1
                for char in str(row[col_index])
            )
            if cell_width > widest:
                widest = cell_width
        sheet.col(col_index).width = min(256 * (widest + 1), 65535)


# Save the rows to an Excel workbook
def SavaData(datalist,savepath):
    """Write a header row plus one row per series to ``savepath``.

    When ``savepath`` does not exist a new workbook with a "bili国漫" sheet is
    created and its column widths are fitted with ``Auto_Type``; otherwise the
    existing workbook is copied and its first sheet is written to, starting
    again from the header row. An empty ``datalist`` is a no-op.

    Args:
        datalist: list of rows (title, follower count, status, cover URL,
            series URL, synopsis). Cells beyond the six known columns are
            ignored; shorter rows leave the remaining cells unwritten.
        savepath: path of the .xls file.
    """
    if not datalist:
        return  # nothing to write

    if not os.path.isfile(savepath):
        book = xlwt.Workbook(encoding="utf-8")  # new workbook
        sheet = book.add_sheet("bili国漫")  # new sheet
        Auto_Type(datalist, sheet)  # fit column widths
        print("表格创建成功\n")
    else:
        rb = xlrd.open_workbook(savepath,formatting_info=True)
        book = copy(rb)  # writable copy of the existing workbook
        sheet = book.get_sheet(0)
        print("表格打开成功\n")

    # The fifth column holds the series link; its header previously repeated
    # the title header.
    col = ["番剧名称","追番人数","更新状态","封面链接","番剧链接","番剧简介"]
    for j, header in enumerate(col):
        sheet.write(0,j,header)  # header row
    for i, data in enumerate(datalist):
        print("正在写入第%s条"%(i+1))
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i+1,j,value)

    book.save(savepath)


# Store the rows in the SQLite database
def SaveDataDb(datalist,dbpath):
    """Insert every row of ``datalist`` into the ``bilicartoon`` table.

    The table is created first if needed (``Init_Db``). Values are bound with
    placeholders, so quotes inside titles or synopses cannot break or alter
    the statement, and the caller's rows are not modified. All rows are
    committed together after the last insert.

    Args:
        datalist: list of rows in column order name, number, status, image,
            link, introduction. Rows are padded with "" or truncated to six
            values.
        dbpath: path of the SQLite database file.
    """
    Init_Db(dbpath)  # make sure the table exists
    sql = '''
        insert into bilicartoon(
        name, number, status, image, link, introduction)
        values(?, ?, ?, ?, ?, ?)'''
    con = sqlite3.connect(dbpath)
    try:
        cur = con.cursor()
        for data in datalist:
            row = (list(data) + [""] * 6)[:6]  # exactly six bound values
            print(row)
            cur.execute(sql, row)
        con.commit()
        cur.close()
    finally:
        con.close()  # closed even when an insert fails


# Initialise the database
def Init_Db(dbpath):
    """Create the ``bilicartoon`` table in ``dbpath`` if it does not exist.

    The table has an autoincrement integer ``id`` plus the columns name,
    number, status, image, link and introduction. Calling this on a database
    that already has the table changes nothing. A confirmation message is
    printed on success.
    """
    sql = '''
        create table if not exists bilicartoon(
        id integer primary key autoincrement,
        name text,
        number int,
        status text,
        image text,
        link text,
        introduction text
        )
    '''
    connect = sqlite3.connect(dbpath)
    try:
        cursor = connect.cursor()
        cursor.execute(sql)
        connect.commit()
    finally:
        connect.close()  # closed even when the statement fails

    print("创建/打开数据库成功\n")


# Script entry point
if __name__ == "__main__":
    # Run the crawl-and-save pipeline, then report completion
    main()
    print("爬取完毕")

注意:
①追番人数排行页面是动态网页,数据由页面异步加载,因此直接请求其JSON接口来获取数据。
②各番剧的播放页可以直接请求HTML并解析;上面的代码按gzip压缩响应处理,需先解压再解码。
③保存到Excel时最好设置自适应的表格排版。

2、数据可视化

路由设置:

from flask import Flask,render_template
import os.path
import sqlite3

# Flask application object; the route decorators in this script register views on it
app = Flask(__name__)


@app.route('/')
def index():
    """Serve the site root by rendering the ``index.html`` template."""
    return render_template("index.html")

@app.route('/index.html')
def index1():
    """Serve ``/index.html`` with the same ``index.html`` template as the root."""
    return render_template("index.html")

@app.route('/fan.html')
def fan():
    """Render ``fan.html`` with every row of the ``bilicartoon`` table.

    The database is looked up at ``analysis/cartoon.db`` relative to this
    file's directory. The rows are passed to the template as ``fans``, a list
    of tuples in table column order.
    """
    # Resolve the database relative to this file so the lookup does not
    # depend on the current working directory
    dir_path = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(dir_path, "analysis/cartoon.db")
    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()
        try:
            datalist = cur.execute("select * from bilicartoon").fetchall()
        finally:
            cur.close()
    finally:
        con.close()  # closed even when the query fails
    return render_template("fan.html",fans = datalist)

@app.route('/data.html')
def data():
    """Serve ``/data.html`` by rendering the ``data.html`` template."""
    return render_template("data.html")

@app.route('/contact.html')
def contact():
    """Serve ``/contact.html`` by rendering the ``contact.html`` template."""
    return render_template("contact.html")

if __name__ == '__main__':
    # Start the Flask server with debug mode enabled when this file is run directly
    app.run(debug = True)

词云图制作:

# Build a word cloud from the synopses stored in the database.
# NOTE(review): this snippet uses sqlite3, jieba, Image, np, WordCloud and plt
# but shows no import lines; presumably they are imported above it - confirm.

# Load every synopsis
con = sqlite3.connect('analysis/cartoon.db')
try:
    cur = con.cursor()
    rows = cur.execute('select introduction from bilicartoon').fetchall()
    cur.close()
finally:
    con.close()  # closed even when the query fails
# Join once instead of growing the string row by row; NULL/empty synopses
# contribute nothing
text = "".join(row[0] for row in rows if row[0])
# print(text)

# Tokenise
words = list(jieba.cut(text))
string = ' '.join(words)
print(len(words))  # number of tokens (previously the character count of the joined string)

# Mask image
img = Image.open('cloud3.jpg')  # image with a white background
img_array = np.array(img)  # image as an array, used as the cloud mask
wc = WordCloud(
    background_color='white',
    mask=img_array,
    font_path="simhei.ttf"  # font file; on Windows fonts live in C:\Windows\Fonts
)
wc.generate_from_text(string)

# Plot
fig = plt.figure(1)
plt.imshow(wc)
plt.axis('off')  # hide the axes
# plt.show()  # display the word cloud
plt.savefig('word3.jpg',dpi=1000)  # write the word cloud image to disk

首页:

<!DOCTYPE HTML>
<html lang="zh-CN">
	<head>
		<title></title>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
		<link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
		<link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
        <style>
        </style>
	</head>
	<body>
		<nav class="navbar navbar-default">
			<div class="container">
				<a class="logo pull-left" href="#">
					<h1>爬虫<span>实例</span></h1>
				</a>
				<div class="navbar-header">
					<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false">
				        <span class="sr-only">Toggle navigation</span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				   </button>
				</div>
				<div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
					<ul class="nav navbar-nav navbar-right">
						<li class="navBg"></li>
						<li class="active">
							<a href="index.html">网站首页</a>
						</li>
						<li>
							<a href="fan.html">国创番剧</a>
						</li>
                        <li>
                            <a href="data.html">数据统计</a>
                        </li>
						<li>
							<a href="contact.html">联系作者</a>
						</li>
					</ul>
				</div>
			</div>
		</nav>
		<!--space-->
		<div class="space"></div>
		<!--honor-->
		<div class="honor padT80 padB80 greyBg">
			<div class="container">
				<section class="title">
					<h2 style="font-size: 40px">可视化内容</h2>
				</section>
				<div class="row padT80">
					<div class="col-sm-4">
						<div class="honty">
							<div>
								<div class="ty"><span>A</span></div>
								<div class="tycon">
									<h3 style="font-size: 22px">国创番剧名单</h3>
									<p>FAN PLAY NAME</p>
								</div>
							</div>
                            <p style="font-size: 18px">
								为用户展示bilibili追番人数排行榜的所有番剧信息~
							</p>
						</div>
					</div>	
					<div class="col-sm-4">
						<div class="honty">
							<div>
								<div class="ty"><span>B</span></div>
								<div class="tycon">
									<h3 style="font-size: 22px">数据分析展示</h3>
									<p>DATA ANALYSIS</p>
								</div>
							</div>
                            <p style="font-size: 18px">
								为用户展示番剧的词云图~
							</p>
						</div>
					</div>
					<div class="col-sm-4">
						<div class="honty">
							<div>
								<div class="ty"><span>C</span></div>
								<div class="tycon">
									<h3 style="font-size: 22px">联系作者</h3>
									<p>CONTACT THE AUTHOR</p>
								</div>
							</div>
                            <p style="text-align: center;font-size: 18px">
								邮件联系作者~
							</p>
						</div>
					</div>
				</div>
			</div>
		</div>
		<footer>
			<nav>
				<ul>
					<li><a href="index.html">网站首页</a></li>
					<li><a href="fan.html">国创番剧</a></li>
					<li><a href="data.html">数据统计</a></li>
					<li><a href="contact.html">联系作者</a></li>
				</ul>
			</nav>
			<p>
				<span class="glyphicon glyphicon-phone-alt"></span>123-12345678
				<span class="glyphicon glyphicon-earphone"></span>12345678900
				<span class="glyphicon glyphicon-envelope"></span>123456@123.com
			</p>
		</footer>
		<script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
	</body>
</html>

番剧页:

<!DOCTYPE HTML>
<html lang="zh-CN">
	<head>
		<title></title>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
		<link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
		<link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
	    <style type="text/css">
        h2{
            font-size: 40px;}
        table {
            width: 100%;
	        font-family: verdana,arial,sans-serif;
	        color:#333333;
	        border-width: 1px;
	        border-color: #999999;
	        border-collapse: collapse;  }
        th {
            font-size: 18px;
	        background-color:#c3dde0;
	        border-width: 1px;
	        padding: 8px;
	        border-style: solid;
	        border-color: #a9c6c9;  }
        tr {
	        background-color:#d4e3e5;  }
        td {
            font-size: 15px;
	        border-width: 1px;
	        padding: 8px;
	        border-style: solid;
	        border-color: #a9c6c9;  }
        </style>
    </head>
	<body>
		<nav class="navbar navbar-default">
			<div class="container">
				<a class="logo pull-left" href="#">
					<h1>爬虫<span>实例</span></h1>
				</a>
				<div class="navbar-header">
					<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false">
				        <span class="sr-only">Toggle navigation</span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				   </button>
				</div>
				<div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
					<ul class="nav navbar-nav navbar-right">
						<li class="navBg"></li>
						<li class="active">
							<a href="index.html">网站首页</a>
						</li>
						<li>
							<a href="fan.html">国创番剧</a>
						</li>
                        <li>
                            <a href="data.html">数据统计</a>
                        </li>
						<li>
							<a href="contact.html">联系作者</a>
						</li>
					</ul>
				</div>
			</div>
		</nav>
		<!--space-->
		<div class="space"></div>
		<!--honor-->
		<div class="honor padT80 padB80 greyBg">
			<div class="container">
            <section class="title">
					<h2>番剧信息</h2>
            </section>
            <table ><!--列表,样式为bootstrap-->
                <tr><!------>
                  <th>人气排名</th>
                  <th>番剧名称</th>
                  <th>追番人数</th>
                  <th>更新状态</th>
              </tr>
              <!--显示信息-->
              {% for fan in fans %}
                 <tr onmouseover="this.style.backgroundColor='#C8C8C8 ';" onmouseout="this.style.backgroundColor='#d4e3e5';">
                  <td>{{ fan[0] }}</td>
                  <td>
                      <!-- opens the series link in a new tab; rel keeps the new page from reaching window.opener -->
                      <a href="{{ fan[5] }}" target="_blank" rel="noopener noreferrer">
                      {{ fan[1] }}
                      </a>
                  </td>
                  <td>{{ fan[2] }}</td>
                  <td>{{ fan[3] }}</td>
              </tr>
              {% endfor %}
            </table>
			</div>
		</div>
		<footer>
			<nav>
				<ul>
					<li><a href="index.html">网站首页</a></li>
					<li><a href="fan.html">国创番剧</a></li>
					<li><a href="data.html">数据统计</a></li>
					<li><a href="contact.html">联系作者</a></li>
				</ul>
			</nav>
			<p>
				<span class="glyphicon glyphicon-phone-alt"></span>123-12345678
				<span class="glyphicon glyphicon-earphone"></span>12345678900
				<span class="glyphicon glyphicon-envelope"></span>123456@123.com
			</p>
		</footer>
		<script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
	</body>
</html>

词云页:

<!DOCTYPE HTML>
<html lang="zh-CN">
	<head>
		<title></title>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
		<link rel="stylesheet" type="text/css" href="static/css/bootstrap.min.css"/>
		<link rel="stylesheet" type="text/css" href="static/css/main.css"/>
	</head>
	<body>
		<nav class="navbar navbar-default">
			<div class="container">
				<a class="logo pull-left" href="#">
					<h1>爬虫<span>实例</span></h1>
				</a>
				<div class="navbar-header">
					<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false">
				        <span class="sr-only">Toggle navigation</span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				   </button>
				</div>
				<div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
					<ul class="nav navbar-nav navbar-right">
						<li class="navBg"></li>
						<li class="active">
							<a href="index.html">网站首页</a>
						</li>
						<li>
							<a href="fan.html">国创番剧</a>
						</li>
                        <li>
                            <a href="data.html">数据统计</a>
                        </li>
						<li>
							<a href="contact.html">联系作者</a>
						</li>
					</ul>
				</div>
			</div>
		</nav>
		<!--space-->
		<div class="space"></div>
		<!--case-->
		<div class="case padT80 padB80">
			<div class="container">
				<section class="title">
					<h2>葫芦娃词云</h2>
				</section>
				<ul class="row padT80">
					<li class="col-sm-4 col-xs-6">
                        <img src="../static/images/case/word1.jpg" alt="词云图1" height="350" width="300"/>
					</li>
					<li class="col-sm-4 col-xs-6">
                        <img src="../static/images/case/word2.jpg" alt="词云图2" height="350" width="300"/>
					</li>
					<li class="col-sm-4 col-xs-6">
                        <img src="../static/images/case/word3.jpg" alt="词云图3" height="350" width="300"/>
					</li>
				</ul>
			</div>
		</div>
		<footer>
			<nav>
				<ul>
					<li><a href="index.html">网站首页</a></li>
					<li><a href="fan.html">国创番剧</a></li>
					<li><a href="data.html">数据统计</a></li>
					<li><a href="contact.html">联系作者</a></li>
				</ul>
			</nav>
			<p>
				<span class="glyphicon glyphicon-phone-alt"></span>123-12345678
				<span class="glyphicon glyphicon-earphone"></span>12345678900
				<span class="glyphicon glyphicon-envelope"></span>123456@123.com
			</p>
		</footer>
		<script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
	</body>
</html>

联系页:

<!DOCTYPE HTML>
<html lang="zh-CN">
	<head>
		<title></title>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
		<link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
		<link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
	</head>
	<body>
		<nav class="navbar navbar-default">
			<div class="container">
				<a class="logo pull-left" href="#">
					<h1>爬虫<span>实例</span></h1>
				</a>
				<div class="navbar-header">
					<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false">
				        <span class="sr-only">Toggle navigation</span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				        <span class="icon-bar"></span>
				   </button>
				</div>
				<div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
					<ul class="nav navbar-nav navbar-right">
						<li class="navBg"></li>
						<li class="active">
							<a href="index.html">网站首页</a>
						</li>
						<li>
							<a href="fan.html">国创番剧</a>
						</li>
                        <li>
                            <a href="data.html">数据统计</a>
                        </li>
						<li>
							<a href="contact.html">联系作者</a>
						</li>
					</ul>
				</div>
			</div>
		</nav>
		<!--space-->
		<div class="space"></div>
		<div class="conPg">
			<div class="container padT80">
				<div class="address row padT80 padB80">
					<div class="col-sm-6 col-xs-12">
						<section class="title">
							<h2>联系作者</h2>
						</section>
						<ul class="padT80">
							<li><span class="glyphicon glyphicon-phone-alt"></span>座机号码:123-123455678</li>
							<li><span class="glyphicon glyphicon-map-marker"></span>作者地址:xx省xx市xx大学</li>
							<li><span class="glyphicon glyphicon-envelope"></span>QQ邮箱:123456@qq.com</li>
							<li><span class="glyphicon glyphicon-phone"></span>联系电话:12345678900</li>
						</ul>
					</div>
					<div class="col-sm-6 col-xs-12 padT80">
						<form>
							<!-- "name" is not a valid input type; browsers fall back to text, so state it explicitly -->
							<input placeholder="姓名" type="text" id="name"/>
							<input placeholder="邮箱" type="email" id="email"/>
							<input placeholder="电话" type="text" id="text"/>
							<textarea placeholder="消息" rows="5"></textarea>
						</form>
						<a class="btn btn-primary">发送</a>
					</div>
					
				</div>
			</div>
		</div>
		<footer>
			<nav>
				<ul>
					<li><a href="index.html">网站首页</a></li>
					<li><a href="fan.html">国创番剧</a></li>
					<li><a href="data.html">数据统计</a></li>
					<li><a href="contact.html">联系作者</a></li>
				</ul>
			</nav>
			<p>
				<span class="glyphicon glyphicon-phone-alt"></span>123-12345678
				<span class="glyphicon glyphicon-earphone"></span>12345678900
				<span class="glyphicon glyphicon-envelope"></span>123456@123.com
			</p>
		</footer>
		<script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
		<script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
	</body>
</html>

3、结果

表格:
爬虫实战:bilibili番剧排名爬取并数据可视化_第1张图片
数据库:
爬虫实战:bilibili番剧排名爬取并数据可视化_第2张图片
首页:
爬虫实战:bilibili番剧排名爬取并数据可视化_第3张图片
番剧页:
爬虫实战:bilibili番剧排名爬取并数据可视化_第4张图片
词云页:
爬虫实战:bilibili番剧排名爬取并数据可视化_第5张图片
联系页:
爬虫实战:bilibili番剧排名爬取并数据可视化_第6张图片

你可能感兴趣的:(Spiders)