本文涉及的完整代码放在文末,希望能获得你的认可和一个小小的赞
更为详细的代码介绍和课程设计在我的Python项目专栏中,有需要的uu可以自行查看,代码链接在总结的github链接处
项目链接
一 课程设计任务说明及总体设计说明
二 总体设计说明
三 核心功能与代码
四 问题与讨论
参考了:IT私塾
想看最终效果图可以直接移步至三 核心功能与代码
1.requests:请求网页必需的模块
2.json:处理JSON格式的数据
3.xlwt:用来生成excel文件(.xls)
4.sqlite3:将数据存储到数据库文件(.db)中
import requests as rq, json as js
import xlwt
import sqlite3 # 进行SQLite数据库操作
爬取热门榜单的数据
以下注释可供参考
# Browser-like User-Agent sent with every request. Built once, outside the
# loop, and actually passed to rq.get() below.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# One row per video:
# [title, category, uploader, views, replies, likes, coins, favorites, shares, link]
datalist = []
# Pages 1-4 of the "popular" endpoint; the page range was found by
# experiment (ps=50 is presumably the page size, giving 200 videos).
for page in range(1, 5):
    r = rq.get(
        'https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
        headers=headers,
        timeout=10,  # never hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of with a confusing KeyError below.
    r.raise_for_status()
    # The body is decoded explicitly as UTF-8, so no charset detection is needed.
    data = js.loads(r.content.decode('utf8'))['data']['list']
    for d in data:
        stat = d['stat']
        datalist.append([
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # replies
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
            d['short_link'],     # link
        ])

# Rows ordered by view count (index 3), highest first.
b = sorted(datalist, key=lambda x: x[3], reverse=True)
保存数据到excel文件中
def saveData(datalist, savepath, max_rows=200):
    """Write scraped video rows to an .xls workbook.

    Args:
        datalist: sequence of rows; each row is a sequence whose fields
            follow the order of the header tuple ``col`` below.
        savepath: path of the workbook file to write.
        max_rows: maximum number of data rows written (default 200, the
            previous fixed limit); pass ``None`` to write every row.

    Fewer rows than ``max_rows`` or rows shorter than the header are
    written as-is instead of raising IndexError.
    """
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = book.add_sheet('哔哩哔哩Top200', cell_overwrite_ok=True)  # create the worksheet
    col = ("标题","类型","up主","播放量","评论","点赞","投币","收藏","分享","链接")
    # Row 0 holds the column headers.
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Data rows start at sheet row 1; iterate the rows that actually exist.
    for i, row in enumerate(datalist[:max_rows], start=1):
        for j, value in enumerate(row[:len(col)]):
            sheet.write(i, j, value)
    book.save(savepath)
# Export the view-sorted rows to an .xls workbook in the current directory.
savepath = "bilibili.xls"
saveData(datalist=b, savepath=savepath)
保存数据到数据库中
def saveData2DB(datalist, dbpath):
    """Insert scraped video rows into the ``bilibili200`` table.

    Args:
        datalist: iterable of 10-field rows in the order
            (title, ttype, up, views, reply, dianzan, coin, favorite,
            share, link).
        dbpath: path of the SQLite database file.

    Values are bound as SQL parameters, so titles containing quotes are
    stored verbatim and cannot alter the statement. The caller's rows are
    not modified. All rows are inserted in one transaction: either every
    row is committed or, on error, none is.
    """
    init_db(dbpath)
    sql = '''
        insert into bilibili200(
        title,ttype,up,views,reply,dianzan,coin,favorite,share,link)
        values (?,?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        # The connection context manager commits on success and rolls
        # back if an exception escapes the block.
        with conn:
            conn.executemany(sql, [tuple(row) for row in datalist])
    finally:
        # Always release the database file, even when the insert fails.
        conn.close()
def init_db(dbpath):
    """Create the ``bilibili200`` table in the database at ``dbpath``.

    Safe to call repeatedly: the table is only created when it does not
    exist yet, so a second run against the same file does not raise.
    """
    sql = '''
        create table if not exists bilibili200
        (
        id integer primary key autoincrement,
        title varchar,
        ttype varchar,
        up varchar,
        views numeric,
        reply numeric,
        dianzan numeric,
        coin numeric,
        favorite numeric,
        share numeric,
        link text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Close the connection even if table creation fails.
        conn.close()
# dbpath = "bilibili.db" #当前目录新建数据库,存储进去
# saveData2DB(b,dbpath)
csv文件存储:对爬取到的热门视频按类型分组、排序比较,再写入csv文件
# -*- coding: utf-8 -*-
import requests as rq, json as js
import csv
# Browser-like User-Agent sent with every request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}


def composite_score(stat):
    """Return the weighted score written to both CSVs: 0.3*likes + 0.7*replies."""
    return stat['like'] * 0.3 + stat['reply'] * 0.7


# One row per video:
# [title, category, uploader, views, replies, likes, coins, favorites, shares]
datalist = []
# Raw video dicts from every page. The exports below read this list so
# that all fetched pages are covered, not only the last one.
videos = []
for page in range(1, 5):
    r = rq.get(
        'https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
        headers=headers,
        timeout=10,  # never hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of with a confusing KeyError below.
    r.raise_for_status()
    data = js.loads(r.content.decode('utf8'))['data']['list']
    videos.extend(data)
    for d in data:
        stat = d['stat']
        datalist.append([
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # replies
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
        ])

for video in videos:
    print(video['tname'])

# Distinct categories in first-seen order, so the CSV layout is the same
# on every run (plain set iteration order is not stable between runs).
categories = list(dict.fromkeys(video['tname'] for video in videos))

# CSV 1: videos grouped by category.
with open('bilibili热门分类.csv', 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['分类', '视频', '类型', '点赞', '评论','综合评分'])
    rows = []
    for category in categories:
        # Only the videos that belong to this category go into its section.
        group = [
            (category, v["title"], v["tname"],
             v['stat']['like'],
             v['stat']['reply'], composite_score(v['stat']))
            for v in videos
            if v['tname'] == category
        ]
        # Within a category, order by reply count (index 4), highest first;
        # the sort is stable, so ties keep their fetch order.
        # NOTE(review): the second CSV orders by composite score instead --
        # confirm which key is intended here.
        group.sort(key=lambda row: row[4], reverse=True)
        rows.extend(group)
    writer.writerows(rows)

# CSV 2: all videos ranked by composite score (index 4), highest first.
with open('bilibili热门分类-1.csv', 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([ '视频', '类型', '点赞', '评论','综合评分'])
    ranked = sorted(
        (
            (v["title"], v["tname"],
             v['stat']['like'],
             v['stat']['reply'], composite_score(v['stat']))
            for v in videos
        ),
        key=lambda row: row[4],
        reverse=True,
    )
    writer.writerows(ranked)
一次小小的项目经验吧
已上传至github上,有需要可以自取(切勿做非法用途)
项目地址
bilibilispider.py
import requests as rq, json as js
import xlwt
import sqlite3 # 进行SQLite数据库操作
# Browser-like User-Agent sent with every request; built once, outside the loop.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# One row per video:
# [title, category, uploader, views, replies, likes, coins, favorites, shares, link]
datalist = []
# Pages 1-4 of the "popular" endpoint; the page range was found by
# experiment (ps=50 is presumably the page size, giving 200 videos).
for page in range(1, 5):
    r = rq.get(
        'https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
        headers=headers,
        timeout=10,  # never hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of with a confusing KeyError below.
    r.raise_for_status()
    # The body is decoded explicitly as UTF-8, so no charset detection is needed.
    data = js.loads(r.content.decode('utf8'))['data']['list']
    for d in data:
        stat = d['stat']
        datalist.append([
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # replies
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
            d['short_link'],     # link
        ])

# Rows ordered by view count (index 3), highest first.
b = sorted(datalist, key=lambda x: x[3], reverse=True)
#定义函数存储数据到excel文件中
def saveData(datalist, savepath, max_rows=200):
    """Write scraped video rows to an .xls workbook.

    Args:
        datalist: sequence of rows; each row is a sequence whose fields
            follow the order of the header tuple ``col`` below.
        savepath: path of the workbook file to write.
        max_rows: maximum number of data rows written (default 200, the
            previous fixed limit); pass ``None`` to write every row.

    Fewer rows than ``max_rows`` or rows shorter than the header are
    written as-is instead of raising IndexError.
    """
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = book.add_sheet('哔哩哔哩Top200', cell_overwrite_ok=True)  # create the worksheet
    col = ("标题","类型","up主","播放量","评论","点赞","投币","收藏","分享","链接")
    # Row 0 holds the column headers.
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Data rows start at sheet row 1; iterate the rows that actually exist.
    for i, row in enumerate(datalist[:max_rows], start=1):
        for j, value in enumerate(row[:len(col)]):
            sheet.write(i, j, value)
    book.save(savepath)
# Export the view-sorted rows to an .xls workbook in the current directory.
savepath = "bilibili.xls"
saveData(datalist=b, savepath=savepath)
def saveData2DB(datalist, dbpath):
    """Insert scraped video rows into the ``bilibili200`` table.

    Args:
        datalist: iterable of 10-field rows in the order
            (title, ttype, up, views, reply, dianzan, coin, favorite,
            share, link).
        dbpath: path of the SQLite database file.

    Values are bound as SQL parameters, so titles containing quotes are
    stored verbatim and cannot alter the statement. The caller's rows are
    not modified. All rows are inserted in one transaction: either every
    row is committed or, on error, none is.
    """
    init_db(dbpath)
    sql = '''
        insert into bilibili200(
        title,ttype,up,views,reply,dianzan,coin,favorite,share,link)
        values (?,?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        # The connection context manager commits on success and rolls
        # back if an exception escapes the block.
        with conn:
            conn.executemany(sql, [tuple(row) for row in datalist])
    finally:
        # Always release the database file, even when the insert fails.
        conn.close()
def init_db(dbpath):
    """Create the ``bilibili200`` table in the database at ``dbpath``.

    Safe to call repeatedly: the table is only created when it does not
    exist yet, so a second run against the same file does not raise.
    """
    sql = '''
        create table if not exists bilibili200
        (
        id integer primary key autoincrement,
        title varchar,
        ttype varchar,
        up varchar,
        views numeric,
        reply numeric,
        dianzan numeric,
        coin numeric,
        favorite numeric,
        share numeric,
        link text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Close the connection even if table creation fails.
        conn.close()
# dbpath = "bilibili.db" #当前目录新建数据库,存储进去
# saveData2DB(b,dbpath)