# 环境 (required environment):
#   python 3.X
#   pyecharts
#   snapshot_phantomjs
#   numpy (imported as np)
#   pandas (imported as pd)
#   jieba
#   codecs
#   requests
#   re
#   os
#   sys
#   json
#数据爬取 利用bilibiliapi接口
# B站API详情 https://github.com/Vespa314/bilibili-api/blob/master/api.md
import requests
import re
import os
import sys
import json
# Example reply-API request for this video:
# http://api.bilibili.com/x/reply?type=1&oid=77862151&pn=1&nohot=1&sort=0
info_list = list()  # filled in place with the crawled comment messages
item = 77862151  # av number (id) of the video whose comments are crawled
def saveTxt(filename,filecontent):
    """Append comment entries to the UTF-8 text file ``<filename>.txt``.

    Args:
        filename: Base name of the output file; ``".txt"`` is appended to
            ``str(filename)``.
        filecontent: Iterable of entries. An entry may be a plain message
            string, or a two-item sequence of strings that is written as
            ``"first second"``. ``None`` entries are skipped.

    Line breaks inside the message are removed and every written entry is
    followed by an empty line. The file is opened in append mode, so
    repeated calls keep adding to the same file.
    """
    filename = str(filename) + ".txt"
    # Open the file once instead of reopening it for every single entry.
    with open(filename, "a", encoding='utf-8') as txt:
        for content in filecontent:
            if content is None:
                # Nothing to write for a missing entry.
                continue
            if isinstance(content, str):
                # A bare message: indexing it would only pick single
                # characters, so write the whole text.
                line = content.replace('\n', '')
            else:
                line = content[0] + ' ' + content[1].replace('\n', '')
            txt.write(line + '\n\n')
            print("文件写入中")
def getAllCommentList(item):
    """Download the comments of one video and append them to ``info_list``.

    The first request reads the total comment count; the following requests
    walk through the reply pages and append each reply's message text to the
    module-level ``info_list``.

    Args:
        item: av number of the video; it is inserted into the request URLs
            as the ``oid`` query parameter.

    Returns:
        None. Results are collected in ``info_list``.
    """
    count_url = "http://api.bilibili.com/x/reply?type=1&oid=" + str(item) + "&pn=1&sort=0"#&nohot=1
    count_payload = json.loads(requests.get(count_url).text)
    commentsNum = count_payload["data"]["page"]["count"]

    # Ceiling division so a trailing, partially filled page is not skipped.
    # NOTE(review): assumes the API serves 20 replies per page - confirm.
    page_total = (commentsNum + 19) // 20

    for n in range(1, page_total + 1):
        url = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn="+str(n)+"&type=1&oid="+str(item)+"&sort=1"#&nohot: whether hot comments are included
        payload = json.loads(requests.get(url).text)
        # "data" or "replies" may be null/missing; treat that as an empty page
        # rather than crashing.
        replies = (payload.get("data") or {}).get("replies") or []
        for reply in replies:
            # Append the message exactly once (no nested append, which would
            # also store the None returned by list.append).
            info_list.append(reply['content']['message'])
# Crawl every comment of the configured video into info_list ...
getAllCommentList(item=item)
# ... then dump the collected entries to "<item>.txt".
saveTxt(filename=item, filecontent=info_list)
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs
# Let pandas print up to 500 characters per cell. The fully qualified option
# name is used so the lookup cannot become ambiguous.
pd.set_option('display.max_colwidth', 500)

# Load the crawled comments: one comment per row, text in column 0.
rows = pd.DataFrame(info_list)
# Drop rows that contain missing values (e.g. None entries in info_list).
rows.dropna(axis=0, how='any', inplace=True)

# One {'word', 'count'} record per extracted keyword occurrence.
segments = []
for _, row in rows.iterrows():
    content = row[0]
    # TextRank keyword extraction, restricted to the listed part-of-speech tags.
    keywords = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    segments.extend({'word': word, 'count': 1} for word in keywords)

# Explicit columns keep the frame well-formed (and groupby working) even when
# no keyword was extracted at all.
dfSg = pd.DataFrame(segments, columns=['word', 'count'])

# Word frequency: number of occurrences per keyword.
dfWord = dfSg.groupby('word')['count'].sum()

# Export the frequency table as CSV.
dfWord.to_csv('keywords.csv', encoding='utf-8')

# Convert to plain Python lists; tolist() yields native Python values, which
# the chart serialisation further down can handle.
data = pd.DataFrame(dfWord)
name = dfWord.index.tolist()
value = dfWord.tolist()
# (word, count) pairs consumed by the word-cloud charts.
words = list(zip(name, value))
#生成网页
from pyecharts.render import make_snapshot
import pyecharts.options as opts
from pyecharts.charts import WordCloud
# Render the keyword frequencies as an interactive word cloud and write it to
# "basic_wordcloud.html".
(
    WordCloud()
    # The (word, count) pairs live in `words`; the previously referenced name
    # `datas` is never defined and raised a NameError.
    .add(series_name="热点分析", data_pair=words, word_size_range=[6, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="热点分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    .render("basic_wordcloud.html")
)
#生成图
from pyecharts.render import make_snapshot
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
# Build a diamond-shaped word cloud from the (word, count) pairs in `words`.
c = WordCloud()
c.add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
c.set_global_opts(title_opts=opts.TitleOpts(title="WordCloud-shape-diamond"))
# Produce the notebook representation of the chart.
c.render_notebook()
#生成图片
from snapshot_phantomjs import snapshot
from pyecharts.render import make_snapshot
# Render the chart to an HTML file first, then let the snapshot engine turn
# that page into the image "bar0.png".
rendered_html = c.render()
make_snapshot(snapshot, rendered_html, "bar0.png")