Scraping Bilibili video comments with the requests library, then segmenting with jieba and visualizing word frequencies with pyecharts

Environment:
python 3.x
pyecharts
snapshot_phantomjs
numpy
pandas
jieba
requests
json
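
The third-party packages can be installed with pip (the PyPI name of snapshot_phantomjs is snapshot-phantomjs; PhantomJS itself must be installed separately and be on the PATH):

pip install requests jieba numpy pandas pyecharts snapshot-phantomjs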

# Data scraping via the Bilibili API

# Bilibili API details: https://github.com/Vespa314/bilibili-api/blob/master/api.md
import requests
import json
# Example endpoint: https://api.bilibili.com/x/v2/reply?type=1&oid=77862151&pn=1&nohot=1&sort=0
info_list = []
item = 77862151  # the video's AV number (passed to the API as oid)
def saveTxt(filename, filecontent):
    # Append every comment to <av-number>.txt, one comment per blank-line-separated block
    filename = str(filename) + ".txt"
    with open(filename, "a", encoding='utf-8') as txt:
        for content in filecontent:
            txt.write(content.replace('\n', ' ') + '\n\n')
    print("File written")
def getAllCommentList(item):
    # First request: read the total comment count to work out how many pages to fetch
    url = "https://api.bilibili.com/x/v2/reply?type=1&oid=" + str(item) + "&pn=1&sort=0"
    r = requests.get(url)
    json_text = json.loads(r.text)
    commentsNum = json_text["data"]["page"]["count"]
    page = commentsNum // 20 + 1  # the API returns 20 replies per page
    for n in range(1, page + 1):
        url = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=" + str(n) + "&type=1&oid=" + str(item) + "&sort=1"
        req = requests.get(url)
        json_text_list = json.loads(req.text)
        replies = json_text_list["data"]["replies"]
        if not replies:  # the last page may be empty
            continue
        for reply in replies:
            info_list.append(reply['content']['message'])
getAllCommentList(item)
saveTxt(item,info_list)
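
Bilibili may throttle rapid anonymous requests. A minimal sketch of a politer single-page fetch, assuming a browser-style User-Agent header (the header value is illustrative) and a short pause between pages:

import time
import requests

headers = {"User-Agent": "Mozilla/5.0"}  # illustrative browser UA string
url = "https://api.bilibili.com/x/v2/reply?type=1&oid=77862151&pn=1&sort=0"
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()  # fail loudly on HTTP errors
time.sleep(1)  # pause before requesting the next page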

import numpy as np
import pandas as pd
import jieba
import jieba.analyse

# Widen pandas' column display so long comments are not truncated
pd.set_option('display.max_colwidth', 500)

# Load the scraped comments and drop empty rows
rows = pd.DataFrame(info_list)
rows.dropna(axis=0, how='any', inplace=True)
segments = []
for index, row in rows.iterrows():
    content = row[0]
    # TextRank keyword extraction, restricted to the listed parts of speech
    # (ns: place name, n: noun, vn: verbal noun, v: verb)
    words = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    for word in words:
        # Record each extracted keyword with count 1; counts are summed later
        segments.append({'word': word, 'count': 1})
dfSg = pd.DataFrame(segments)

# Word-frequency statistics: sum the per-occurrence counts for each word
dfWord = dfSg.groupby('word')['count'].sum()
# Export the counts to CSV
dfWord.to_csv('keywords.csv', encoding='utf-8')
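
TextRank is only one of jieba's extractors. jieba.analyse.extract_tags (TF-IDF based) accepts the same topK/withWeight/allowPOS parameters and can be swapped in; a minimal sketch on a throwaway sentence:

import jieba.analyse

# TF-IDF keyword extraction as an alternative to TextRank
tags = jieba.analyse.extract_tags("这部视频的弹幕和评论都很精彩", topK=10, withWeight=True)
for word, weight in tags:
    print(word, weight)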
# Convert the word-count Series into (word, count) pairs for pyecharts
words = list(dfWord.items())
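
With thousands of distinct words the cloud becomes unreadable. An optional filtering step, keeping only the most frequent terms (the cutoff of 200 is an arbitrary choice):

# Keep only the 200 most frequent words before plotting
words = list(dfWord.sort_values(ascending=False).head(200).items())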
# Render the word cloud to an HTML page
import pyecharts.options as opts
from pyecharts.charts import WordCloud

(
    WordCloud()
    .add(series_name="Hot Topics", data_pair=words, word_size_range=[6, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Hot Topics", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    .render("basic_wordcloud.html")
)
# Render a diamond-shaped word cloud
from pyecharts.globals import SymbolType

c = (
    WordCloud()
    .add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    .set_global_opts(title_opts=opts.TitleOpts(title="WordCloud-shape-diamond"))
)

c.render_notebook()  # displays inline when run in a Jupyter notebook

# Export the chart to a PNG image (requires PhantomJS on the PATH)
from snapshot_phantomjs import snapshot
from pyecharts.render import make_snapshot

make_snapshot(snapshot, c.render(), "bar0.png")
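
PhantomJS is no longer maintained, so snapshot_phantomjs can be fiddly to install. pyecharts also supports a selenium backend; a minimal sketch, assuming Chrome plus a matching chromedriver and pip install snapshot-selenium:

# Alternative PNG export via selenium
from snapshot_selenium import snapshot as selenium_snapshot
from pyecharts.render import make_snapshot

make_snapshot(selenium_snapshot, c.render(), "wordcloud.png")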
