Bullet Comment Data Scraping and Visualization (Python)


Data Scraping

from urllib import request
from bs4 import BeautifulSoup

def get_html(url):
    req = request.urlopen(url)
    html = req.read().decode('utf-8')
    return html

# Scrape the URL of each episode
start_url = 'https://v.qq.com/detail/x/xbd1y6fvwl3maoz.html'
# Fetch the page content
html = get_html(start_url)
# Parse it
soup = BeautifulSoup(html, 'html.parser')
# CSS selector for the episode list
episodes = soup.select('div.mod_episode span.item a')
for episode in episodes:
    # Episode number
    eno = int(episode.get_text().strip())
    # Episode URL
    eurl = episode['href']
    print('{:02} {}'.format(eno, eurl))
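Each episode's href already ends in the vid that the danmu API below expects, so the whole season's vids can be collected in one pass. A minimal sketch, assuming every href follows the same `.../<vid>.html` pattern:

# Map episode number -> vid, parsed from the second-to-last path component of the href
import re

episode_vids = {}
for episode in episodes:
    eno = int(episode.get_text().strip())
    episode_vids[eno] = re.split(r'[/.]', episode['href'])[-2]
print(episode_vids)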


# Fetch the bullet comments
import re
import json

# URL of episode 1 of 亲爱的热爱的 (Go Go Squid!)
episode_url = 'https://v.qq.com/x/cover/xbd1y6fvwl3maoz/t00313mumzy.html'
# The vid is the second-to-last path component of the episode URL
vid = re.split(r'[/.]', episode_url)[-2]
print('vid={}'.format(vid))
# Look up the target_id for this vid
target_url = 'http://bullet.video.qq.com/fcgi-bin/target/regist?otype=json&vid=' + vid
html = get_html(target_url)
target_id = re.search(r'(?<=targetid=)\d+', html).group()
print(target_id)
# Request the first batch of bullet comments (timestamp 0)
bullet_url = 'http://mfm.video.qq.com/danmu?timestamp={}&target_id={}'.format(0, target_id)
html = get_html(bullet_url)
data = json.loads(html, strict=False)
# Collect the rows, then build a DataFrame (df.append was removed in pandas 2.0)
import pandas as pd

rows = []
for item in data['comments']:
    # Remove all whitespace and double quotes from the text fields
    content = re.sub(r'"', '', ''.join(item['content'].split()))
    name = re.sub(r'"', '', ''.join(item['opername'].split()))
    rows.append({'commentid': item['commentid'], 'content': content,
                 'name': name, 'upcount': item['upcount'],
                 'degree': item['uservip_degree'], 'timepoint': item['timepoint']})

df = pd.DataFrame(rows, columns=['commentid', 'content', 'name', 'upcount', 'degree', 'timepoint'])

df
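The request above only fetches the first window of comments. The danmu endpoint pages by timestamp; below is a sketch that walks forward through the episode. The 30-second window size and the empty-response stop condition are assumptions based on commonly observed behavior, not a documented API:

# Fetch all comment pages by stepping the timestamp parameter (assumed 30 s windows)
import json

all_comments = []
timestamp = 0
while True:
    url = 'http://mfm.video.qq.com/danmu?timestamp={}&target_id={}'.format(timestamp, target_id)
    page = json.loads(get_html(url), strict=False)
    comments = page.get('comments') or []
    if not comments:
        break  # assume an empty window means we are past the end of the episode
    all_comments.extend(comments)
    timestamp += 30  # assumed window size

The row-building loop above can then run over all_comments instead of data['comments'].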

Word Cloud

# Import the jieba segmentation package
import jieba

def read_stopword():
    """Read the stop word list, one word per line."""
    stopword_file = 'stopword.txt'
    stopword = []
    with open(stopword_file, encoding='utf-8') as f:
        for line in f:
            word = line.strip('\n')
            if word:
                stopword.append(word)
    return stopword


# Word segmentation
total_list = []
for index, row in df.iterrows():
    content = str(row['content']).strip()
    try:
        if not content:
            continue
        word_list = jieba.cut(content, cut_all=False)
        for word in word_list:
            total_list.append(word)
    except Exception:
        print('exception occurred with content "{}"'.format(str(row['content'])))
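jieba's default dictionary can split names and show-specific slang into fragments; registering them with jieba.add_word keeps them as single tokens. This must run before the segmentation loop above, and the example entries are illustrative assumptions, not terms verified against this data:

# Register domain-specific terms before segmentation so jieba keeps them whole.
# These entries are illustrative; replace them with terms observed in the comments.
jieba.add_word('现男友')
jieba.add_word('韩商言')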


# Draw the word cloud (scipy.misc.imread has been removed from SciPy; use PIL + numpy)
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud

total_list_str = ' '.join(total_list)
# Configure the word cloud
stopwords = read_stopword()
wc = WordCloud(background_color='pink',  # background color
               mask=np.array(Image.open('xin.png')),  # mask image that shapes the cloud
               max_words=500,  # maximum number of words to display
               stopwords=stopwords,  # stop words
               font_path='DroidSansFallbackFull.ttf',  # a font with CJK glyph coverage
               max_font_size=60,  # maximum font size
               random_state=30,  # number of random color schemes
               )

# Generate the word cloud
myword = wc.generate(total_list_str)
# Display it
fig = plt.figure(figsize=(10, 10))
plt.imshow(myword)
plt.title('word cloud')
plt.axis('off')
plt.show()
plt.close(fig)
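WordCloud can also write the rendered image straight to disk, which is handy when running outside a notebook (the filename here is arbitrary):

# Save the rendered cloud to disk
wc.to_file('wordcloud_ep1.png')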

Sentiment Trend Chart

# Sentiment trend chart
from snownlp import SnowNLP

def sentiment(row):
    """Score a comment's sentiment in [0, 1] with SnowNLP."""
    content = str(row['content']).strip()
    s = SnowNLP(content)
    return float(s.sentiments)

def minute(row):
    """Convert the timepoint (seconds) to minutes, for coarser grouping if desired."""
    return int(int(row['timepoint']) / 60)

df['score'] = df.apply(sentiment, axis=1)

# Average sentiment per second of playback
df1 = df['score'].groupby(df['timepoint']).mean()

fig = plt.figure(figsize=(10, 4.5))
plt.plot(df1, lw=2)
plt.title('bullet comment trend')
plt.xlabel('time (s)')
plt.ylabel('score')
plt.ylim(0, 1)
plt.axhline(0.5, color='orange')  # neutral-sentiment baseline
plt.show()
plt.close(fig)
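Per-second averages are noisy because few comments land on any single second; a rolling mean over the grouped series gives a smoother trend (the window size is a tunable assumption):

# Smooth the per-second sentiment with a 30-point rolling mean (window size is arbitrary)
df1_smooth = df1.rolling(window=30, min_periods=1).mean()

fig = plt.figure(figsize=(10, 4.5))
plt.plot(df1_smooth, lw=2)
plt.title('bullet comment trend (smoothed)')
plt.xlabel('time (s)')
plt.ylabel('score')
plt.ylim(0, 1)
plt.axhline(0.5, color='orange')
plt.show()
plt.close(fig)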

Complete Example

Word cloud and sentiment trend chart for episode 1 of 亲爱的热爱的 (Go Go Squid!)

from snownlp import SnowNLP
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud
import jieba
 
# Word cloud and sentiment trend chart for episode 1


def plot_cloud_and_trend(input_file, max_word_count):
    # Episode number is encoded in the file name, e.g. sentiments_gogosquid_1_<vid>.txt
    eno = int(re.search(r'_(\d+)_\w+?\.txt', input_file).group(1))
    df = pd.read_csv(input_file, sep='\t', quotechar='"', header=None,
                     usecols=[6, 7, 8], names=['segments', 'score', 'timepoint2'])
    # Set up the canvas
    fig = plt.figure(figsize=(10, 10), dpi=80)

    # Word cloud
    total_list = []
    for index, row in df.iterrows():
        segments = list(row['segments'].split('_'))
        total_list += segments
    total_list_str = ' '.join(total_list)
    # Configure the word cloud (read_stopword is defined in the word cloud section above)
    stopwords = read_stopword()
    wc = WordCloud(background_color='white',  # background color
                   mask=np.array(Image.open('xin.png')),  # mask image that shapes the cloud
                   max_words=max_word_count,  # maximum number of words to display
                   stopwords=stopwords,  # stop words
                   font_path='DroidSansFallbackFull.ttf',  # a font with CJK glyph coverage
                   max_font_size=60,  # maximum font size
                   random_state=30,  # number of random color schemes
                   )
    # Generate the word cloud
    myword = wc.generate(total_list_str)
    # Draw it in the top-left cell of the grid
    ax1 = plt.subplot(2, 2, 1)
    plt.imshow(myword)
    plt.title('word cloud of episode {}'.format(eno))
    plt.axis('off')

    # Sentiment trend chart: average score per minute of playback
    df1 = df['score'].groupby(df['timepoint2']).mean()
    ax2 = plt.subplot(2, 1, 2)
    plt.plot(df1, lw=2)
    plt.title('bullet comment trend of episode {}'.format(eno))
    plt.xlabel('time (min)')
    plt.ylabel('score')
    plt.ylim(0, 1)
    plt.axhline(0.5, color='orange')
    plt.show()
    plt.close(fig)


input_file = 'sentiments_gogosquid_1_t00313mumzy.txt'

plot_cloud_and_trend(input_file,500)
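Assuming one sentiment file per episode with the same naming pattern as the sample above, the whole season can be rendered in a single loop (the glob pattern is an assumption derived from that sample filename):

# Render the word cloud and trend chart for every episode file found on disk.
# Note: sorted() is lexicographic, so episode 10 sorts before 2; sort numerically if needed.
import glob

for input_file in sorted(glob.glob('sentiments_gogosquid_*_*.txt')):
    plot_cloud_and_trend(input_file, 500)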

Appendix: resources used in this article: https://pan.baidu.com/s/1giMJwVoCl2S0LzMFRFHqyA (extraction code: ii1m)

References

[Web scraping basics](https://www.cnblogs.com/hanmk/p/8724162.html)

[pandas tutorial (Chinese)](https://www.yiibai.com/pandas)

[matplotlib tutorial (Chinese)](https://www.matplotlib.org.cn/tutorials/index.html)

[Word cloud analysis](https://blog.csdn.net/weixin_41782574/article/details/82259719)

[Sentiment analysis](https://blog.csdn.net/google19890102/article/details/80091502)