from urllib import request
from bs4 import BeautifulSoup
def get_html(url):
    req = request.urlopen(url)
    html = req.read().decode('utf-8')
    return html
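If the site rejects urllib's default user agent, a browser-like header can be supplied instead. A minimal optional variant (the header string is illustrative):

# Optional variant of get_html that sends a browser-like User-Agent,
# in case the default urllib agent gets blocked (an assumption)
def get_html_with_ua(url):
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as resp:
        return resp.read().decode('utf-8')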
# Scrape the URL of each episode
start_url = 'https://v.qq.com/detail/x/xbd1y6fvwl3maoz.html'
# Fetch the page content
html = get_html(start_url)
# Parse it
soup = BeautifulSoup(html, 'html.parser')
# CSS selector for the episode links
episodes = soup.select('div.mod_episode span.item a')
for episode in episodes:
    # Episode number
    eno = int(episode.get_text().strip())
    # Episode URL
    eurl = episode['href']
    print('{:02} {}'.format(eno, eurl))
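To reuse these links later, for example to fetch danmu for every episode, they can also be collected into a dict keyed by episode number. An optional sketch:

# Optional: keep the episode links keyed by episode number for batch work
episode_urls = {int(a.get_text().strip()): a['href']
                for a in soup.select('div.mod_episode span.item a')}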
# Fetch the bullet comments (danmu)
import re
# Link to episode 1 of 《亲爱的热爱的》 (Go Go Squid!)
episode_url = 'https://v.qq.com/x/cover/xbd1y6fvwl3maoz/t00313mumzy.html'
vid = re.split(r'[/.]', episode_url)[-2]
print('vid={}'.format(vid))
target_url = 'http://bullet.video.qq.com/fcgi-bin/target/regist?otype=json&vid='+vid
html = get_html(target_url)
target_id = re.search(r'(?<=targetid=)\d+',html).group()
print(target_id)
bullet_url = "http://mfm.video.qq.com/danmu?timestamp={}&target_id={}".format(0,target_id)
html = get_html(bullet_url)
import json
data = json.loads(html, strict=False)
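The request above only fetches the first window of comments (timestamp=0). The endpoint appears to page by timestamp; the sketch below walks successive windows and stops when a window comes back empty. Both the 30-second step size and the safety cap are assumptions, not documented behavior:

# Hypothetical pager over the danmu endpoint (step size and cap are assumptions)
all_comments = []
ts = 0
while ts < 3600:  # safety cap: stop after one hour of video
    page_url = 'http://mfm.video.qq.com/danmu?timestamp={}&target_id={}'.format(ts, target_id)
    page = json.loads(get_html(page_url), strict=False)
    comments = page.get('comments', [])
    if not comments:
        break
    all_comments.extend(comments)
    ts += 30  # assumed window size in seconds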
# Collect into a DataFrame
import pandas as pd
rows = []
for item in data['comments']:
    # Strip all whitespace and double quotes from the text fields
    content = re.sub(r'"', '', ''.join(item['content'].split()))
    name = re.sub(r'"', '', ''.join(item['opername'].split()))
    rows.append({'commentid': item['commentid'],
                 'content': content,
                 'name': name,
                 'upcount': item['upcount'],
                 'degree': item['uservip_degree'],
                 'timepoint': item['timepoint']})
# DataFrame.append was removed in pandas 2.0, so build from a list of dicts
df = pd.DataFrame(rows, columns=['commentid', 'content', 'name',
                                 'upcount', 'degree', 'timepoint'])
df
# Import the jieba package
import jieba
def read_stopword():
    """
    Read the stopword list, one word per line.
    """
    stopword_file = 'stopword.txt'
    stopword = []
    with open(stopword_file) as f:
        for line in f:
            word = line.strip('\n')
            if word:
                stopword.append(word)
    return stopword
# Word segmentation
total_list = []
for index, row in df.iterrows():
    content = str(row['content']).strip()
    try:
        if not content:
            continue
        word_list = jieba.cut(content, cut_all=False)
        for word in word_list:
            total_list.append(word)
    except Exception:
        print('exception occurred with content "{}"'.format(str(row['content'])))
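Note that the stopword list is only applied inside WordCloud below; if you also want total_list itself cleaned (for word counts or other statistics), a one-line filter using the read_stopword() helper above works:

# Optional: filter stopwords out of the token list directly
stopword_set = set(read_stopword())
total_list_filtered = [w for w in total_list if w not in stopword_set]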
# Draw the word cloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud
total_list_str = ' '.join(total_list)
# Configure the word cloud
stopwords = read_stopword()
wc = WordCloud(background_color='pink',                # background color
               mask=np.array(Image.open('xin.png')),   # background shape image
               max_words=500,                          # maximum number of words shown
               stopwords=stopwords,                    # stopword list
               font_path='DroidSansFallbackFull.ttf',  # a font that covers Chinese glyphs
               max_font_size=60,                       # maximum font size
               random_state=30,                        # number of random color schemes
               )
# Generate the word cloud
myword = wc.generate(total_list_str)
# Display it
fig = plt.figure(figsize=(10, 10))
plt.imshow(myword)
plt.title('word cloud')
plt.axis('off')
plt.show()
plt.close(fig)
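WordCloud can also write the rendered image straight to disk with to_file; the file name here is just an example:

# Optional: save the rendered cloud as a PNG
wc.to_file('wordcloud_ep1.png')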
# Sentiment trend chart
from snownlp import SnowNLP
def sentiment(row):
    # Score one comment: SnowNLP returns a positivity probability in [0, 1]
    content = str(row['content']).strip()
    s = SnowNLP(content)
    score = float(s.sentiments)
    return score
def minute(row):
    # Convert the timepoint (seconds) to whole minutes
    return int(int(row['timepoint']) / 60)
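SnowNLP's sentiments is the probability that a text is positive, so a quick sanity check on two obvious sentences is worthwhile before scoring the whole DataFrame:

# Sanity check: a clearly positive and a clearly negative comment
print(SnowNLP('这部剧太好看了').sentiments)  # should be close to 1
print(SnowNLP('太难看了').sentiments)        # should be close to 0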
df['score'] = df.apply(sentiment, axis=1)
df1 = df['score'].groupby(df['timepoint']).mean()
fig = plt.figure(figsize=(10, 4.5))
plt.plot(df1, lw=2)
plt.title('bullet comment trend')
plt.xlabel('time (s)')
plt.ylabel('score')
plt.ylim(0, 1)
plt.axhline(0.5, color='orange')
plt.show()
plt.close(fig)
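The script below reads a tab-separated intermediate file whose columns 6 to 8 hold segments, score and timepoint2. How that file was produced is not shown in this post; one way to write a compatible file from df, assuming the first six columns are the comment fields in the order used above, is:

# Hypothetical writer for the intermediate file consumed below; the column
# order is an assumption chosen so that usecols=[6, 7, 8] lines up
df['segments'] = df['content'].map(lambda c: '_'.join(jieba.cut(str(c))))
df['timepoint2'] = df.apply(minute, axis=1)
cols = ['commentid', 'content', 'name', 'upcount', 'degree', 'timepoint',
        'segments', 'score', 'timepoint2']
df[cols].to_csv('sentiments_gogosquid_1_t00313mumzy.txt', sep='\t',
                header=False, index=False)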
Producing the word cloud and sentiment trend chart for episode 1 of 《亲爱的热爱的》 (Go Go Squid!)
from snownlp import SnowNLP
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud
import jieba
# Word cloud and sentiment trend chart for one episode
def plot_cloud_and_trend(input_file, max_word_count):
    # Episode number parsed from the file name
    eno = int(re.search(r'_(\d+)_\w+?\.txt', input_file).group(1))
    df = pd.read_csv(input_file, sep='\t', quotechar='"', header=None,
                     usecols=[6, 7, 8], names=['segments', 'score', 'timepoint2'])
    # Set up the canvas
    fig = plt.figure(figsize=(10, 10), dpi=80)
    # Word cloud
    total_list = []
    for index, row in df.iterrows():
        segments = list(row['segments'].split('_'))
        total_list += segments
    total_list_str = ' '.join(total_list)
    # Configure the word cloud
    stopwords = read_stopword()
    wc = WordCloud(background_color='white',               # background color
                   mask=np.array(Image.open('xin.png')),   # background shape image
                   max_words=max_word_count,               # maximum number of words shown
                   stopwords=stopwords,                    # stopword list
                   font_path='DroidSansFallbackFull.ttf',  # a font that covers Chinese glyphs
                   max_font_size=60,                       # maximum font size
                   random_state=30,                        # number of random color schemes
                   )
    # Generate the word cloud
    myword = wc.generate(total_list_str)
    # Draw it in the top-left quarter of the figure
    ax1 = plt.subplot(2, 2, 1)
    plt.imshow(myword)
    plt.title('word cloud of episode {}'.format(eno))
    plt.axis('off')
    # Sentiment trend chart in the bottom half
    df1 = df['score'].groupby(df['timepoint2']).mean()
    ax2 = plt.subplot(2, 1, 2)
    plt.plot(df1, lw=2)
    plt.title('bullet comment of episode {}'.format(eno))
    plt.xlabel('time (min)')
    plt.ylabel('score')
    plt.ylim(0, 1)
    plt.axhline(0.5, color='orange')
    plt.show()
    plt.close(fig)
input_file = 'sentiments_gogosquid_1_t00313mumzy.txt'
plot_cloud_and_trend(input_file, 500)
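Assuming one such file exists per episode under the same naming scheme, every episode can be plotted in one loop (the glob pattern is an assumption based on that scheme):

# Hypothetical batch run over every per-episode file
import glob
for f in sorted(glob.glob('sentiments_gogosquid_*_*.txt')):
    plot_cloud_and_trend(f, 500)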
Appendix: resources used in this article: https://pan.baidu.com/s/1giMJwVoCl2S0LzMFRFHqyA (extraction code: ii1m)
• [Web scraping basics](https://www.cnblogs.com/hanmk/p/8724162.html)
• [pandas tutorial (Chinese)](https://www.yiibai.com/pandas)
• [matplotlib tutorial (Chinese)](https://www.matplotlib.org.cn/tutorials/index.html)
• [Word cloud analysis](https://blog.csdn.net/weixin_41782574/article/details/82259719)
• [Sentiment analysis](https://blog.csdn.net/google19890102/article/details/80091502)