Straight to the code:
Crawling danmu (bullet comments) from Youku:
# -*- coding: utf-8 -*-
from my_fake_useragent import UserAgent
from requests.exceptions import RequestException
from tqdm import tqdm
import requests
import time
import os
import re
def get_data(mat):
    """
    Fetch one batch of danmu (bullet comments).
    :param mat: offset
    :return: list
    """
    # Request URL
    url = 'https://service.danmu.youku.com/list?jsoncallback=jQuery111207035726936412456_1552483671572&mat={}&mcount=1&ct=1001&iid=959955945&aid=333822&cid=96&lid=0&ouid=0'.format(mat)
    # Request headers
    headers = {
        'Referer': 'https://v.youku.com/v_show/id_XMzgzOTgyMzc4MA==.html?spm=a2h0k.11417342.soresults.dplaybutton&s=c6c62a475a5d4a14ab48',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    """
    # Query parameters, equivalent to the ones baked into the URL above
    # (see the short params-based sketch after this script)
    params = {
        'jsoncallback': 'jQuery11120003560802190473389_1552479833762',
        'mat': mat,
        'mcount': '1',
        'ct': '1001',
        'iid': '959955945',
        'aid': '333822',
        'cid': '96',
        'lid': '0',
        'ouid': '0'
        # '_': '1552479833815'  # Note: looks like a timestamp; dropping it does not affect the data returned
    }
    """
    # Fetch the danmu
    try:
        response = requests.get(url, headers=headers)
        print(response)
        if response.status_code == 200:
            html = response.text
            # Parse with a regex (the result is a list)
            results = re.findall(',\"content\":\"(.*?)\",', html, re.S)
            # Save to a text file
            save_path = 'D:/python files/tanmu.txt'
            save_dir = os.path.dirname(save_path)
            if not os.path.exists(save_dir):  # Create the storage directory if it does not exist
                os.makedirs(save_dir)
            with open(save_path, 'a', encoding='utf-8') as f:
                f.write(str(results))
            return results
        return None
    except RequestException as e:
        print('Error: ', e.args)
        return None


if __name__ == '__main__':
    for i in tqdm(range(10), desc='Progress'):
        time.sleep(1)
        get_data(str(i))
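The commented-out params dict in the script does the same job as the long hand-built URL: requests can assemble the query string itself. Below is a minimal sketch of that variant, not part of the original script; the IDs are the example video's values from above and the User-Agent is shortened for illustration.

# Sketch: the same request expressed with a params dict, letting requests build the query string.
import requests

url = 'https://service.danmu.youku.com/list'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # shortened for illustration
params = {
    'jsoncallback': 'jQuery111207035726936412456_1552483671572',
    'mat': '0',          # offset, same meaning as in get_data()
    'mcount': '1',
    'ct': '1001',
    'iid': '959955945',
    'aid': '333822',
    'cid': '96',
    'lid': '0',
    'ouid': '0',
}
response = requests.get(url, params=params, headers=headers)
print(response.url)   # shows the fully assembled query string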
Generating the word cloud:
# coding: utf-8
import jieba
from scipy.misc import imread  # image-reading helper (see the deprecation note at the end)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np

back_color = imread('show.jpg')  # read the mask image
STOPWORDS.add('哇卡拉')  # extend the built-in stopwords with '哇卡拉' (set.add() returns None, so call it before building WordCloud)
wc = WordCloud(background_color='white',  # background color
               max_words=1000,  # maximum number of words
               mask=back_color,  # draw the cloud in the shape of this image; when set, width and height are ignored
               max_font_size=100,  # largest font size
               stopwords=STOPWORDS,  # built-in stopwords plus the word added above
               font_path=r'.\simhei.ttf',  # fixes Chinese characters showing as boxes; fonts can be copied from C:/Windows/Fonts/
               # the corresponding ttf file must exist at this path, otherwise: OSError: cannot open resource
               random_state=42,  # random seed, keeps the layout reproducible between runs
               # width=1000,  # image width
               # height=860   # image height
               )
# Add your own words to jieba's dictionary. For example, after adding '你知道难道别人不知道',
# any occurrence of it in the text is kept as one token instead of being split into
# pieces such as '知道' or '不知道'.
jieba.add_word('你知道难道别人不知道')
# Open the source text file
text = open('tanmu.txt', encoding='utf-8').read()

# This function removes the stopwords itself, so with it you no longer need the stopwords
# argument of WordCloud: just put every word you want to filter into a stopwords text file.
def stop_words(texts):
    words_list = []
    word_generator = jieba.cut(texts, cut_all=False)  # returns a generator
    with open('stopwords.txt', encoding='utf-8') as f:
        str_text = f.read()  # the stopwords file holds one word per line
    for word in word_generator:
        if word.strip() not in str_text:
            words_list.append(word)
    return ' '.join(words_list)  # note: joined with spaces

text = stop_words(text)
wc.generate(text)
# Generate colors from the source image
image_colors = ImageColorGenerator(back_color)
# Show the image
plt.imshow(wc)
# Hide the axes
plt.axis('off')
# Draw the recolored word cloud
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
# Save the image
wc.to_file('xixixi.png')
After looking into it, I learned that from scipy.misc import imread, imwrite and friends have been deprecated.
Workaround: install the imageio library:
pip install imageio
import imageio
imageio.imread("xxxx.png")
For more on the scipy library, see:
https://www.cnblogs.com/mrchige/p/6504324.html
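Applied to the word cloud script above, the swap only touches the import and the read call. A minimal sketch follows; show.jpg, tanmu.txt, the font path and the output name are the same assumptions as before, and the jieba segmentation / stopword filtering step is omitted for brevity.

# Sketch: the word cloud rebuilt with imageio instead of the deprecated scipy.misc.imread.
import imageio
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

back_color = imageio.imread('show.jpg')  # mask image, now read via imageio
STOPWORDS.add('哇卡拉')
wc = WordCloud(background_color='white', max_words=1000, mask=back_color,
               max_font_size=100, stopwords=STOPWORDS, font_path=r'.\simhei.ttf',
               random_state=42)
wc.generate(open('tanmu.txt', encoding='utf-8').read())
wc.recolor(color_func=ImageColorGenerator(back_color))
wc.to_file('xixixi.png')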