Last time I scraped the danmaku (bullet-screen comments) from the 《双城之战》 (Arcane) videos and the results were underwhelming, so this time the target is the comment data on Douban.
import requests
import parsel

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

# Douban pages comments with start=0, 20, 40, ...; fetch and parse each page inside the loop
for page in range(0, 220, 20):
    url = f'https://movie.douban.com/subject/34867871/comments?start={page}&limit=20&status=P&sort=new_score'
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)
    # every short comment sits in a <span class="short"> element
    content_list = selector.css('.short::text').getall()
    for content in content_list:
        with open('双城之战.txt', mode='a', encoding='utf-8') as f:
            f.write(content)
            f.write('\n')
        print(content)
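One caveat worth flagging: Douban only exposes a limited number of comments to anonymous visitors and may start returning a login page or an error once it notices rapid paging. The original script doesn't handle this, but a status-code check plus a short randomized pause between pages is cheap insurance. Below is a minimal sketch of the same loop with those two additions; the one-to-two second delay is an assumption, and cookie/login handling is left out.

import time
import random

import requests
import parsel

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

for page in range(0, 220, 20):
    url = f'https://movie.douban.com/subject/34867871/comments?start={page}&limit=20&status=P&sort=new_score'
    response = requests.get(url=url, headers=headers)
    # stop early if Douban starts refusing anonymous requests (rate limit / login wall)
    if response.status_code != 200:
        print(f'got HTTP {response.status_code} at start={page}, stopping')
        break
    for content in parsel.Selector(response.text).css('.short::text').getall():
        with open('双城之战.txt', mode='a', encoding='utf-8') as f:
            f.write(content + '\n')
    # assumed polite delay; lengthen it if pages start coming back empty
    time.sleep(random.uniform(1, 2))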
import jieba
import wordcloud
import imageio

# mask image: the word cloud fills the non-white area of this picture
py = imageio.imread(r"C:\Users\Administrator\Desktop\123.png")

# read the scraped comments and cut them into words with jieba
with open(r'C:\Users\Administrator\Desktop\双城之战.txt', encoding='utf-8') as f:
    txt = f.read()
txt_list = jieba.lcut(txt)
string = ' '.join(txt_list)

wc = wordcloud.WordCloud(
    width=1000,                # image width
    height=700,                # image height
    background_color='white',  # background color
    font_path='STKAITI.TTF',   # a font that can render Chinese characters
    mask=py,                   # mask image that shapes the cloud
    scale=5,                   # render at a higher resolution for a sharper image
)
wc.generate(string)
wc.to_file(r'C:\Users\Administrator\Desktop\out.png')
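The raw jieba output keeps every token, including single characters and filler words, which tend to crowd out the interesting words in the cloud. An optional refinement (my own addition, reusing txt_list and wc from the script above, with an assumed hand-picked stop-word list) is to filter the tokens before generating, and write the result to a separate file so it doesn't overwrite out.png:

# drop single-character tokens and a few assumed filler words before generating
# (stop_words is a hand-picked set for illustration, not part of the original post)
stop_words = {'没有', '一个', '就是', '还是', '什么', '这个'}
filtered = [w for w in txt_list if len(w) > 1 and w not in stop_words]
wc.generate(' '.join(filtered))
wc.to_file(r'C:\Users\Administrator\Desktop\out_filtered.png')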