Python 爬取《复仇者联盟3》豆瓣影评

from urllib import request
from bs4 import BeautifulSoup as bs

# Download 11 pages of comments (20 per page, selected through the `start`
# query parameter) and append the raw <p> elements found inside the
# <div id="comments"> container to movie_comments.txt.
#
# The output file is opened once for the whole run instead of once per
# comment, and both the file and each HTTP response are closed
# deterministically by their `with` blocks.
with open('movie_comments.txt', 'a', encoding='utf-8') as f:
    for i in range(0, 11):
        url = ('https://movie.douban.com/subject/24773958/comments?start='
               + str(20 * i)
               + '&limit=20&sort=new_score&status=P&percent_type=')
        with request.urlopen(url) as res:
            html_data = res.read().decode('utf-8')

        soup = bs(html_data, 'html.parser')
        comments = soup.find_all('div', id='comments')
        # Take at most the first 20 paragraphs.  Slicing (rather than
        # indexing 0..19) avoids an IndexError on a page that holds fewer
        # than 20 <p> elements.
        comments_content = comments[0].find_all('p')[:20]
        for paragraph in comments_content:
            # The element is written with its HTML markup intact.
            f.write(str(paragraph))

将爬取的内容保存为 txt 文件,然后使用结巴(jieba)分词包对评论进行词频统计:

import re
import jieba
import pandas as pd
import numpy as np
from scipy.misc import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud  # 词云包
import matplotlib


# Build a word-frequency table from movie_comments.txt and render the most
# frequent words as a word cloud.

# Read the whole file up front; the `with` block closes it immediately
# instead of keeping it open until the plot window is dismissed.
with open('movie_comments.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Keep only runs of characters in the range U+4E00-U+9FA5; everything else
# (markup, punctuation, digits, Latin text) is discarded.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = pattern.findall(content)
cleaned_comments = ''.join(filterdata)

# Segment the cleaned text into words with jieba.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Drop stop words.  quoting=3 is csv.QUOTE_NONE: quote characters in the
# stop-word file are treated as ordinary text.
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='gbk')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Count occurrences of each word.  groupby().size() followed by
# reset_index(name=...) yields the columns ['segment', '计数'] without
# relying on the dict-renaming form of .agg(), which newer pandas versions
# reject with SpecificationError.
words_stat = words_df.groupby(by=['segment']).size().reset_index(name="计数")
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
print(words_stat.head())
print(words_stat)

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# Set the font file, background colour and maximum font size.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)

# Map each of the 1000 most frequent words to its count.
word_frequence = {word: count for word, count in words_stat.head(1000).values}

wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()

最终结果如图所示

python爬取复仇者联盟3豆瓣影评_第1张图片

你可能感兴趣的:(python爬取复仇者联盟3豆瓣影评)