'''
抓取豆瓣电影某部电影的评论
这里以《我不是潘金莲》为例
网址链接:https://movie.douban.com/subject/26630781/comments
为了抓取全部评论需要先进行登录
'''
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from scipy.misc import imread
from os import path
def get_douban_comments(url):
    """Log in to Douban with Selenium, then scrape up to 50 pages of
    comments from *url* and append them to 'pjl_comment.txt' (UTF-8).

    The login captcha is entered manually by the user at the prompt.

    url: the movie-comments page, e.g.
         https://movie.douban.com/subject/26630781/comments
    """
    comments_list = []  # collected comment texts, one trailing-newline entry each
    login_url = 'https://accounts.douban.com/login?source=movie'
    user_name = 'aaa'   # NOTE(review): hard-coded credentials — move to config/env
    password = 'aaa'
    driver = webdriver.Firefox()
    try:
        # --- manual-assisted login: user types the captcha shown in the browser ---
        driver.get(login_url)
        driver.find_element_by_id('email').clear()
        driver.find_element_by_id('email').send_keys(user_name)
        driver.find_element_by_id('password').clear()
        driver.find_element_by_id('password').send_keys(password)
        captcha_field = input('请打开浏览器输入验证码:')
        driver.find_element_by_id('captcha_field').send_keys(captcha_field)
        driver.find_element_by_class_name('btn-submit').click()
        time.sleep(5)  # allow the login redirect to complete

        driver.get(url)
        driver.implicitly_wait(3)
        page_no = 1
        count = 0
        pages_left = 50  # hard page cap; stop even if more pages exist
        while True:
            try:
                for result in driver.find_elements_by_class_name('comment'):
                    comment = result.find_element_by_tag_name('p').text
                    comments_list.append(comment + u'\n')
                    count += 1  # increment before printing so numbering is 1-based
                    print(u"查找到第%d个评论" % count)
                driver.find_element_by_class_name('next').click()
                print(u'第%d页查找完毕!' % page_no)
                page_no += 1
                time.sleep(4)  # throttle between pages to avoid anti-scraping blocks
                pages_left -= 1
                print(pages_left)
                if pages_left == 0:
                    break
            except Exception as e:
                # Typically raised when the 'next' link is absent on the last page.
                print(e)
                break
    finally:
        driver.quit()  # always release the browser, even on unexpected errors
    with codecs.open('pjl_comment.txt', 'a', encoding='utf-8') as f:
        f.writelines(comments_list)
    print(u"查找到第%d页,第%d个评论!" % (page_no, count))
def get_all_keywords(file_name):
    """Segment every line of *file_name* with jieba and report how many
    times each distinct token occurs.

    Prints '共有N个关键词', then one '<k>---<word>:<count>次' line per
    token, and writes the same counts to 'count_word.txt' (UTF-8).
    """
    from collections import Counter  # stdlib; imported locally, used only here

    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Flatten the per-line segmentations into a single token stream.
    tokens = [word for line in lines for word in jieba.cut(line)]
    # Counter counts in one O(n) pass instead of the original
    # list.count() call per distinct word (accidental O(n^2)).
    counts = Counter(tokens)
    print(u"共有%d个关键词" % len(counts))
    sort_count = []
    for k, (word, num) in enumerate(counts.items(), start=1):
        sort_count.append(word + u':' + str(num) + u"次\n")
        print(u"%d---" % k + word + u":" + str(num) + u"次")
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
def get_top_keywords(file_name):
    """Print the top-20 TextRank keywords of *file_name* with their weights.

    Uses jieba.analyse.textrank; with withFlag=True each result is a
    (pair(word, flag), weight) tuple, so the inner loop prints the word
    and its part-of-speech flag before the weight.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()
    keywords = analyse.textrank(
        texts, topK=20, withWeight=True, withFlag=True)
    for rank, (pair_item, weight) in enumerate(keywords, start=1):
        print(u"%d:" % rank, end=' ')
        # pair_item iterates as (word, flag) because withFlag=True above
        for part in pair_item:
            print(part, u" ", end=' ')
        print(u"权重:" + str(weight))
def draw_wordcloud():
    """Build a word cloud from 'pjl_comment.txt', shaped by the mask image
    'alice_color.png', and save it as 'pjl_cloud.jpg'.
    """
    with codecs.open('pjl_comment.txt', encoding='utf-8') as f:
        comment_text = f.read()
    # WordCloud expects space-separated tokens, so CJK text must be
    # segmented by jieba first.
    cut_text = " ".join(jieba.cut(comment_text))
    d = path.dirname(__file__)
    # NOTE(review): scipy.misc.imread was removed in SciPy 1.2; switch to
    # imageio.imread (or matplotlib.pyplot.imread) when upgrading SciPy.
    color_mask = imread("alice_color.png")  # mask array defines the cloud shape
    cloud = WordCloud(font_path=path.join(d, 'simsun.ttc'),  # CJK-capable font
                      background_color='white',
                      mask=color_mask,
                      max_words=2000,
                      max_font_size=40)
    cloud.generate(cut_text).to_file("pjl_cloud.jpg")
# Script entry point: render the word cloud from previously saved comments.
if __name__ == "__main__":
    draw_wordcloud()