爬取《进击的巨人第三季part2》的豆瓣网友评论,制作词云图

莴苣第三季part2实在是太棒了!!!制作和配音都绝了,大家快看起来啊!!尤其《白夜》那一集真的是神回!

先放个结果

[图片:《进击的巨人第三季part2》豆瓣网友评论词云图(第1张)]

自动翻页和下载评论部分的代码:

def douban_page_generator(i):
    """Build the comment-page URLs for Douban subject 30353357.

    One URL is produced for each ``start`` offset 20, 40, 60, ... (step 20)
    that is smaller than ``i + 20``. The ``start=0`` page is not part of the
    result.

    Args:
        i: Integer upper bound used to derive the ``start`` offsets.

    Returns:
        A list of URL strings in increasing offset order (empty when no
        offset qualifies).
    """
    prefix = 'https://movie.douban.com/subject/30353357/comments?start='
    suffix = '&limit=20&sort=new_score&status=P'
    return [prefix + str(offset) + suffix for offset in range(20, i + 20, 20)]

# After opening a new page, download the comments shown on it.
def get_page_reviews(link):
    """Open ``link`` in Chrome and collect 20 comments with their vote counts.

    A new Chrome browser is started for every call and is always shut down
    before returning, even if loading the page or locating an element fails.

    NOTE(review): ``webdriver`` is expected to be in scope already
    (presumably ``from selenium import webdriver``) -- confirm against the
    rest of the file.

    Args:
        link: URL of the comment page to load.

    Returns:
        A dict with two parallel lists of strings:
        ``'reviews'`` (comment texts) and ``'votes'`` (vote-count texts).

    Raises:
        Whatever the driver raises when an element lookup fails; presumably
        this happens on pages with fewer than 20 comments -- TODO confirm.
    """
    # Absolute XPaths into the comment list; ``{}`` is the 1-based comment index.
    review_xpath = '/html/body/div[3]/div[1]/div/div[1]/div[4]/div[{}]/div[2]/p/span'
    vote_xpath = '/html/body/div[3]/div[1]/div/div[1]/div[4]/div[{}]/div[2]/h3/span[1]/span'

    reviews = []
    votes = []
    browser = webdriver.Chrome(
        executable_path='/Users/wang/mmodule/lib/python3.6/site-packages/selenium/webdriver/chrome/chromedriver')
    try:
        browser.get(link)
        for i in range(1, 21):
            review = browser.find_element_by_xpath(review_xpath.format(str(i))).text
            vote = browser.find_element_by_xpath(vote_xpath.format(str(i))).text
            reviews.append(review)
            votes.append(vote)
    finally:
        # Without this, every call leaves a Chrome/chromedriver process running.
        browser.quit()

    return {
        'reviews': reviews,
        'votes': votes,
    }

制作词云:

#制作词云


import matplotlib.pyplot as plt
import jieba  #中文词云
from wordcloud import WordCloud
import sys
import xlrd



def return_final_dic():
    """Build a word -> weight mapping from the scraped comment spreadsheet.

    Reads the first sheet of the workbook, skipping row 0. For every other
    row, column 1 is taken as the comment text and column 2 as its vote
    count. Each comment is segmented with jieba (precise mode); tokens that
    are stop words or shorter than two characters are dropped. Every
    remaining token's weight is increased by the vote count of the comment
    it came from, so a word appearing in several comments accumulates the
    votes of all of them.

    Progress (row count, each comment, each kept token) is printed to
    stdout.

    NOTE(review): xlrd 2.0 and later can no longer open ``.xlsx`` files, so
    this presumably needs xlrd < 2.0 -- confirm the installed version.

    Returns:
        A dict mapping token (str) to accumulated vote weight (float).
    """
    # The file names must be given as absolute paths.
    data = xlrd.open_workbook(r'/Users/wang/Desktop/attack on titan.xlsx')
    table = data.sheet_by_index(0)

    # Number of rows reported by the sheet.
    nrows = table.nrows
    print('一共有 '+str(nrows)+' 个非空行')

    # Read-only access is sufficient; a set gives O(1) membership tests, and
    # splitlines() also copes with CRLF line endings.
    with open(r'/Users/wang/Desktop/stopwords.txt', 'r', encoding='utf-8') as f:
        stop_word = set(f.read().splitlines())
    print('成功获取停用词')

    # Process each comment, from the second row through the last row.
    final_dic = {}
    for row in range(1, nrows):
        sent = table.cell_value(rowx=row, colx=1)
        votes = table.cell_value(rowx=row, colx=2)
        print(sent + ':  ' + str(votes))

        # jieba.cut returns a generator of tokens.
        seg_list = jieba.cut(sent, cut_all=False)

        for key in seg_list:
            if key not in stop_word and len(key) >= 2:
                print(key)
                word = str(key)
                # Accumulate rather than overwrite: otherwise a word's weight
                # would just be the votes of the last comment containing it.
                final_dic[word] = final_dic.get(word, 0.0) + float(votes)
        print()

    return final_dic

def create_cloud(dic):
    """Render a word cloud from ``dic``, display it, then save it to disk.

    The cloud is drawn with the Noto Sans SC Light font on a white
    background with a maximum font size of 80. It is shown with matplotlib
    (axes hidden) and afterwards written to a JPEG on the desktop.

    Args:
        dic: Mapping of word to frequency/weight, passed to
            ``WordCloud.generate_from_frequencies``.
    """
    font_file = r'/Users/wang/Downloads/Noto_Sans_SC/NotoSansSC-Light.otf'
    output_file = r'/Users/wang/Desktop/aot word cloud.jpg'

    word_cloud = WordCloud(
        font_path=font_file,
        background_color='white',
        max_font_size=80,
    )
    word_cloud.generate_from_frequencies(dic)

    # Show the image without axis ticks or frame.
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

    word_cloud.to_file(output_file)



# Script entry: build the word -> weight table, echo it to stdout, then
# render, display and save the word cloud from it.
final = return_final_dic()
print(final)
create_cloud(final)

 

你可能感兴趣的:(Python)