爬取京东华为 Mate 30 Pro 5G 的评论数据并制作词云

爬取京东评论数据采用selenium库
爬取代码如下:

# -*- coding: utf-8 -*-

from selenium import webdriver
import json
import time


def write_to_file(content, filepath):
    """Append ``content`` to ``filepath`` as a single JSON-encoded line.

    The file is opened in append mode with UTF-8 encoding. Non-ASCII
    characters are written verbatim instead of being escaped, and every
    record is terminated by a newline.
    """
    with open(filepath, mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(f'{serialized}\n')


def search_key(url, pages=100, filepath='mate30pro.txt'):
    """Scrape user comments from a JD product page and append them to a file.

    Opens ``url`` in Chrome, clicks the product-review tab, then for each
    page collects the text of every ``<p class="comment-con">`` element,
    writes it with ``write_to_file`` and clicks the next-page link.

    Args:
        url: Address of the product page to open.
        pages: Maximum number of comment pages to visit (default 100, the
            value that used to be hard-coded).
        filepath: File the comments are appended to (default
            ``'mate30pro.txt'``, the value that used to be hard-coded).

    Any exception (for example a missing next-page link once the comments
    run out) ends the crawl: it is printed rather than re-raised, and the
    browser is always closed.
    """
    # Imported here so this function stays self-contained. The
    # ``find_element(s)_by_xpath`` helpers were removed in Selenium 4.3;
    # ``find_element(By.XPATH, ...)`` works on both Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Switch to the product-review tab.
        comment_tab = browser.find_element(
            By.XPATH,
            "//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
        comment_tab.click()
        time.sleep(5)

        for n in range(pages):
            # Scroll down a few times, pausing between scrolls; presumably
            # this gives lazily-loaded comments time to render.
            for _ in range(3):
                browser.execute_script('window.scrollTo(0,10000)')
                time.sleep(1)

            comments = browser.find_elements(
                By.XPATH, "//p[@class='comment-con']")
            for comment in comments:
                write_to_file(comment.text, filepath)
            print('完成第', n+1, '页')
            time.sleep(2)

            # NOTE(review): this positional XPath for the next-page link is
            # tied to the current page layout — confirm it still matches.
            next_button = browser.find_element(
                By.XPATH, '//*[@id="comment-0"]/div[13]/div/div/a[8]')
            time.sleep(5)
            # Click through JavaScript instead of WebElement.click().
            browser.execute_script("arguments[0].click();", next_button)

    # Top-level boundary of the script: report the error and fall through
    # to the cleanup below.
    except Exception as e:
        print('exception', e)
    finally:
        browser.quit()
        print("browser quit")


# Script entry point: run the scraper against the JD product page below.
if __name__ == '__main__':
    search_key('https://item.jd.com/100009177376.html')

制作词云采用 wordcloud 库和中文分词库 jieba

# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
import PIL.Image as Image
import numpy as np
import jieba


def show():
    """Render a word cloud from the scraped comments, save it and display it.

    Reads the comments from ``mate30pro.txt`` and the stop words (one per
    line) from ``../images/ax.txt``, segments the text with jieba, drops
    the stop words, and draws the remaining words inside the shape of
    ``../images/10002.png`` using that image's colours. The figure is saved
    to ``../images/10010.png`` and then shown.
    """
    # Use context managers so both text files are closed promptly.
    with open('mate30pro.txt', encoding="utf8") as comments_file:
        text = comments_file.read()
    with open('../images/ax.txt', 'r', encoding="utf8") as stop_file:
        stop_words = {line.strip() for line in stop_file if line.strip()}

    # Filter stop words out of the segmented tokens. jieba.del_word() only
    # removes a word from jieba's dictionary (changing how text is split);
    # it does not remove the word from the output.
    tokens = (word for word in jieba.cut(text, cut_all=False)
              if word.strip() and word not in stop_words)
    # WordCloud expects whitespace-separated words.
    items_keys = " ".join(tokens)

    # The mask image supplies both the cloud's shape and its colours.
    with Image.open("../images/10002.png") as mask_image:
        coloring = np.array(mask_image)

    my_word_cloud = WordCloud(background_color="white",     # white background
                              font_path="../images/f001.ttf",       # font file
                              max_words=400,                # maximum number of words
                              max_font_size=150,            # largest font size
                              min_font_size=2,              # smallest font size
                              mask=coloring,
                              scale=1).generate(items_keys)
    image_colors = ImageColorGenerator(coloring)
    # recolor() updates the cloud in place and returns the same object, so
    # a single imshow() call is enough.
    my_word_cloud.recolor(color_func=image_colors)
    plt.imshow(my_word_cloud)
    plt.savefig('../images/10010.png')                      # save the figure
    plt.show()


# Script entry point: build and display the word cloud.
if __name__ == '__main__':
    show()

结果:
爬取京东华为 Mate 30 Pro 5G 的评论数据并制作词云_第1张图片

你可能感兴趣的:(Python,#爬虫)