# 爬取京东商品评论数据,采用 selenium 库。
# 爬取代码如下:
# -*- coding: utf-8 -*-
from selenium import webdriver
import json
import time
# 数据写入文本
def write_to_file(content, filepath):
    """Append ``content`` to ``filepath`` as one JSON-encoded line.

    Args:
        content: Any JSON-serialisable value.
        filepath: Path of the output text file; it is created when missing.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    accumulate one record per line. ``ensure_ascii=False`` keeps non-ASCII
    text (e.g. Chinese) human-readable instead of ASCII escape sequences.
    """
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# 根据search_key关键字爬取数据
def search_key(url, pages=100, filepath='mate30pro.txt'):
    """Scrape user comments from a product page and append them to a file.

    Opens ``url`` in Chrome, clicks the product-review tab, and then, for up
    to ``pages`` review pages, scrolls the window, writes the text of every
    ``<p class="comment-con">`` element to ``filepath`` through
    ``write_to_file``, and clicks the pager link to move on.

    Args:
        url: Address of the product detail page to open.
        pages: Maximum number of review pages to visit (default 100).
        filepath: File that receives the comments, one JSON string per line
            (default ``'mate30pro.txt'``).

    Any ``Exception`` raised while scraping (for example when the pager link
    can no longer be found) is printed instead of propagated, and the browser
    is always closed before the function returns.
    """
    # ``find_element_by_xpath`` / ``find_elements_by_xpath`` were removed in
    # Selenium 4.3; ``find_element(By.XPATH, ...)`` works on old and new
    # releases alike. Imported locally so this block stands on its own.
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Open the product-review tab.
        review_tab = browser.find_element(
            By.XPATH,
            "//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
        review_tab.click()
        time.sleep(5)

        for page in range(pages):
            # Scroll far down three times, pausing in between; presumably
            # this gives the comment list time to render -- TODO confirm.
            for _ in range(3):
                browser.execute_script('window.scrollTo(0,10000)')
                time.sleep(1)

            comments = browser.find_elements(
                By.XPATH, "//p[@class='comment-con']")
            for comment in comments:
                write_to_file(comment.text, filepath)
            print('完成第', page + 1, '页')
            time.sleep(2)

            # NOTE(review): positional XPath for the pager link (presumably
            # "next page"); brittle if the page layout changes -- verify.
            next_button = browser.find_element(
                By.XPATH, '//*[@id="comment-0"]/div[13]/div/div/a[8]')
            time.sleep(5)
            # Click through JavaScript rather than WebElement.click().
            browser.execute_script("arguments[0].click();", next_button)
    except Exception as e:
        # Best-effort scraper: report the problem and fall through to cleanup.
        print('exception', e)
    finally:
        browser.quit()
        # Make the shutdown visible on the console.
        print("browser quit")
if __name__ == '__main__':
    # Script entry point: scrape the comments of this product page.
    search_key('https://item.jd.com/100009177376.html')
# 制作词云采用 wordcloud 库和中文分词库 jieba,代码如下:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
import PIL.Image as Image
import numpy as np
import jieba
def show(text_path='mate30pro.txt', stop_path='../images/ax.txt',
         mask_path='../images/10002.png', font_path='../images/f001.ttf',
         output_path='../images/10010.png'):
    """Render a word cloud from the scraped comments, save it and display it.

    The comment text is segmented with jieba, stop words are dropped, and the
    remaining words are drawn inside the shape of the mask image using the
    mask's own colours.

    Args:
        text_path: UTF-8 text file holding the comments.
        stop_path: UTF-8 file with one stop word per line.
        mask_path: Image used both as the drawing mask and colour source.
        font_path: Font file passed to ``WordCloud`` for rendering the words.
        output_path: Where the rendered figure is saved.

    All defaults reproduce the paths that were previously hard-coded.
    """
    with open(text_path, encoding="utf8") as f:
        text = f.read()
    with open(stop_path, 'r', encoding="utf8") as f:
        # Blank lines would otherwise yield an empty "stop word".
        stop_words = {line.strip() for line in f} - {''}

    # ``jieba.del_word`` only edits the segmentation dictionary; it does not
    # remove words from the output. Filter the tokens to really drop them.
    tokens = (word for word in jieba.cut(text, cut_all=False)
              if word not in stop_words)
    items_keys = " ".join(tokens)

    with Image.open(mask_path) as mask_image:
        coloring = np.array(mask_image)

    # No width/height given: when a mask is supplied its shape is used.
    my_word_cloud = WordCloud(
        background_color="white",
        font_path=font_path,
        max_words=400,
        max_font_size=150,
        min_font_size=2,
        mask=coloring,
        scale=1,
    ).generate(items_keys)

    # ``recolor`` updates the cloud in place and returns the same object, so
    # a single ``imshow`` call is enough.
    image_colors = ImageColorGenerator(coloring)
    my_word_cloud.recolor(color_func=image_colors)

    plt.imshow(my_word_cloud)
    plt.savefig(output_path)
    plt.show()
if __name__ == '__main__':
    # Script entry point: build and display the word cloud.
    show()