完成目标:
获取评论并制作词云
毕竟会在烟台待上三年,先了解了解烟台这个地方。
编辑器:pycharm
用到的库:requests、wordcloud、jieba
爬取去哪儿网关于蓬莱岛的游客评论的前20页
def save_comment():
    """Download visitor comments from Qunar and save them to ``comment.txt``.

    Requests pages 1 through 20 (10 comments per page) of the comment-list
    JSON endpoint for sight 3827 and writes the ``content`` field of every
    comment to ``comment.txt`` (UTF-8), one comment per write followed by a
    newline. The file is overwritten on each call.

    A page that cannot be fetched or parsed is reported on stdout and
    skipped, so the remaining pages are still collected.
    """
    url = 'https://piao.qunar.com/ticket/detailLight/sightCommentList.json'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38',
        "accept": "application/json, text/javascript, */*; q=0.01",
    }
    # The context manager closes the file even if a page raises unexpectedly.
    with open("comment.txt", mode="w", encoding="utf-8") as fp:
        # range(1, 21) covers pages 1..20; the previous range(1, 20)
        # stopped at page 19 and silently dropped the last page.
        for num in range(1, 21):
            params = {
                'sightId': 3827,
                'index': num,
                'page': num,
                'pageSize': 10,
                'tagType': 0,
            }
            try:
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(
                    url=url, params=params, headers=headers, timeout=10
                )
                response.raise_for_status()
                comment_list = response.json()['data']['commentList'] or []
            except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
                # Best effort: report the failed page and move on.
                print(f"page {num} skipped: {exc!r}")
                continue
            for comment in comment_list:
                content = comment.get('content') if isinstance(comment, dict) else None
                if isinstance(content, str):
                    fp.write(content + "\n")
读取评论并制作词云
def stopwordslist():  # stop-word list
    """Load the stop words from ``Chinesestopword.txt``.

    The file is read as UTF-8 from the current working directory, one stop
    word per line.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace. Blank lines are kept as empty strings.
    """
    # The context manager closes the file; the original left it open.
    with open('Chinesestopword.txt', encoding='UTF-8') as fp:
        return [line.strip() for line in fp]
def get_wcd():
    """Build a word cloud image from the comments saved in ``comment.txt``.

    Reads ``comment.txt`` (UTF-8), segments the text with ``jieba.lcut``,
    drops whitespace-only tokens and every token listed by
    ``stopwordslist()``, joins the remaining tokens with single spaces and
    passes the result to ``wordcloud.WordCloud(...).generate``. The image is
    written to ``comment.jpg``.
    """
    with open("comment.txt", "r", encoding="utf-8") as fp:
        data = fp.read()

    # A set gives constant-time membership tests for the filter below.
    stopwords = set(stopwordslist())

    # Filter the jieba tokens themselves. The original joined the tokens
    # into one string first and then looped over that string, so the
    # stop-word check ran on single characters instead of words.
    words = [
        word
        for word in jieba.lcut(data)
        if word.strip() and word not in stopwords
    ]
    outstr = " ".join(words)

    wcd = wordcloud.WordCloud(
        font_path="simkai.ttf",
        colormap="brg",
        width=800,
        height=400,
        max_words=200,
        background_color="white",
        scale=16,
    ).generate(outstr)
    wcd.to_file('comment.jpg')
还是不错滴,有机会去~~~