思路:
待续
效果:
代码:
# -*- coding: utf-8 -*-
# @Time : 2019/9/13 20:45
# @Author : Zhao HL
# @File : jackMa.py
'''
cid获取方法:
heartbeat-->Form Data-->cid
'''
import requests
import xml.sax
import xml.sax.handler
import pprint
import xml.etree.ElementTree as Etree
import jieba
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
xml_url = 'http://comment.bilibili.com/116788897.xml'
xml_savePath = 'jackMa.txt'
# pic_path= 'jackMa.jpg'
pic_path= 'jackMa.png'
def getHTMLText(url):
try:
r = requests.get(url, timeout = 30)
r.raise_for_status()
r.encoding = r.apparent_encoding
#print(r.status_code)
return r.text
except:
return "产生异常"
def xmlParse(xml):
root = Etree.fromstring(xml)
barrages = root.findall('d')
with open('jackMa.txt', "w",encoding='utf-8') as f:
for barrage in barrages:
f.write(barrage.text+'\n')
def get_cloud():
f = open(xml_savePath, 'r', encoding='UTF-8').read()
cut_text = " ".join(jieba.cut(f))
# print(cut_text)
alice_mask = np.array(Image.open(pic_path))
stopwords = set(STOPWORDS)
stopwords.add("said")
wordcloud = WordCloud(
# 设置字体,不然会出现口字乱码,文字的路径是电脑的字体一般路径,可以换成别的
font_path="C:/Windows/Fonts/simfang.ttf",
# 设置了背景,宽高
background_color="white", width=1000, height=880,mask=alice_mask,stopwords=stopwords).generate(cut_text)
wordcloud.generate(cut_text)
wordcloud.to_file(r"wordcloud.png")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# txt_xml = getHTMLText(xml_url)
# print(txt_xml)
# xmlParse(txt_xml)
get_cloud()