from collections import Counter

import jieba
import matplotlib.pyplot as plt
from jieba.analyse import extract_tags
from scipy.misc import imread
from wordcloud import WordCloud
def generater(**kwargs):
    """Build a word cloud from a text file, display it, and save it as a JPEG.

    Keywords are selected with jieba's TF-IDF based ``extract_tags``; each
    keyword is then weighted by the number of times it occurs in the
    full-mode segmentation of the same text.

    All options are required keyword arguments; a missing one raises
    ``KeyError``.

    Keyword Args:
        content_name: Base name of the input text. ``./<content_name>.txt``
            is read as UTF-8 and the result is written to
            ``./<content_name>_word_cloud.jpg``.
        top_k: Number of keywords to extract; also passed to ``WordCloud``
            as ``max_words``.
        bg_name: Base name of the mask image, loaded from ``./<bg_name>.png``.
        color: Background colour passed to ``WordCloud``.
        font_type: Base name of the font file, ``./<font_type>.ttf``.
    """
    content_name = kwargs['content_name']
    content_path = './{}.txt'.format(content_name)
    top_k = kwargs['top_k']
    bg_name = kwargs['bg_name']
    color = kwargs['color']
    font_type = kwargs['font_type']

    with open(content_path, mode='r', encoding='utf-8') as f:
        content = f.read()

    # Extract the top_k keywords by TF-IDF.
    tags = extract_tags(sentence=content, topK=top_k)

    # Keyword frequencies: tally every token of the full-mode segmentation in
    # one pass, then look each keyword up. This avoids one full list scan per
    # keyword; Counter returns 0 for a keyword that never appears as a token.
    token_counts = Counter(jieba.cut(content, cut_all=True))
    words_freq = {tag: token_counts[tag] for tag in tags}

    # Load the mask image; per the original author's note, imread() returns a
    # numpy.ndarray.
    # NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed
    # in 1.2, so this call (and the file-level import) needs an older SciPy
    # or a replacement image loader -- confirm the target environment.
    bg_path = './{}.png'.format(bg_name)
    bg_img = imread(bg_path)

    font_path = './{}.ttf'.format(font_type)
    word_cloud = WordCloud(
        font_path=font_path,       # font file used to render the words
        background_color=color,    # background colour
        max_words=top_k,           # maximum number of words shown
        max_font_size=100,         # largest font size
        mask=bg_img,               # mask / background image
    )
    word_cloud.generate_from_frequencies(words_freq)

    plt.imshow(word_cloud)
    plt.axis('off')  # hide the axes
    plt.show()

    # Save the rendered image next to the input text.
    word_cloud_img = './{}_word_cloud.jpg'.format(content_name)
    word_cloud.to_file(word_cloud_img)
if __name__ == '__main__':
    # Demo run: every option is collected in one mapping and unpacked into
    # generater's keyword arguments.
    options = {
        'content_name': '习近平:在庆祝中国共产党成立95周年大会上的讲话',
        'top_k': 66,
        'bg_name': 'china',  # generater appends the '.png' extension
        'color': 'black',
        'font_type': 'wryh',
    }
    generater(**options)
链接: https://pan.baidu.com/s/1bp8dPa3 密码: ujxc
Windows 下安装 wordcloud：
从下载地址获取与本机 Python 版本对应的 whl 文件，然后执行：
pip install wordcloud-1.3.1-cp36-cp36m-win_amd64.whl