# 三个步骤:分割——统计词频——绘制词云 (three steps: segment the text -> count word frequencies -> draw the word cloud)
# coding: utf-8
'''
词云 (word cloud)
'''
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
# Segment the corpus and count word frequencies
def wordcut(filename, tofilename, stopwords_path="stopwords.txt"):
    """Segment a text file with jieba and write a word-frequency table as CSV.

    Every line of ``filename`` is tokenized with ``jieba.cut``. Empty tokens
    and tokens present in the stop-word list are discarded. Of the remaining
    tokens, those longer than one character are counted, and the counts are
    written to ``tofilename`` with the columns ``name`` (the word) and ``val``
    (its count), next to pandas' default index column.

    The first rows of the per-line token lists are printed as a quick check.

    Args:
        filename: Path of the text file to segment; read as UTF-8 and a
            leading BOM is tolerated.
        tofilename: Path of the CSV frequency table; overwritten and encoded
            as UTF-8 with BOM.
        stopwords_path: Path of the UTF-8 stop-word list, one word per line.
    """
    # Iterating the file keeps a final line that has no trailing newline
    # (slicing off the last element of ``split('\n')`` silently lost it), and
    # the ``with`` block guarantees the handle is closed.
    with open(filename, encoding="utf-8-sig") as corpus:
        lines = [line.rstrip("\n") for line in corpus]

    # A set gives O(1) membership tests for the stop-word filter.
    with open(stopwords_path, encoding="utf-8") as stop_file:
        stopwords = {line.rstrip() for line in stop_file}

    # One token list per input line, with stop words and empty tokens removed.
    s_content = [
        [w for w in jieba.cut(line) if w and w not in stopwords]
        for line in lines
    ]
    s_contents = pd.DataFrame({'s.clean': s_content})
    print(s_contents.head())

    # Count word frequencies; single-character tokens are ignored.
    wc_dict = Counter(
        word for tokens in s_content for word in tokens if len(word) > 1
    )

    # Write the frequency table: columns ``name`` and ``val``.
    df = pd.DataFrame(list(wc_dict.items()), columns=['name', 'val'])
    df.to_csv(tofilename, mode='w', encoding='utf_8_sig')
# Draw the word cloud from the frequency table
def draw_cloud(read_name, image_path='beijing.jpg', font_path='simkai.ttf',
               out_path='wc-3.png'):
    """Render a word cloud from a word-frequency CSV, save it, and display it.

    Args:
        read_name: Path of a UTF-8 CSV with a ``name`` column (words) and a
            ``val`` column (frequencies).
        image_path: Image whose pixel array is passed to ``WordCloud`` as the
            mask.
        font_path: Font file used to draw the words.
        out_path: File the rendered word cloud is written to.
    """
    # Copy the pixels into an array inside a context manager so the image
    # file is released instead of staying open for the rest of the call.
    with Image.open(image_path) as image:
        graph = np.array(image)

    # Arguments: font, background color, maximum number of words, the image
    # array used as mask, and horizontal-only text.
    wc = WordCloud(font_path=font_path, background_color="white",
                   max_words=100, mask=graph, prefer_horizontal=1)

    fp = pd.read_csv(read_name, encoding='utf-8')  # load the frequency table
    # pandas may parse a word as a number or NaN, so force every key to str.
    dic = dict(zip(map(str, fp['name']), fp['val']))
    wc.generate_from_frequencies(dic)
    # NOTE(review): the original built ImageColorGenerator(graph) here but
    # never applied it, so the dead call was dropped. If image-based colors
    # were intended, apply it with wc.recolor(color_func=...) — confirm.

    # Save before showing: plt.show() may block until the window is closed.
    wc.to_file(out_path)

    plt.imshow(wc)
    plt.axis("off")  # hide the axes
    plt.show()
if __name__ == '__main__':
    # Source text for the word cloud, and the intermediate frequency table.
    corpus_file, freq_file = 'teacher_chat.csv', 'wcdict.csv'
    wordcut(corpus_file, freq_file)
    draw_cloud(freq_file)