Making a Word Cloud from Chinese Text

'''
Build a word cloud from Chinese text. The background can be replaced with any
image used as a mask; no mask image is shown in this example.
'''
import numpy as np
import pandas as pd
from wordcloud import WordCloud    # word cloud package
import jieba                       # Chinese word segmentation package
import codecs                      # codecs.open can read a file with an explicit encoding, decoding to Unicode (not used below)
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.5)          # size of the rendered figure


'''
Load the data
'''
df = pd.read_csv('H:/NLP_project/NLP_project/data/entertainment_news.csv')

'''
Preprocess the data
'''
# Segment the text
df = df.dropna()
content = df.content.values.tolist()                # convert the content column to a plain list
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)                     # segment the line with jieba
        for seg in segs:
            if len(seg) > 1 and seg != '\r\n':      # keep tokens longer than one character that are not line breaks
                segment.append(seg)
    except Exception:
        print(line)
        continue

'''
Remove stop words
'''
stopwords = pd.read_csv('H:/NLP_project/NLP_project/data/stopwords.txt',index_col=False,quoting=3,sep="\t",names=['stopword'])
words_df = pd.DataFrame({'segment':segment})
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

'''
Count word frequencies
'''
word_start = words_df.groupby('segment')['segment'].agg(计数='size')        # count occurrences of each token (named aggregation; the old dict form of .agg is removed in recent pandas)
word_start = word_start.reset_index().sort_values(by=["计数"], ascending=False)

'''
Draw the word cloud
'''
wordcloud = WordCloud(font_path="H:/NLP_project/NLP_project/data/simhei.ttf", background_color="black", max_font_size=80)
word_frequence = {x[0]: x[1] for x in word_start.head(1000).values}          # keep the 1000 most frequent tokens
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
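
The header comment mentions that the background can be replaced with any image, which is not shown above. Below is a minimal sketch of that variant using the WordCloud mask parameter; the mask path and file name are assumptions, so substitute any local black-on-white silhouette image you have.

'''
Optional: shape the word cloud with a mask image
'''
from PIL import Image

mask = np.array(Image.open('H:/NLP_project/NLP_project/data/mask.png'))     # hypothetical mask image; pure-white pixels are masked out
masked_wc = WordCloud(font_path="H:/NLP_project/NLP_project/data/simhei.ttf",
                      background_color="white",
                      mask=mask,                                            # words are drawn only inside the non-white area
                      max_font_size=80)
masked_wc = masked_wc.fit_words(word_frequence)
plt.imshow(masked_wc)
plt.axis("off")
plt.show()
masked_wc.to_file('wordcloud_masked.png')                                   # optionally save the result to disk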
