from collections import Counter
import jieba
# Installing jieba is not covered here; there are plenty of tutorials online.
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Installing wordcloud ran into a bug; the fix is described in another blog post:
# "Using wordcloud with Python on Windows"
# http://blog.csdn.net/qq_35273499/article/details/79078692
## Build the stopword list
# The stopwords are removed from the tokens after segmentation.
def Stopwordlist(filepath):
    stopwords = []
    for line in open(filepath, 'r').readlines():
        stopwords.append(line.strip())
    # print(stopwords)
    return stopwords
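# A small sketch of my own (not from the original post): list membership
# tests are O(n) and Cut_Sentence checks every token against the stopwords,
# so a set is the more idiomatic container here:
def Stopwordset(filepath):
    # encoding='utf-8' is an assumption about how the stopword file was saved
    with open(filepath, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)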
## Segment each sentence into words
def Cut_Sentence(rawfile, stopwordpath):
    outstr = []
    stopwords = Stopwordlist(stopwordpath)
    for line in rawfile:
        sentence_seged = jieba.cut(line.strip(), cut_all=False)  # line.strip() removes the newline
        for word in sentence_seged:
            if word not in stopwords:
                if word != '\t':
                    outstr.append(word)
    # print(outstr)
    return outstr
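# For illustration, precise mode (cut_all=False) behaves like the example
# in jieba's own README:
#   list(jieba.cut('我来到北京清华大学', cut_all=False))
#   -> ['我', '来到', '北京', '清华大学']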
# Word-frequency statistics
def Countword(outstrlist):
    data = dict(Counter(outstrlist))
    data1 = sorted(data.items(), key=lambda d: d[1], reverse=True)
    data2 = dict((key, value) for key, value in data1)
    '''
    data.items() yields the (key, value) pairs of the dict.
    sorted() with the key argument sorts them by value, i.e. by the
    second element d[1]. reverse=True flips the default ascending
    order, so the result runs from most frequent to least frequent.
    '''
    return data2
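# An equivalent one-liner (a sketch of my own, not from the original post):
# Counter already offers the same descending ranking via most_common(),
# and dicts preserve insertion order from Python 3.7 on.
def Countword_mostcommon(outstrlist):
    return dict(Counter(outstrlist).most_common())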
# Build the word-cloud image
def Wordcloud(text):
    wc = WordCloud(
        background_color='white',  # background color
        max_words=2000,            # maximum number of words displayed
        font_path=r'H:\cutword\msyhbd.ttf',  # font file; without one, Chinese cannot be rendered
        # mask = trump_coloring,
        width=800,
        height=600,
        max_font_size=50,          # maximum font size
        random_state=30,           # number of random states, i.e. of color schemes
    )
    wc.generate(text)
    wc.to_file(r'H:\cutword\wordcloud.png')
    # my_wordcloud = WordCloud().generate(str(strlist))
    plt.imshow(wc)
    plt.axis("off")  # no axes
    plt.show()
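# A hedged alternative sketch (mine, not from the original post): instead of
# joining the tokens into one big string, WordCloud can consume the frequency
# dict produced by Countword directly and skip its own tokenization:
def Wordcloud_from_counts(wordcounts):
    wc = WordCloud(background_color='white', max_words=2000,
                   font_path=r'H:\cutword\msyhbd.ttf',  # same font assumption as above
                   width=800, height=600, max_font_size=50)
    wc.generate_from_frequencies(wordcounts)  # takes a {word: count} mapping
    plt.imshow(wc)
    plt.axis("off")
    plt.show()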
if __name__ == '__main__':
    stopwordpath = r'H:\cutword\stopwords.txt'  # stopword file
    rawfile = open(r'H:\cutword\dqdg\dqdg.txt', 'r')    # raw text to segment
    outfile = open(r'H:\cutword\outwords.txt', 'w+')    # segmented text with stopwords removed
    countfile = open(r'H:\cutword\wordcount.txt', 'w')  # word-frequency counts
    outstrlist = Cut_Sentence(rawfile, stopwordpath)
    countword = Countword(outstrlist)
    Wordcloud(' '.join(outstrlist))  # join with spaces; str(outstrlist) would keep brackets and quotes
    for line in outstrlist:
        outfile.write(line + " ")  # write the segmentation result to a file
    for key in countword.keys():
        countfile.write(key + ' ' + str(countword[key]) + '\n')  # write the counts to a txt file
    # Wordcloud()
    rawfile.close()
    outfile.close()
    countfile.close()
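One robustness note of my own: on Windows, open() defaults to the system code page, so reading a UTF-8 corpus can raise UnicodeDecodeError, and the files above are never closed if an exception occurs. A minimal sketch of the same file handling with an explicit encoding (assuming the corpus was saved as UTF-8) and a with-block:

    with open(r'H:\cutword\dqdg\dqdg.txt', 'r', encoding='utf-8') as rawfile:
        outstrlist = Cut_Sentence(rawfile, stopwordpath)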
The corpus used for segmentation is Sun Haohui's 《大秦帝国》 (The Qin Empire): https://pan.baidu.com/s/1o94kRGY  password: 2q2t
A partial screenshot of the stopwords: https://pan.baidu.com/s/1dGMeivn
The final segmentation result is shown in the screenshot.
After segmentation we also computed the word frequencies, sorted them from most to least frequent, and saved the result to a file, as shown in the figure.
The required font (without it, Chinese characters cannot be rendered): https://pan.baidu.com/s/1oAj2wJ4  password: mq2a
The word cloud was built from a subset of the words; in the final result you can see that words such as '秦国', '卫鞅', '庞涓', '商鞅' and '国君' appear with high frequency.
A jieba segmentation write-up worth consulting: https://blog.csdn.net/hhtnan/article/details/76586693