# 1、掌握结巴分词,增加不在jieba的新词,剔除停用词,分词后只保留词性为n的词;
# 2、利用Counter函数统计文档的词语频次;
# 3、安装wordcloud,制作词云图。
# ********* Step 1: locate and read the corpus ********* #
import codecs
import os
from os import walk
from os.path import join

# The corpus lives in the "Demo5Files" folder of the current working
# directory.  The path is built with join() rather than a hard-coded '\'
# so it is valid on every OS and the literal holds no invalid '\D' escape.
cwd = os.getcwd()
Data_Folder = join(cwd, 'Demo5Files')

# Walk the corpus tree and record the full path of every file found.
file_list = []
for root, dirs, files in walk(Data_Folder):
    for file in files:
        file_list.append(join(root, file))

# Read each document as UTF-8 text.  all_news[i] and category[i] describe
# the same file: its content and the name of the folder that contains it.
all_news = []
category = []
for file in file_list:
    with codecs.open(file, 'r', encoding='utf-8') as news:
        all_news.append(news.read())
    # Parent-folder name, taken via os.path instead of splitting on '\\',
    # which raised IndexError for paths that use '/' as the separator.
    category.append(os.path.basename(os.path.dirname(file)))
# ********* Step 2: word segmentation ********* #
import os

import jieba
import jieba.posseg as posseg

# Load the stop words: one entry per line of "stop_words.txt" in the working
# directory.  Paths are built with os.path.join so they work on any OS.
stop_words_path = os.path.join(cwd, 'stop_words.txt')
stop_words = set()
with open(stop_words_path, 'r', encoding='utf-8') as sw:
    for line in sw:
        stop_words.add(line.strip())
# Extra stop words added on top of the file's contents.
stop_words.update(('说', '中', '「'))

# Register words that are not in jieba's built-in dictionary.
jieba.load_userdict(os.path.join(cwd, 'userdict_p.txt'))

# Segment every document, keeping only tokens whose part-of-speech flag
# starts with 'n' and that are not stop words.  Each document becomes one
# space-separated string, so word_seg_list[i] corresponds to all_news[i].
word_seg_list = []
for line in all_news:
    word = [
        w
        for w, s in posseg.cut(line)
        # startswith() is safe on an empty flag, unlike indexing s[0].
        if s.startswith('n') and w not in stop_words
    ]
    word_seg_list.append(' '.join(word))
# ********* Step 3: term frequencies and word cloud ********* #
# Requires the third-party package: pip install wordcloud
from collections import Counter

import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Split the segmented documents by category.
list1 = []  # category '国际' (international)
list2 = []  # category '娱乐' (entertainment)
list3 = []  # intended for '社会' (society); also receives ANY other category
for cat, seg in zip(category, word_seg_list):
    if cat == '国际':
        list1.append(seg)
    elif cat == '娱乐':
        list2.append(seg)
    else:
        list3.append(seg)

# Flatten the '国际' documents into one list of terms.  Empty pieces are
# dropped: ''.split(' ') returns [''], so a document with no retained words
# would otherwise add an empty-string "term" to the counts.
terms = []
for doc in list1:
    terms.extend(t for t in doc.split(' ') if t)

# NOTE(review): 'msjh.ttf' looks like the Microsoft JhengHei font that ships
# with Windows - confirm it resolves here, or point font_path at another
# font that contains CJK glyphs.
my_wordcloud = WordCloud(
    background_color="white",
    font_path='msjh.ttf',
    max_words=100,
    collocations=False,
    margin=2,
)
# Counter maps each term to its number of occurrences.
my_wordcloud.generate_from_frequencies(Counter(terms))

plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(my_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()