Python文本挖掘练习(四)// 词云图

一、练习目标

1、掌握结巴(jieba)分词:添加jieba词典中没有的新词,剔除停用词,分词后只保留词性标注以n开头的词(即名词类);
2、利用collections.Counter统计文档的词语频次;
3、安装wordcloud,制作词云图。

二、步骤与代码

#********* Step 1 ********#
# Locate the data folder relative to the current working directory
import os

cwd=os.getcwd()
# Build the path with os.path.join. The previous literal '\Demo5Files' contained
# '\D', an invalid escape sequence that newer Python versions warn about, and it
# hard-coded the Windows path separator.
Data_Folder=os.path.join(cwd,'Demo5Files')

# Walk the data folder and collect the full path of every file found in it
from os import walk
from os.path import join

file_list=[]
for folder,_subdirs,names in walk(Data_Folder):
    # prefix each bare file name with the directory it was found in
    file_list.extend(join(folder,name) for name in names)

# Read every document and record the category it belongs to
import codecs
from os.path import basename,dirname

all_news=[]   # full text of each file, in file_list order
category=[]   # category label of each file, aligned index-by-index with all_news
for file in file_list:
    with codecs.open(file,'r',encoding='utf-8') as news:
        all_news.append(news.read())
    # The category is the name of the folder that directly contains the file.
    # basename(dirname(...)) extracts it using the OS's own path rules, whereas
    # splitting on '\\' only worked for backslash-separated (Windows) paths.
    category.append(basename(dirname(file)))

#********* 步骤二 ********#
import jieba
import jieba.posseg as posseg

# Load the stop-word list
## load stop words ##
from os.path import join

# join() picks the right separator for the platform; the previous
# cwd+'\\stop_words.txt' concatenation only formed a valid path on Windows.
stop_words_path=join(cwd,'stop_words.txt')
stop_words=set()
with open(stop_words_path,'r',encoding='utf-8') as sw:
    # iterate the file line by line instead of loading it all with readlines()
    for line in sw:
        stop_words.add(line.strip())
# extra stop words added on top of the entries read from the file
stop_words.update(('说','中','「'))

# Register new words that are not in jieba's dictionary
## load user dictionary ##
from os.path import join

# join() instead of cwd+'\\userdict_p.txt' so the path is not tied to the
# Windows separator.
jieba.load_userdict(join(cwd,'userdict_p.txt'))

# Segment each document and keep only the nouns
## word segmentation ##

word_seg_list=[]
for doc in all_news:
    # posseg.cut yields (token, POS flag) pairs; keep a token when it is not a
    # stop word and its flag starts with 'n'
    nouns=[token for token,flag in posseg.cut(str(doc))
           if token not in stop_words and flag[0]=='n']
    # store each document as one space-separated string of its kept tokens
    word_seg_list.append(' '.join(nouns))

#********* Step 3 ********#
# Split the segmented documents into one list per category
list1=[] # international ('国际')
list2=[] # entertainment ('娱乐')
list3=[] # society ('社会'): receives every document whose label is neither of the above

buckets={'国际':list1,'娱乐':list2}
for label,words in zip(category,word_seg_list):
    # any label without its own bucket falls through to list3
    buckets.get(label,list3).append(words)

# pip install wordcloud
from wordcloud import WordCloud,ImageColorGenerator
from collections import Counter
import matplotlib.pyplot as plt

# Collect every token of the documents in list1 into one flat list.
# A document that produced no tokens is stored as '', and ''.split(' ') returns
# [''], so empty strings are filtered out to keep a blank "word" from being
# counted and drawn.
terms=[]
for doc in list1:
    terms.extend(token for token in doc.split(' ') if token)

# NOTE(review): 'msjh.ttf' is presumably a CJK-capable font available on the
# author's machine; confirm it exists wherever this script runs.
my_wordcloud = WordCloud(background_color="white", font_path='msjh.ttf', max_words = 100, collocations=False, margin=2)
# Counter maps each token to its number of occurrences across list1
my_wordcloud.generate_from_frequencies(Counter(terms))

# Render the word cloud without axes
plt.figure(figsize=(20,10),facecolor='k')
plt.imshow(my_wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()

Python文本挖掘练习(四)// 词云图_第1张图片
Python文本挖掘练习(四)// 词云图_第2张图片

你可能感兴趣的:(Python)