文本挖掘HW3

import codecs
import os
import os.path

import jieba
import numpy as np
import pandas as pd

# Walk the SogouC sample corpus directory and load every file's full text
# into memory, building two parallel lists: file path and file content.
filePaths = []
fileContents = []
for root, dirs, files in os.walk("C:/Users/dell/Desktop/datamining/2.1+语料库/2.1/SogouC.mini/Sample"):
    for name in files:
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        # Context manager guarantees the handle is closed even if read()
        # raises; built-in open(..., encoding=...) replaces codecs.open,
        # which is the Python 2 idiom.
        with open(filePath, 'r', encoding='utf-8') as f:
            fileContent = f.read()
        fileContents.append(fileContent)

# One row per document: its path and its raw text.
corpos = pd.DataFrame({'filePath': filePaths, 'fileContent': fileContents})

corpos

# First tokenization pass: segment every document with jieba and record
# (token, source file path) pairs so frequencies can later be grouped.
segments = []
filePaths = []
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    # jieba.cut yields tokens lazily (words, punctuation, whitespace alike —
    # stop-word filtering happens in a later pass).
    for seg in jieba.cut(fileContent):
        segments.append(seg)
        filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})
segmentDataFrame
# NOTE: the original had a stray `corpos.iterrows` here — an attribute access
# without a call, which did nothing; it has been removed.
# Token frequency table, most frequent first. The original dict form
# groupby(...)['segment'].agg({"计数": np.size}) is the "nested renamer",
# deprecated in pandas 0.25 and removed in 1.0 (raises SpecificationError);
# groupby(...).size().reset_index(name=...) is the supported equivalent.
segStat = (segmentDataFrame.groupby(by='segment')
           .size()
           .reset_index(name='计数')
           .sort_values('计数', ascending=False))

segmentDataFrame

我们发现存在jieba切分后有一些停用词在干扰,类似空格、标点以及一些中文中的介词助词等等。
所以,此时我们需要导入一个停用词库,停用词库中的词就不要放入切词数组中。

在这里需要注意的是,新版pandas(0.20及以后)的DataFrame已经移除了sort方法,应改用sort_values,
具体用法就是.sort_values('列名', ascending=False)

# Load the Chinese stop-word list: one word per line under a "stopword" header.
stopwords = pd.read_csv(
    "C:\\Users\\dell\\Desktop\\datamining\\2.3\\StopwordsCN.txt",
    encoding='utf-8',
    index_col=False,
)

# Drop every row of the frequency table whose token is a stop word.
is_stopword = segStat.segment.isin(stopwords.stopword)
fSegStat = segStat[~is_stopword]

fSegStat

# Second tokenization pass: re-segment the corpus, this time dropping stop
# words and any token whose stripped length is <= 1 (single characters,
# spaces, punctuation) before they ever enter the table.
segments = []
filePaths = []
# Membership test against stopwords.stopword.values is an O(n) numpy-array
# scan per token; build a set once for O(1) lookups over the whole corpus.
stopwordSet = set(stopwords.stopword.values)
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    for seg in jieba.cut(fileContent):
        if seg not in stopwordSet and len(seg.strip()) > 1:
            segments.append(seg)
            filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})

词云图

地址:https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
pip install wordcloud-1.4.1-cp36-cp36m-win_amd64.whl

# Recompute frequencies on the filtered tokens and render a word cloud.
# size()/reset_index(name=...) replaces the dict-style agg renamer, which
# was deprecated in pandas 0.25 and removed in 1.0.
segStat = (segmentDataFrame.groupby(by='segment')
           .size()
           .reset_index(name='计数')
           .sort_values('计数', ascending=False))
fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# font_path must point to a font with CJK glyphs (simhei), otherwise the
# Chinese words render as empty boxes.
wordcloud = WordCloud(
    font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf',
    background_color='black',
)
# fit_words expects a {word: frequency} mapping.
words = fSegStat.set_index('segment').to_dict()
wordcloud.fit_words(words['计数'])
plt.imshow(wordcloud)
plt.show()

网上找了一篇有关燃料电池发展状况的分析报告:将文本转化为txt。

# Word cloud for a standalone GBK-encoded report on fuel-cell development.
# The original opened this file with codecs.open and never closed it; the
# context manager fixes the handle leak.
with open('C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\fuelcell.txt',
          'r', encoding='gbk') as f:
    txt = f.read()

# Tokenize and filter exactly as for the corpus above: drop stop words and
# tokens of stripped length <= 1. Set lookup is O(1) per token.
stopwordSet = set(stopwords.stopword.values)
contents = [content for content in jieba.cut(txt)
            if content not in stopwordSet and len(content.strip()) > 1]

contentDataFrame = pd.DataFrame({'content': contents})

# size()/reset_index(name=...) replaces the dict-style agg renamer removed
# in pandas 1.0.
contentStat = (contentDataFrame.groupby(by='content')
               .size()
               .reset_index(name='计数')
               .sort_values('计数', ascending=False))

wordcloud = WordCloud(
    font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf',
    background_color='black',
)
# fit_words expects a {word: frequency} mapping.
words = contentStat.set_index('content').to_dict()
wordcloud.fit_words(words['计数'])
plt.imshow(wordcloud)
plt.show()

你可能感兴趣的:(文本挖掘HW3)