Python visualization: word clouds

Text mining and visualization knowledge link

[Image 1: word cloud example]

My code:

# -*- coding: utf-8 -*-
import os
import pickle  # used to persist the bunch objects

import numpy as np
from pandas import read_csv
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch was removed in newer releases
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import xlwt
# wordcloud and matplotlib for drawing the word cloud
import wordcloud
import matplotlib.pyplot as plt
from imageio import imread  # scipy.misc.imread was removed in SciPy 1.2; imageio.imread is a drop-in replacement

        
'''Read the stop-word file'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a bunch object'''


def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a bunch object'''


def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def buildtestbunch(bunch_path, art_test):
    bunch = Bunch(contents=[])

    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)  # segment the document with jieba
        save2 = ''
        for item3 in content_seg:
            # keep only tokens longer than one character
            if len(item3) > 1 and item3 != '\r\n':
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test-data text objects!")


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # stop words as a list
    bunch = _readbunchobj(bunch_path)  # load the segmented bunch object
    # Build the TF-IDF vector space object
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # tdm is a 2-D weight matrix: tdm[i][j] is the TF-IDF weight of
    # word j (its index in the vocabulary) in document i

    # Initialise the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001, use_idf=True,
                                 max_features=15000)
    # Convert the texts to a term-weight matrix and keep the fitted vocabulary
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the bag of words
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = _readfile(stopword_path).splitlines()  # stop words as a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(tdm=[], vocabulary={})

    if train_tfidf_path is not None and os.path.exists(train_tfidf_path):
        # Reuse the vocabulary of a previously saved TF-IDF space so the
        # test matrix is aligned with the training matrix
        trainbunch = _readbunchobj(train_tfidf_path)
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7,
                                     vocabulary=trainbunch.vocabulary, min_df=0.001)
    else:
        # No saved space exists yet (e.g. on the first run): fit a fresh vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7,
                                     min_df=0.001)

    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")

if __name__ == "__main__":

    Sdata = []
    art = []
    '''============================ Load the data first ============================'''
    file_test = 'F:/goverment/text analyse/type_in.csv'

    dataset = read_csv(file_test)
    Sdata = dataset.values[:, :]
    Sdata = Sdata.tolist()
    for line in Sdata:
        art.append(line[1])  # line[1] is the text column
    print(len(Sdata))
    
    '''==================== TF-IDF feature extraction for the texts ===================='''
    # Paths for the segmented bunch object, the TF-IDF space and the stop words
    test_bunch_path = "F:/goverment/text analyse/trainbunch.dat"
    test_space_path = "F:/goverment/text analyse/traintfdifspace.dat"
    stopword_path = "F:/goverment/text analyse/hlt_stop_words.txt"

    buildtestbunch(test_bunch_path, art)

    # Fit a fresh vocabulary here; pass a previously saved space as
    # train_tfidf_path to reuse an existing vocabulary instead
    testvector_space(stopword_path, test_bunch_path, test_space_path)

    test_set = _readbunchobj(test_space_path)

    '''Inspect the extracted words'''

    # Map each feature id back to its word
    txtcut = []  # all words in the vocabulary
    dic = {}
    for i in test_set.vocabulary.keys():
        txtcut.append(i)
        dic[test_set.vocabulary[i]] = i
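    # For illustration, dic maps feature ids back to words, e.g.
    # (made-up contents): {0: '安全', 1: '城市', 2: '管理'}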
        

  
    '''final holds, for each document, its id plus at most 15 weighted words'''
    final = []
    for k in range(len(Sdata)):  # iterate over every document
        nonzero = test_set.tdm[k].nonzero()
        ls = []
        ls.append(Sdata[k][0])  # document id
        num = 0
        for i in range(len(nonzero[1])):
            num = num + 1
            # test_set.tdm[k, nonzero[1][i]] is the weight of the i-th
            # non-zero word in document k, scaled to a percentage
            b = test_set.tdm[k, nonzero[1][i]] * 100
            a = dic[nonzero[1][i]] + "  " + str(round(b, 2)) + "%"
            ls.append(a)
            if num == 15:
                break
        final.append(ls)
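    # Each row of final looks like (illustrative values):
    #   [id, '城市  12.34%', '管理  8.9%', ...]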
    
    '''Draw the word cloud'''
    fig = plt.figure(figsize=(15, 15))
    # Use an image as the mask so the cloud takes its shape;
    # STXINGKA.TTF is a font that covers Chinese characters
    cloud = wordcloud.WordCloud(font_path='STXINGKA.TTF', mask=imread('water3.png'), mode='RGBA',
                                background_color=None).generate(' '.join(txtcut))
    # Recolor the words with the colors of the mask image
    img = imread('water3.png')
    cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
    cloud.recolor(color_func=cloud_colors)
    plt.imshow(cloud)
    plt.axis('off')
    plt.savefig('watercloud3.png', dpi=400)
    plt.show()
    
    # Write the per-document word lists to an Excel sheet
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet1")
    si = -1
    sj = -1
    for line in final:
        si = si + 1
        sj = -1
        for i in line:
            sj = sj + 1
            sheet.write(si, sj, str(i))

    myexcel.save("各条分词.xls")
    
    
    # Save the word-to-id vocabulary
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet2")
    p = 0
    for i in test_set.vocabulary.keys():
        sheet.write(p, 0, i)  # the word
        sheet.write(p, 1, str(test_set.vocabulary[i]))  # its feature id
        p = p + 1

    myexcel.save("词汇id.xls")

 

Per-document word lists (各条分词.xls):

[Image 2: the per-document word lists spreadsheet]

 

 

Vocabulary ids (词汇id.xls):

[Image 3: the vocabulary id spreadsheet]

 
