python(1)-------绘制词云图

# -*- coding: utf-8 -*-

import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from wordcloud import WordCloud

class DrawWordCloud(object):
    """Build a word cloud from one text column of an Excel sheet.

    The work is split into three methods that are meant to be chained:
    ``branchword`` (keep only Chinese characters and segment them with
    jieba), ``wordfreqcount`` (count how often every token occurs) and
    ``wordcloud`` (render the counts with matplotlib).
    """

    # Column that receives the space-joined segmentation result.
    _apBranchWordCol = '文本分词结果'
    # Intermediate column holding the text with non-Chinese characters removed.
    _filteredCol = '过滤非中文字符'
    # Matches every run of characters outside U+4E00..U+9FFF.
    _nonChinese = re.compile('[^\u4e00-\u9fff]+')

    def __init__(self, path, colname):
        """Remember the Excel file path and the name of the text column.

        Args:
            path: Location of the Excel workbook passed to ``pd.read_excel``.
            colname: Name of the column whose text is segmented.
        """
        self._path = path
        self._colname = colname

    def branchword(self) -> pd.DataFrame:
        """Read the workbook and segment the configured column.

        Every cell is converted to ``str``, reduced to the characters in
        U+4E00..U+9FFF and then cut with jieba in accurate mode
        (``cut_all=False``).

        Returns:
            The sheet as a DataFrame with two added columns: the filtered
            text and the jieba tokens joined by single spaces.
        """
        idata = pd.read_excel(self._path)
        strip_non_chinese = self._nonChinese.sub
        idata[self._filteredCol] = (
            idata[self._colname]
            .astype(str)
            .map(lambda text: strip_non_chinese('', text))
        )
        idata[self._apBranchWordCol] = idata[self._filteredCol].map(
            lambda text: ' '.join(jieba.cut(text, cut_all=False))
        )
        return idata

    def wordfreqcount(self, df_brachword: pd.DataFrame) -> pd.DataFrame:
        """Count how many times each token occurs over all rows.

        Tokenisation is delegated to ``CountVectorizer`` with its default
        settings; per the scikit-learn documentation the default token
        pattern only keeps tokens of two or more word characters.

        Args:
            df_brachword: Frame containing the segmentation column, as
                returned by ``branchword``.

        Returns:
            A DataFrame with one row per distinct token: the token in
            column "列名" and its total number of occurrences in column
            "词频". The frame is empty when no row carries any text.

        Raises:
            ValueError: Propagated from ``CountVectorizer`` when the rows
                contain text but no token survives tokenisation.
        """
        texts = []
        for cell in df_brachword[self._apBranchWordCol]:
            if pd.isna(cell):
                continue
            text = str(cell).strip()
            # 'nan' is what a missing cell becomes once it has been through
            # astype(str); treat it as missing instead of as a word.
            if text and text != 'nan':
                texts.append(text)

        columns = ["列名", "词频"]
        if not texts:
            # CountVectorizer refuses an empty corpus; report "no words".
            return pd.DataFrame(columns=columns)

        vectorizer = CountVectorizer()
        # matrix[i, j] is the number of times term j occurs in row i.
        matrix = vectorizer.fit_transform(texts)
        # Sum the sparse matrix column-wise without densifying it.
        totals = np.asarray(matrix.sum(axis=0)).ravel()
        # vocabulary_ maps term -> column index, so the index is used to
        # look up the real count rather than being reported as the count.
        rows = [
            (term, int(totals[col_index]))
            for term, col_index in vectorizer.vocabulary_.items()
        ]
        return pd.DataFrame(rows, columns=columns)

    def wordcloud(self, fp, mask_path='data/2.jpg', font_path="data/simfang.ttf", max_words=80):
        """Render and show a word cloud for the given frequency table.

        Args:
            fp: DataFrame with the tokens in column "列名" and their
                weights in column "词频".
            mask_path: Image file read with ``plt.imread`` and used as the
                cloud mask.
            font_path: Font file handed to ``WordCloud``.
            max_words: Upper bound on the number of words drawn.

        Raises:
            ValueError: If ``fp`` has no rows, so there is nothing to draw.
        """
        frequencies = dict(zip(fp.loc[:, "列名"], fp.loc[:, "词频"]))
        if not frequencies:
            raise ValueError("cannot draw a word cloud from an empty frequency table")

        pic = plt.imread(mask_path)
        wc = WordCloud(mask=pic, font_path=font_path, max_words=max_words, background_color='White')
        wc.generate_from_frequencies(frequencies)
        plt.figure(figsize=(8, 8))
        plt.imshow(wc)
        plt.show()

def main():
    """Segment the sample workbook, count its tokens and show the word cloud."""
    # Workbook to read and the column whose text is analysed.
    cloud = DrawWordCloud("data/母婴22097.xlsx", "名称")
    segmented = cloud.branchword()
    frequencies = cloud.wordfreqcount(segmented)
    cloud.wordcloud(frequencies)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

你可能感兴趣的:(python,python)