python实现文本词频统计分析,计算距离重心和词云可视化

(配图略:原文此处为程序运行截图)
import jieba
import math
import wordcloud
import matplotlib.pyplot as plt

#构建停用词列表
def stopword(path1):
    """Load the stop-word list.

    Args:
        path1: path to a UTF-8 text file with one stop word per line.

    Returns:
        list[str]: the stripped stop words, in file order.
    """
    # Context manager ensures the handle is closed; the original
    # opened the file and never closed it (resource leak).
    with open(path1, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]
    
#统计词频和高频词
def comment(path2, stopwords, top_n=6):
    """Count word frequencies in a comment file and pick the top words.

    Args:
        path2: path to a UTF-8 text file, one comment per line.
        stopwords: iterable of words to exclude from the counts.
        top_n: number of most-frequent words to return (default 6,
            matching the original hard-coded feature-set size).

    Returns:
        (lis, dic): the top_n most frequent words, and the full
        word -> count dictionary.
    """
    stopset = set(stopwords)  # O(1) membership instead of O(n) list scans per token
    dic = {}
    with open(path2, 'r', encoding='utf-8') as f:
        for line in f:
            for w in jieba.lcut(line.strip('\n')):
                # keep only multi-character words that are not stop words
                if len(w) > 1 and w not in stopset:
                    dic[w] = dic.get(w, 0) + 1
    dic_order = sorted(dic.items(), key=lambda x: x[1], reverse=True)
    print(dic_order)
    lis = [k[0] for k in dic_order[:top_n]]
    print(lis)

    return lis, dic

#查看每条评论(文档)是否含有高频词
def matrics(path2, lis):
    """Build a binary document-feature matrix.

    Each comment (line of the file) becomes a 0/1 vector whose j-th
    entry is 1 when the j-th feature word of `lis` appears in it.

    Args:
        path2: path to a UTF-8 text file, one comment per line.
        lis: feature words; one column per word, in order.

    Returns:
        list[list[int]]: one 0/1 row per comment.
    """
    index = {w: i for i, w in enumerate(lis)}  # word -> column; O(1) lookup
    matric = []
    with open(path2, 'r', encoding='utf-8') as f:
        for line in f:
            mat = [0] * len(lis)  # was hard-coded to 6; now follows len(lis)
            for w in jieba.lcut(line.strip('\n')):
                if w in index:
                    mat[index[w]] = 1  # change to += 1 for occurrence counts
            matric.append(mat)
    return matric

#计算不同评论之间的距离
def distance(matric):
    """Compute pairwise Euclidean distances between comment vectors.

    The original computed each distance and discarded it (the print was
    commented out and nothing was returned). Return the full symmetric
    matrix instead so callers can use the result; main() ignores the
    return value, so this stays backward compatible.

    Args:
        matric: list of equal-length numeric vectors.

    Returns:
        list[list[float]]: dists[i][j] is the Euclidean distance
        between matric[i] and matric[j].
    """
    dists = []
    for i in matric:
        row = []
        for j in matric:
            # zip generalizes over the vector length (was a fixed range(6))
            row.append(math.sqrt(sum((a - b) ** 2 for a, b in zip(i, j))))
        dists.append(row)
    return dists

#计算所有评论的重心并输出
def center(matric):
    """Compute and print the centroid (per-dimension mean) of all vectors.

    Args:
        matric: list of equal-length numeric vectors.

    Returns:
        list[float]: the centroid (also printed, as before);
        [] for empty input.
    """
    if not matric:  # guard: the original raised ZeroDivisionError-adjacent issues on []
        print('point = ', [])
        return []
    n = len(matric)
    dim = len(matric[0])  # was hard-coded to 6
    # Sum first, divide once: fewer float roundings than the original's
    # per-row division.
    point = [sum(vec[j] for vec in matric) / n for j in range(dim)]
    print('point = ', point)
    return point

#绘制词云
def wcloud(dic, font_path='C:/Windows/SIMLI.TTF'):
    """Render and display a word cloud from a word->frequency dictionary.

    Args:
        dic: mapping of word -> count.
        font_path: TTF font file to use. Defaults to the original
            hard-coded Windows SimLi path; pass another CJK-capable
            font on non-Windows systems.
    """
    wc = wordcloud.WordCloud(  # build the cloud from the frequency dict
        max_words=200,             # show at most this many words
        max_font_size=300,         # cap on the largest font size
        background_color="white",  # default would be black
        width=1500,                # image width in pixels
        height=960,                # image height in pixels
        margin=10,                 # padding around the image edge
        font_path=font_path        # was hard-coded; now overridable
    )
    wc.generate_from_frequencies(dic)  # generate from the dictionary
    plt.imshow(wc)   # show the word cloud
    plt.axis('off')  # hide the axes
    plt.show()       # display the figure
    #wc.to_file(fp)  # uncomment to save the image to a file

#定义主函数
def main():
    """Run the full pipeline: stop words -> frequencies -> feature
    matrix -> pairwise distances -> centroid -> word cloud."""
    stop_path = 'stopwords_list.txt'
    comment_path = 'jd_comments.txt'

    stopwords = stopword(stop_path)
    lis, dic = comment(comment_path, stopwords)

    matric = matrics(comment_path, lis)
    distance(matric)
    center(matric)

    wcloud(dic)

# Run the pipeline only when executed as a script (not on import).
if __name__ == '__main__':
    main()

部分高频词统计

[('不错', 451), ('电脑', 332), ('非常', 297), ('没有', 236), ('速度', 234), ('客服', 187), ('问题', 186), ('开机', 183), ('京东', 180), ('满意', 176), ('感觉', 166), ('很快', 149), ('收到', 147), ('东西', 135), ('系统', 133), ('真的', 128), ('使用', 126), ('键盘', 118), ('喜欢', 115), ('包装', 113), ('有点', 111), ('比较', 105), ('hellip', 105), ('外观', 104), ('游戏', 101), ('效果', 101), ('硬盘', 100), ('物流', 98), ('价格', 97), ('快递', 97), ('屏幕', 97), ('性能', 93), ('机器', 88), ('流畅', 88), ('性价比', 87), ('一下', 87), ('运行', 85), ('安装', 84), ('购买', 82), ('评价', 82), ('值得', 81), ('鼠标', 80), ('方便', 78), ('不是', 78), ('一次', 78), ('知道', 77), ('配置', 77), ('总体', 77), ('耐心', 76), ('卖家', 74), ('好评', 74), ('现在', 73), ('笔记本', 73), ('特别', 73), ('推荐', 69), ('第一次', 68), ('希望', 67), ('来说', 65), ('一点', 65), ('服务', 64), ('质量', 64), ('固态', 63), ('内存', 63), ('很多', 63), ('已经', 62), ('软件', 62), ('清晰', 61), ('以后', 60), ('机子', 60), ('办公', 59), ('声音', 58), ('购物', 58), ('需要', 58), ('几天', 58), ('之前', 57), ('发货', 53), ('朋友', 53), ('打开', 53), ('散热', 53), ('店家', 52), ('态度', 52), ('完美', 52), ('觉得', 52), ('最后', 51), ('做工', 51), ('好看', 50), ('支持', 50), ('玩游戏', 50),

特征词组成的特征集

['不错', '电脑', '非常', '没有', '速度', '客服']

部分评论的“坐标”

[1, 0, 0, 0, 1, 0]
[1, 1, 1, 0, 0, 0]
[1, 0, 1, 1, 1, 0]
[0, 1, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 1]
[1, 1, 0, 0, 1, 1]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 1]
[1, 0, 0, 1, 0, 0]
[0, 1, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0]
[1, 0, 1, 1, 0, 0]
[0, 0, 0, 0, 1, 0]
[1, 1, 0, 0, 1, 0]
[0, 1, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 1]
[0, 1, 0, 1, 0, 1]
[0, 0, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 1, 0]
[0, 1, 1, 1, 0, 0]
[1, 1, 1, 0, 0, 0]
[0, 1, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 0]
[1, 0, 0, 0, 1, 0]
[1, 0, 0, 1, 1, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 0, 0]
[0, 0, 1, 0, 0, 0]
[0, 0, 0, 0, 1, 0]
[0, 0, 0, 1, 1, 0]

所有评论的“重心”

point =  [0.3383233532934152, 0.23253493013972026, 0.19960079840319336, 0.19161676646706563, 0.19161676646706563, 0.1447105788423152]

词云可视化

(配图略:原文此处为词云可视化效果图)

你可能感兴趣的:(python实现文本词频统计分析,计算距离重心和词云可视化)