天涯论坛——python可视化(kmeans聚类+情感分析)(三)

通过使用pyecharts实现数据分析

数据下载:https://download.csdn.net/download/weixin_43906500/14141832

使用的情感分析库:https://github.com/hellonlp/sentiment_analysis_dict

1.情感分析

代码如下

# 导入SnowNLP库
from snownlp import SnowNLP
import json
import pandas as pd
import jieba
from sentiment_analysis_dict.networks import SentimentAnalysis
from pyecharts import Pie

SA = SentimentAnalysis()

def predict(sent):
    """Classify *sent* with the dictionary-based sentiment analyzer.

    Returns:
        1 for positive, 0 for neutral, -1 for negative.
    """
    pos_score, neg_score = SA.normalization_score(sent)
    # Guard clauses instead of an if/elif chain on a scratch variable.
    if pos_score > neg_score:
        return 1
    if pos_score < neg_score:
        return -1
    return 0

def getscore(text):
    """Return the summed BosonNLP sentiment score of all words in *text*.

    Words not present in the lexicon contribute nothing.
    """
    # Load the lexicon once and cache it as a dict for O(1) lookups.
    # The original re-read BosonNLP.txt on every call and did an O(n)
    # list.index() per word, making each call accidentally quadratic.
    if not hasattr(getscore, "_lexicon"):
        df = pd.read_table(r"BosonNLP.txt", sep=" ", names=['key', 'score'])
        getscore._lexicon = dict(zip(df['key'], df['score']))
    lexicon = getscore._lexicon
    # jieba word segmentation (precise mode), returns a list of tokens.
    segs = jieba.lcut(text, cut_all=False)
    return sum(lexicon[w] for w in segs if w in lexicon)

with open('data.json', 'r', encoding='utf-8') as load_f:
    load_dict = json.load(load_f)

# Tally the predicted sentiment of every comment and reply text.
positive = 0
negative = 0
neutral = 0
for dic in load_dict:
    comments_replys = dic['comments_replys']
    if comments_replys is None:
        continue
    for d in comments_replys:
        # The 'comment' and 'reply' sections have identical shape
        # ({key: text}); iterate both with one loop instead of two
        # copy-pasted blocks.  Use .get() so a missing key is skipped.
        for section in ('comment', 'reply'):
            mapping = d.get(section)
            if not mapping:
                continue
            try:
                texts = list(mapping.values())
            except AttributeError:
                # Malformed record (not a dict) — skip it, as the
                # original bare `except: continue` did, but without
                # hiding unrelated bugs.
                continue
            for text in texts:
                n = predict(text)
                if n > 0:
                    positive += 1
                elif n == 0:
                    neutral += 1
                else:
                    negative += 1

# Render the sentiment distribution as a rose-type pie chart (pyecharts v0.x).
labels = ["积极", "中性", "悲观"]
counts = [positive, neutral, negative]
chart = Pie("情感分析饼图", title_pos='center', width=800)
chart.add("情感", labels, counts,
          center=[50, 50], is_random=True, radius=[30, 75],
          rosetype='area', is_legend_show=False, is_label_show=True)
chart.render("7.emotion_analysis.html")

效果展示

天涯论坛——python可视化(kmeans聚类+情感分析)(三)_第1张图片

2.主题分类

1.进行分词

2.去除停用词

3.构建词向量空间

4.计算tf-idf值

5.用kmeans方法进行聚类

6.使用pyecharts库展示

import json
import jieba
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Tree

# Load the stop-word list; `with` guarantees the file handle is closed
# (the original `open(...).readlines()` leaked it).
with open("stopword.txt", 'r', encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]
# Extra tokens jieba may emit that must also be filtered out.
stopwords.extend(['\n', ' '])

with open("data.json", 'r', encoding='utf-8') as load_f:
    load_dict = json.load(load_f)

# Collect unique titles and the vocabulary of their stopword-filtered tokens.
title_list = []
title_word_list = set()
for dic in load_dict:
    title = dic['title']
    # Duplicates are skipped here, so the original's later
    # `title_list = list(set(title_list))` was redundant — and it
    # nondeterministically reordered the titles.  Removed.
    if title in title_list:
        continue
    words = [w for w in jieba.lcut(title) if w not in stopwords]
    title_word_list |= set(words)
    title_list.append(title)

title_word_list = list(title_word_list)
# Build the raw count vector-space model: one row per title, one column
# per vocabulary word.  NOTE(review): str.count() counts substring
# occurrences, not token occurrences — preserved from the original.
vsm_list = [
    [float(title.count(word)) for word in title_word_list]
    for title in title_list
]


docs_matrix = np.array(vsm_list)
column_sum = [float(len(np.nonzero(docs_matrix[:,i])[0])) for i in range(docs_matrix.shape[1])]
column_sum = np.array(column_sum)
column_sum = docs_matrix.shape[0] / column_sum
idf = np.log(column_sum)
idf = np.diag(idf)
tfidf = np.dot(docs_matrix,idf)


# Cluster the titles into 20 topics on their tf-idf vectors.
clf = KMeans(n_clusters=20)
clf.fit(tfidf)
labels = clf.labels_

# Group titles by cluster id.  BUG FIX: the original seeded a newly seen
# cluster with an empty list instead of [title], silently dropping the
# first title of every cluster; setdefault(...).append fixes that.
list_divide = {}
for title, label in zip(title_list, labels):
    list_divide.setdefault(str(label), []).append(title)

# Label each cluster with one representative keyword: the most frequent
# token among its titles that no earlier cluster has already claimed.
list_divide_plus = {}
word_used = []
key_word = []
for cluster_id, titles in list_divide.items():
    tokens = []
    for t in titles:
        tokens.extend(jieba.lcut(t))
    tokens = [tok for tok in tokens if tok not in stopwords]
    # Scan the top-20 tokens; take the first unused one and stop.
    for candidate, _count in Counter(tokens).most_common(20):
        if candidate not in word_used:
            key_word.append(candidate)
            list_divide_plus[candidate] = titles
            word_used.append(candidate)
            break

# Assemble the nested dict structure pyecharts' Tree expects:
# root ("主题树") -> one node per cluster keyword -> one leaf per title.
tree_list = []
for keyword, titles in list_divide_plus.items():
    # Renamed the original local `list`, which shadowed the builtin.
    children = [{"name": t} for t in titles]
    tree_list.append({"name": keyword, "children": children})
dic_tree = {"name": "主题树", "children": tree_list}
dic_data = [dic_tree]
# print(dic_data)
# Render the topic tree to an HTML file (pyecharts v1.x API).
tree_chart = Tree()
tree_chart.add("", dic_data)
tree_chart.set_global_opts(title_opts=opts.TitleOpts(title="主题树展示"))
tree_chart.render("8.subject_classification.html")

效果展示

天涯论坛——python可视化(kmeans聚类+情感分析)(三)_第2张图片

参考链接:https://www.cnblogs.com/bincoding/p/8878098.html

你可能感兴趣的:(态势感知)