# Text clustering (文本聚类)

# -*- coding: utf-8 -*-
# @Time    : 2019/11/1 13:23
# @Author  : Chicker
# @FileName: clusterr.py
# @Software: PyCharm
# @Blog    :http://blog.csdn.net/u010105243/article/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

# Absolute Windows path of the input CSV; main() passes it to vis_triples().
data_path = r'E:\研一上\数据挖掘\dataset\Case2-clustering\ALS.csv'
# Small constant referenced by preprocess() while normalizing the columns.
eps = 1e-8

def preprocess(data_path):
    """Load a CSV file and z-score normalize its feature columns.

    The first CSV column is dropped (presumably an identifier -- confirm
    against the dataset); every remaining column is treated as a numeric
    feature.

    Args:
        data_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(data_norm, data)`` tuple of DataFrames: ``data_norm`` holds the
        normalized features and ``data`` the untouched feature columns.
    """
    df = pd.read_csv(data_path)
    data = df.iloc[:, 1:]
    # eps must sit in the denominator: it guards against division by zero for
    # constant columns (std == 0). Adding it to the quotient instead would
    # yield NaN for those columns and shift every other column by eps.
    data_norm = data.apply(lambda x: (x - np.mean(x)) / (np.std(x) + eps))
    return data_norm, data

def myplot(data_path):
    """Show histograms of the first (up to) eight normalized feature columns.

    The histograms are laid out on a 2x4 subplot grid, each titled with its
    column name.

    Args:
        data_path: CSV location forwarded to ``preprocess``.
    """
    data_norm, _ = preprocess(data_path)

    # Cap at the number of available columns so that datasets with fewer than
    # eight features do not raise IndexError.
    n_plots = min(8, data_norm.shape[1])
    for i in range(n_plots):
        plt.subplot(2, 4, i + 1)
        plt.hist(data_norm.iloc[:, i].values, color='m')
        plt.title(data_norm.columns[i])
    plt.show()

def definek(data_path):
    """Plot the mean nearest-centroid distance for k = 1..20 (elbow plot).

    For each candidate cluster count a K-Means model is fitted and the
    average Euclidean distance from every sample to its closest centroid is
    recorded; the resulting curve is shown with matplotlib.

    Args:
        data_path: CSV location forwarded to ``preprocess``.
    """
    # Use the real cluster counts (1..20) so the x-axis matches the models
    # that were actually fitted, instead of the zero-based loop index.
    K = range(1, 21)
    _, data = preprocess(data_path)
    # NOTE(review): despite the variable name, the *raw* features are
    # clustered here, not the normalized ones -- confirm this is intended.
    data_norm_values = data.values.astype('float64')
    distance = []
    for k in K:
        model = KMeans(n_clusters=k, random_state=10)
        model.fit(data_norm_values)
        centers = model.cluster_centers_
        # Distance of each sample to its nearest centroid, averaged.
        nearest = np.min(cdist(data_norm_values, centers), axis=1)
        distance.append(np.mean(nearest))
    plt.plot(K, distance)
    plt.title('Distance with different clusters')
    plt.xlabel('clusters')
    plt.ylabel('Distance')
    plt.show()

def vis(data_path):
    """Cluster the dataset into five groups and show a 2-D t-SNE scatter plot.

    Side effect: writes the feature values together with the assigned
    cluster label to ``./result.csv``.

    Args:
        data_path: CSV location forwarded to ``preprocess``.
    """
    colors = ('g', 'r', 'b', 'y', 'm')  # one color per cluster id 0..4

    _, data = preprocess(data_path)
    # NOTE(review): despite the variable name, the *raw* features are
    # clustered here, not the normalized ones -- confirm this is intended.
    data_norm_values = data.values.astype('float64')

    model = KMeans(n_clusters=5, random_state=10)
    model.fit(data_norm_values)
    label = model.predict(data_norm_values).reshape(-1, 1)

    # Persist features + label using the original column names.
    result = np.concatenate((data_norm_values, label), axis=1)
    result_df = pd.DataFrame(result, columns=list(data.columns) + ['label'])
    result_df.to_csv('./result.csv')

    # The iteration-count keyword is left out on purpose: it was renamed
    # from ``n_iter`` to ``max_iter`` in newer scikit-learn releases, and
    # 1000 is the default under both names.
    tsne = TSNE(n_components=2, perplexity=30.0,
                early_exaggeration=12.0, learning_rate=200.0,
                n_iter_without_progress=300, min_grad_norm=1e-7,
                metric="euclidean", init="random", verbose=0,
                random_state=None, method='barnes_hut', angle=0.5)
    low_dims_data = tsne.fit_transform(data_norm_values)
    # The cluster centers are intentionally not embedded: the result was
    # never plotted, and five points cannot satisfy perplexity=30 on current
    # scikit-learn versions (perplexity must be below the sample count).

    flat_label = label.ravel()
    for cluster_id, color in enumerate(colors):
        points = low_dims_data[flat_label == cluster_id]
        plt.scatter(points[:, 0], points[:, 1], c=color)
    plt.show()

def vis_triples(data_path):
    """Cluster the dataset into five groups and show a 3-D t-SNE scatter plot.

    Args:
        data_path: CSV location forwarded to ``preprocess``.
    """
    colors = ('g', 'r', 'b', 'y', 'm')  # one color per cluster id 0..4

    _, data = preprocess(data_path)
    # NOTE(review): despite the variable name, the *raw* features are
    # clustered here, not the normalized ones -- confirm this is intended.
    data_norm_values = data.values.astype('float64')

    model = KMeans(n_clusters=5, random_state=10)
    model.fit(data_norm_values)
    label = model.predict(data_norm_values)

    # The iteration-count keyword is left out on purpose: it was renamed
    # from ``n_iter`` to ``max_iter`` in newer scikit-learn releases, and
    # 1000 is the default under both names.
    tsne = TSNE(n_components=3, perplexity=30.0,
                early_exaggeration=12.0, learning_rate=200.0,
                n_iter_without_progress=300, min_grad_norm=1e-7,
                metric="euclidean", init="random", verbose=0,
                random_state=None, method='barnes_hut', angle=0.5)
    low_dims_data = tsne.fit_transform(data_norm_values)

    fig = plt.figure()
    # add_subplot registers the axes with the figure on every Matplotlib
    # version; a bare Axes3D(fig) is no longer attached automatically on
    # recent releases and would leave the window empty.
    ax = fig.add_subplot(111, projection='3d')
    for cluster_id, color in enumerate(colors):
        points = low_dims_data[label == cluster_id]
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], c=color)
    ax.set_xlabel('feature1')
    ax.set_ylabel('feature2')
    ax.set_zlabel('feature3')
    plt.show()

def main():
    """Script entry point: show the 3-D cluster plot for ``data_path``."""
    vis_triples(data_path=data_path)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

# You may also be interested in: text clustering (文本聚类)