import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
# Absolute, machine-specific location of the CSV dataset passed to the
# functions below.
data_path = r'E:\研一上\数据挖掘\dataset\Case2-clustering\ALS.csv'
# Small constant for numerical stability when normalising features.
eps = 1e-8
def preprocess(data_path, eps=1e-8):
    """Load a CSV file and z-score normalise its feature columns.

    The first column of the file is dropped (``iloc[:, 1:]``); every
    remaining column is treated as a numeric feature.

    Args:
        data_path: Path (or any source accepted by ``pandas.read_csv``) of
            the CSV file.
        eps: Small constant added to each column's standard deviation so
            that a zero-variance column normalises to zeros instead of
            producing NaN/inf from a division by zero.

    Returns:
        A ``(data_norm, data)`` tuple: the normalised feature frame and the
        unmodified feature frame, with identical shape and column labels.
    """
    df = pd.read_csv(data_path)
    data = df.iloc[:, 1:]
    # eps belongs in the denominator; ``a / b + eps`` would only shift the
    # result and leave the division unprotected.  ddof=0 keeps the
    # population standard deviation that ``np.std`` computes.
    data_norm = (data - data.mean()) / (data.std(ddof=0) + eps)
    return data_norm, data
def myplot(data_path):
    """Show histograms of the leading normalised features in a 2x4 grid.

    Args:
        data_path: Path of the CSV file, forwarded to ``preprocess``.
    """
    data_norm, _ = preprocess(data_path)
    # The grid has eight cells; plot fewer panels when the dataset has fewer
    # feature columns rather than failing with an IndexError.
    n_panels = min(8, data_norm.shape[1])
    for i in range(n_panels):
        plt.subplot(2, 4, i + 1)
        plt.hist(data_norm.iloc[:, i].values, color='m')
        plt.title(data_norm.columns[i])
    plt.show()
def definek(data_path):
    """Plot an elbow curve to help choose the number of K-means clusters.

    Fits K-means for k = 1..20 on the normalised features and plots, for
    each k, the mean Euclidean distance from every sample to its nearest
    cluster centre.

    Args:
        data_path: Path of the CSV file, forwarded to ``preprocess``.
    """
    cluster_counts = list(range(1, 21))
    data_norm, _ = preprocess(data_path)
    # Cluster on the normalised features so that no single large-scale
    # column dominates the Euclidean distances.
    data_norm_values = data_norm.values.astype('float64')

    distance = []
    for k in cluster_counts:
        model = KMeans(n_clusters=k, random_state=10)
        model.fit(data_norm_values)
        nearest = cdist(data_norm_values, model.cluster_centers_).min(axis=1)
        distance.append(nearest.mean())

    # The x-axis carries the actual cluster count (1..20), not a 0-based
    # loop index.
    plt.plot(cluster_counts, distance)
    plt.title('Distance with different clusters')
    plt.xlabel('clusters')
    plt.ylabel('Distance')
    plt.show()
def vis(data_path):
    """Cluster the dataset into five groups, save them, and plot a 2-D t-SNE view.

    Side effects: writes ``./result.csv`` (the clustered feature values plus
    a ``label`` column) and opens a matplotlib window with one colour per
    cluster.

    Args:
        data_path: Path of the CSV file, forwarded to ``preprocess``.
    """
    colours = ['g', 'r', 'b', 'y', 'm']  # one colour per cluster

    data_norm, _ = preprocess(data_path)
    # Cluster on the normalised features so that no single large-scale
    # column dominates the Euclidean distances.
    data_norm_values = data_norm.values.astype('float64')

    model = KMeans(n_clusters=len(colours), random_state=10)
    label = model.fit_predict(data_norm_values)

    # Persist the clustered features together with their cluster id.
    label_column = label.reshape(-1, 1)
    result = np.concatenate((data_norm_values, label_column), axis=1)
    result_df = pd.DataFrame(result, columns=list(data_norm.columns) + ['label'])
    result_df.to_csv('./result.csv')

    # The iteration count is left at scikit-learn's default (1000): the
    # keyword was renamed from ``n_iter`` to ``max_iter`` in newer releases,
    # so omitting it keeps this call valid on both.
    tsne = TSNE(n_components=2, perplexity=30.0,
                early_exaggeration=12.0, learning_rate=200.0,
                n_iter_without_progress=300, min_grad_norm=1e-7,
                metric="euclidean", init="random", verbose=0,
                random_state=None, method='barnes_hut', angle=0.5)
    # Only the samples are embedded.  The five cluster centres cannot be
    # embedded on their own with perplexity 30 (fewer points than the
    # perplexity), and a separate fit would not share the samples' space.
    low_dims_data = tsne.fit_transform(data_norm_values)

    for cluster_id, colour in enumerate(colours):
        members = low_dims_data[label == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], c=colour)
    plt.show()
def vis_triples(data_path):
    """Cluster the dataset into five groups and plot a 3-D t-SNE view.

    Opens a matplotlib window containing a 3-D scatter plot with one colour
    per cluster.

    Args:
        data_path: Path of the CSV file, forwarded to ``preprocess``.
    """
    colours = ['g', 'r', 'b', 'y', 'm']  # one colour per cluster

    data_norm, _ = preprocess(data_path)
    # Cluster on the normalised features so that no single large-scale
    # column dominates the Euclidean distances.
    data_norm_values = data_norm.values.astype('float64')

    model = KMeans(n_clusters=len(colours), random_state=10)
    label = model.fit_predict(data_norm_values)

    # The iteration count is left at scikit-learn's default (1000): the
    # keyword was renamed from ``n_iter`` to ``max_iter`` in newer releases,
    # so omitting it keeps this call valid on both.
    tsne = TSNE(n_components=3, perplexity=30.0,
                early_exaggeration=12.0, learning_rate=200.0,
                n_iter_without_progress=300, min_grad_norm=1e-7,
                metric="euclidean", init="random", verbose=0,
                random_state=None, method='barnes_hut', angle=0.5)
    low_dims_data = tsne.fit_transform(data_norm_values)

    fig = plt.figure()
    # Create the 3-D axes through the figure so they are actually attached
    # to it; constructing ``Axes3D(fig)`` directly does not add the axes to
    # the figure on current Matplotlib releases.
    ax = fig.add_subplot(projection='3d')
    for cluster_id, colour in enumerate(colours):
        members = low_dims_data[label == cluster_id]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2], c=colour)
    ax.set_xlabel('feature1')
    ax.set_ylabel('feature2')
    ax.set_zlabel('feature3')
    plt.show()
def main():
    """Script entry point: show the 3-D t-SNE cluster plot for ``data_path``."""
    vis_triples(data_path)


# Run the visualisation only when executed as a script, not on import.
if __name__ == '__main__':
    main()