只是做个测试,最终聚类出来的信息实际应用意义不大。大家可以用sklearn中的兰花数据集进行测试,。
我的数据集样式如下:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
x_sky_train = pd.read_csv('datadatadata1.csv',usecols = [2,3],header=None)
#将读取的数据转化为numpy数组
x_sky_train1 = x_sky_train.values
print(x_sky_train1)
#print(x_sky_train)
print(np.shape(x_sky_train1))
#构造聚类器
clf = KMeans(n_clusters=6)
#开始聚类
y_sky_train = clf.fit_predict(x_sky_train1)
#获得聚类标签
label_clf = clf.labels_
print(label_clf)
#获得聚类中心,保存在df_center的DataFrame中给数据加上标签
center = clf.cluster_centers_
df_center = pd.DataFrame(center, columns=['x', 'y'])
df = pd.DataFrame(x_sky_train1, index=label_clf, columns=['x', 'y'])
df1 = df[df.index==0]
df2 = df[df.index==1]
df3 = df[df.index==2]
df4 = df[df.index==3]
df5 = df[df.index==4]
df6 = df[df.index==5]
plt.figure(figsize=(10,8), dpi=80)
axes = plt.subplot()
type1 = axes.scatter(df1.loc[:,['x']], df1.loc[:,['y']], s=50, c='red', marker='d')
type2 = axes.scatter(df2.loc[:,['x']], df2.loc[:,['y']], s=50, c='green', marker='*')
type3 = axes.scatter(df3.loc[:,['x']], df3.loc[:,['y']], s=50, c='brown', marker='p')
type4 = axes.scatter(df4.loc[:,['x']], df4.loc[:,['y']], s=50, c='black')
type5 = axes.scatter(df5.loc[:,['x']], df5.loc[:,['y']], s=50, c='yellow')
type6 = axes.scatter(df6.loc[:,['x']], df6.loc[:,['y']], s=50, c='purple')
type_center = axes.scatter(df_center.loc[:,'x'], df_center.loc[:,'y'], s=40, c='blue')
plt.xlabel('x', fontsize=16)
plt.ylabel('y', fontsize=16)
axes.legend((type1, type2, type3,type4,type5,type6, type_center), ('0','1','2','4','5','6','center'), loc=1)
plt.show()
可视化结果: