K-Means聚类又叫做K均值聚类,即将n个样本分到k个类中,每个样本到其所属类的中心的距离最小。由于每个样本只能属于一个类,因此也是属于一种硬聚类。
基本原理:
由于K-Means聚类的聚类中心个数k需要自己事先指定,那么带来的问题就很明显:k取多少比较合适呢?
实际上聚类算法没有绝对的好坏之分,如果你有极强的解释能力,聚得再烂你也能圆回来的话,那么问题也不大。
确定k值有一种肘部法(Elbow Method):绘制SSE(误差平方和)随k变化的曲线,SSE下降速度由快变缓的拐点(即"肘部")所对应的k值效果最好。
下面看一个例子:
data.csv的内容:
0,1
15.488792,-17.931486
14.009334,-17.461472
5.0456505,-80.11763
8.255032,-83.60655
16.831146,-18.429255
17.170382,-18.649935
15.638589,-17.82137
10.48919,-66.737976
9.43915,-66.58489
-42.065083,11.670583
-42.302345,11.1598835
-35.42476,7.4434624
2.0027802,-62.947144
-9.558702,-41.658405
-6.644909,-45.406414
2.089279,-64.92243
3.023925,-64.24094
-11.504891,-73.93064
1.8251665,-65.87515
-31.574617,10.575175
-1.9865129,-54.880627
-3.4899065,-54.643036
-3.56678,-54.239433
0.03461591,-56.10578
1.5199226,-63.042603
5.713296,-42.58916
6.0714397,-42.425
-2.56875,-51.994877
-2.563768,-52.04165
-2.4377584,-58.506905
1.3156503,-62.608376
2.5668623,-28.314314
1.1977189,-62.42471
-33.774075,8.584474
-33.503384,8.835019
2.8770874,-41.44645
-36.496128,9.367835
17.112753,-18.686089
15.90513,-20.412813
18.882956,-19.273212
18.527239,-19.111153
-20.953918,30.682774
18.72682,-19.19712
16.825476,-55.060707
18.823103,-46.734413
11.325681,-81.20977
11.295984,-81.171104
11.283557,-81.25094
11.279441,-81.22319
19.669847,-45.90381
0.17542204,-80.0895
-37.171318,5.7718396
-35.158585,-2.4300306
20.338614,-47.541668
9.947348,-67.96679
9.938391,-67.81397
9.005385,-66.28152
11.729049,-80.80544
9.692903,-67.31649
-1.116502,-24.230738
-0.15937212,-80.23866
-43.810703,14.1585
1.2338688,-84.449165
9.824546,-68.91204
9.727886,-68.69063
-10.101936,-75.58777
9.2168045,-67.703804
9.4143505,-67.46319
9.20316,-67.201546
9.123886,-67.64245
8.974873,-67.158226
-0.9675927,-24.311337
10.20117,-50.293575
8.864632,-25.913994
10.06935,-50.544205
-40.531334,5.1215134
-38.3622,-0.77653354
9.004171,-66.79319
9.107706,-66.94209
7.3415074,-25.715435
7.411956,-25.679813
-36.75561,5.7931013
19.30113,-51.324512
19.374086,-51.258568
-14.826523,-48.112396
-43.817333,14.168898
-1.1965861,-24.021088
-36.61707,5.7337785
-36.58713,5.024142
7.382329,-46.25685
7.2945113,-46.41526
7.2977777,-51.57637
5.9467063,-55.889668
5.264757,-66.039604
7.181303,-46.678116
-39.441395,-46.049232
-33.796097,-60.633823
-37.712524,-41.753
-38.05651,-44.625904
Python代码:
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
def KMean_graph(csv_path="data.csv", n_clusters=5, k_min=2, k_max=16):
    """Run K-Means clustering on a 2-D CSV dataset and visualize the result.

    First sweeps k over ``[k_min, k_max)`` and plots SSE (inertia) per k so
    the elbow can be read off visually (saved to ``SSE.png``), then fits a
    final model with ``n_clusters`` clusters, labels every sample, and saves
    a colored scatter plot with the centroids to ``K-means.png``.

    Args:
        csv_path: Path to the input CSV; each row is one sample.
        n_clusters: Number of clusters for the final model (pick via the elbow plot).
        k_min, k_max: Half-open range of k values to try for the elbow method.
    """
    pd_down_data = pd.read_csv(csv_path)
    down_data = np.array(pd_down_data)

    # Elbow method: record the SSE (km.inertia_) for each candidate k.
    SSE = []
    for k in range(k_min, k_max):
        km = KMeans(n_clusters=k)
        km.fit(down_data)
        SSE.append(km.inertia_)
    xx = range(k_min, k_max)
    plt.xlabel("k")
    plt.ylabel("SSE")
    plt.plot(xx, SSE, "o-")
    plt.savefig("SSE.png")
    plt.show()

    D_data = pd_down_data
    # Number of feature columns; derived from the data instead of hard-coding 2.
    weidu = down_data.shape[1]
    km = KMeans(n_clusters=n_clusters).fit(down_data)  # final model after choosing k
    print("质心")
    center = km.cluster_centers_
    print(center)
    D_data["cluster"] = km.labels_  # attach the cluster label to每个样本 (each sample)
    print(D_data)

    # Start a fresh figure so the scatter does not draw on the SSE plot's axes.
    plt.figure()
    plt.rcParams["font.size"] = 14
    colors = np.array(["red", "gray", "orange", "pink", "blue", "black"])
    # Rename columns to "0", "1", ... plus the trailing "cluster" column
    # (features came from t-SNE, so they have no meaningful names).
    new_col = [str(i) for i in range(weidu)]
    new_col.append("cluster")
    D_data.columns = new_col
    ele_x = "0"  # first feature column name
    ele_y = "1"  # second feature column name
    xx = np.array(D_data[ele_x])
    yy = np.array(D_data[ele_y])
    cc = np.array(D_data["cluster"])
    # Modulo keeps the color lookup valid even when n_clusters > len(colors).
    plt.scatter(xx, yy, c=colors[cc % len(colors)])
    plt.scatter(center[:, 0], center[:, 1], marker="o", s=15, c="black")  # centroids
    plt.xlabel(ele_x)
    plt.ylabel(ele_y)
    plt.savefig("K-means.png")
    plt.show()
def main():
    """Entry point: run the K-Means elbow analysis and clustering demo."""
    KMean_graph()


if __name__ == "__main__":
    main()
大体也看得出是5类吧。