'''
# author: 影子
# datetime: 2021-05-18 12:10
# ide: PyCharm
target : 完成一个简单的KMeans聚类算法
'''
from numpy import *
def Euclidean_Distance(vecA, vecB):
    """Return the Euclidean (L2) distance between two vectors.

    :param vecA: first vector (list, tuple, ndarray or matrix row)
    :param vecB: second vector, broadcast-compatible with ``vecA``
    :return: square root of the summed squared element-wise differences
    """
    # Work in floating point: unsigned integer inputs would otherwise wrap
    # around on subtraction/squaring, and plain Python sequences would not
    # support the arithmetic at all.
    diff = asarray(vecA, dtype=float) - asarray(vecB, dtype=float)
    return sqrt(power(diff, 2).sum())
def Rand_Cent(DataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``DataSet``.

    :param DataSet: two-dimensional data, one sample per row and one
        feature per column (ndarray or matrix)
    :param k: number of centroids to generate
    :return: ``k`` x ``n`` matrix (``n`` = number of columns of ``DataSet``);
        column ``j`` holds values drawn between the minimum and maximum of
        feature ``j``
    """
    points = asarray(DataSet, dtype=float)
    n = points.shape[1]
    lows = points.min(axis=0)
    spans = points.max(axis=0) - lows
    # Draw one row of k random numbers per feature and transpose to (k, n),
    # so each feature column is scaled by its own range and offset.
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    return asmatrix(lows + spans * random.rand(n, k).T)
def kMeans(DataSet, k, Dist=Euclidean_Distance, set_cent=Rand_Cent):
    """Cluster the rows of ``DataSet`` into ``k`` groups with k-means.

    Each pass assigns every sample to the centroid with the smallest
    ``Dist`` value and then moves each centroid to the mean of the samples
    assigned to it. Passes repeat until no sample changes cluster.

    :param DataSet: data to cluster, one sample per row
    :param k: number of clusters
    :param Dist: distance function, called as ``Dist(centroid, sample)``;
        defaults to the Euclidean distance
    :param set_cent: function producing the initial centroids, called as
        ``set_cent(DataSet, k)``; defaults to random centroids
    :return: ``(centroids, clusterAssment)`` where ``clusterAssment`` is an
        ``m`` x 2 matrix holding, per sample, the assigned cluster index
        (column 0) and the distance to that cluster's centroid (column 1)
    """
    m = shape(DataSet)[0]
    centroids = set_cent(DataSet, k)
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible cluster index so that samples landing in
    # cluster 0 on the first pass still count as a change; otherwise the
    # loop could stop after a single pass.
    clusterAssment[:, 0] = -1
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: find the nearest centroid for every sample.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = Dist(centroids[j, :], DataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist
        # Update step: move each centroid to the mean of its members.
        for cent_i in range(k):
            # clusterAssment[:, 0] holds the cluster index of every sample;
            # .A turns that matrix column into a plain ndarray so the
            # comparison yields a boolean array, and nonzero(...)[0] gives
            # the row numbers of the samples belonging to cluster cent_i.
            members = DataSet[nonzero(clusterAssment[:, 0].A == cent_i)[0]]
            # An empty cluster has no mean (it would be NaN); keep the
            # previous centroid in that case.
            if members.shape[0] > 0:
                centroids[cent_i, :] = mean(members, axis=0)
    return centroids, clusterAssment
def show(DataSet, k, centroids, clusterAssment):
    """Plot the clustered samples and their centroids with matplotlib.

    Only the first two columns of ``DataSet`` and ``centroids`` are drawn.

    :param DataSet: clustered data, one sample per row
    :param k: number of centroids to draw
    :param centroids: centroid coordinates, one centroid per row
    :param clusterAssment: per-sample assignment; column 0 is the cluster
        index used to choose the marker
    """
    from matplotlib import pyplot as plt

    # matplotlib format strings: marker symbol followed by a colour letter.
    point_marks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    centroid_marks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']

    data = asarray(DataSet)
    labels = asarray(clusterAssment)[:, 0].astype(int)
    # Draw each cluster with a single call; the modulo cycles through the
    # marker list so more clusters than markers does not raise IndexError.
    for label in unique(labels):
        members = data[labels == label]
        plt.plot(members[:, 0], members[:, 1],
                 point_marks[label % len(point_marks)])
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1],
                 centroid_marks[i % len(centroid_marks)], markersize=20)
    plt.show()
def main():
    """Run a demo: cluster 100 random 2-D samples into 4 groups and plot them."""
    k = 4
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    DataSet = asmatrix(random.rand(100, 2))
    myCentroids, clustAssing = kMeans(DataSet, k)
    print('聚类的质心为')
    print(myCentroids)
    show(DataSet, k, myCentroids, clustAssing)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()