以下实现KMeans++算法的Step1:
import random
import numpy
from numpy import mat, shape, zeros, ones
def cent_improve(data_set: numpy.matrix, k: int) -> numpy.matrix:
    """Pick ``k`` initial cluster centres using the KMeans++ seeding rule.

    The first centre is a row of ``data_set`` drawn uniformly at random.
    Each further centre is a row drawn with probability proportional to the
    squared Euclidean distance between that row and its nearest centre
    chosen so far.  Randomness comes from the standard ``random`` module, so
    ``random.seed`` makes the selection reproducible.

    When every row already coincides with a chosen centre (duplicate rows,
    or ``k`` larger than the number of distinct rows) the distance weights
    are all zero; in that case the next centre is drawn uniformly at random
    instead of being left as a row of zeros.

    Args:
        data_set: ``m x n`` matrix with one sample per row.  Any 2-D
            array-like convertible to float is accepted.
        k: Number of centres to select.

    Returns:
        A ``k x n`` float matrix whose rows are copies of rows of
        ``data_set``.

    Raises:
        IndexError: If ``data_set`` has no rows, or if ``k`` is 0.
        ValueError: If ``k`` is negative.
    """
    # Work on a plain 2-D ndarray so row arithmetic broadcasts predictably.
    points = numpy.asarray(data_set, dtype=float)
    m, n = points.shape
    centroids = numpy.asmatrix(numpy.zeros((k, n)))

    # Squared distance from every sample to its nearest chosen centre.
    nearest_sq = numpy.full(m, numpy.inf)

    chosen = random.choice(range(m))  # first centre: uniform draw
    centroids[0, :] = points[chosen]

    for r in range(1, k):
        # Only the most recently added centre can shrink a nearest distance.
        diff = points - points[chosen]
        numpy.minimum(nearest_sq, numpy.sum(diff * diff, axis=1), out=nearest_sq)

        total = nearest_sq.sum()
        if total > 0 and numpy.isfinite(total):
            cumulative = numpy.cumsum(nearest_sq / total)
            # First index whose cumulative probability exceeds the draw.
            chosen = int(
                numpy.searchsorted(cumulative, random.random(), side="right")
            )
            if chosen >= m:
                # Round-off can leave cumulative[-1] slightly below 1, so a
                # draw very close to 1 matches nothing; fall back to the
                # last sample that has a non-zero weight.
                chosen = int(numpy.flatnonzero(nearest_sq)[-1])
        else:
            # Degenerate weights (all zero, or not finite): no sample is
            # preferable to another, so draw uniformly.
            chosen = random.choice(range(m))

        centroids[r, :] = points[chosen]

    return centroids
# Demo input: 30 two-dimensional sample points, one [x, y] pair per row.
_SAMPLE_POINTS = [
    [-1.26, 0.46], [-1.15, 0.49], [-1.19, 0.36],
    [-1.33, 0.28], [-1.06, 0.22], [-1.27, 0.03],
    [-1.28, 0.15], [-1.06, 0.08], [-1.00, 0.38],
    [-0.44, 0.29], [-0.37, 0.45], [-0.22, 0.36],
    [-0.34, 0.18], [-0.42, 0.06], [-0.11, 0.12],
    [-0.17, 0.32], [-0.27, 0.08], [-0.49, -0.34],
    [-0.39, -0.28], [-0.40, -0.45], [-0.15, -0.33],
    [-0.15, -0.21], [-0.33, -0.30], [-0.23, -0.45],
    [-0.27, -0.59], [-0.61, -0.65], [-0.61, -0.53],
    [-0.52, -0.53], [-0.42, -0.56], [-1.39, -0.26],
]
testdata = mat(_SAMPLE_POINTS)

# Select three initial cluster centres from the sample data and show them.
_initial_centroids = cent_improve(testdata, 3)
print(_initial_centroids)