一、参考《机器学习与实践》
K均值聚类是一种通过距离公式把属性相近对象划分到同一类的算法,此算法对异常点和质心的选取较为敏感。二分K均值聚类是K均值聚类的改进(还有一种是K中值聚类),是一种层次聚类方法,以K均值聚类为基础,通过比较和方差得到最好的分法。
K均值聚类步骤
1.得到K个数据范围内的随机质心
2.把数据划分给最近的质心得到K个簇
3.通过每个簇的平均值算出新的质心
4.重复2和3直到不再改变
二分K均值聚类
1.初始化质心并算出每个点到质心的距离
2.分成两个簇后计算和方差选取最小误差为最好的分配方法
3.重新分配簇和质心
4.重复2和3直到达到K个类
python代码实现:
import numpy as np
from matplotlib import pyplot as plt
# Demo data set: 80 2-D points forming four visually separable groups
# (roughly one per quadrant), used to exercise kMeans / bikMeans below.
data=np.array( [
[1.658985, 4.285136],
[-3.453687, 3.424321],
[4.838138, -1.151539],
[-5.379713, -3.362104],
[0.972564, 2.924086],
[-3.567919, 1.531611],
[0.450614, -3.302219],
[-3.487105, -1.724432],
[2.668759, 1.594842],
[-3.156485, 3.191137],
[3.165506, -3.999838],
[-2.786837, -3.099354],
[4.208187, 2.984927],
[-2.123337, 2.943366],
[0.704199, -0.479481],
[-0.392370, -3.963704],
[2.831667, 1.574018],
[-0.790153, 3.343144],
[2.943496, -3.357075],
[-3.195883, -2.283926],
[2.336445, 2.875106],
[-1.786345, 2.554248],
[2.190101, -1.906020],
[-3.403367, -2.778288],
[1.778124, 3.880832],
[-1.688346, 2.230267],
[2.592976, -2.054368],
[-4.007257, -3.207066],
[2.257734, 3.387564],
[-2.679011, 0.785119],
[0.939512, -4.023563],
[-3.674424, -2.261084],
[2.046259, 2.735279],
[-3.189470, 1.780269],
[4.372646, -0.822248],
[-2.579316, -3.497576],
[1.889034, 5.190400],
[-0.798747, 2.185588],
[2.836520, -2.658556],
[-3.837877, -3.253815],
[2.096701, 3.886007],
[-2.709034, 2.923887],
[3.367037, -3.184789],
[-2.121479, -4.232586],
[2.329546, 3.179764],
[-3.284816, 3.273099],
[3.091414, -3.815232],
[-3.762093, -2.432191],
[3.542056, 2.778832],
[-1.736822, 4.241041],
[2.127073, -2.983680],
[-4.323818, -3.938116],
[3.792121, 5.135768],
[-4.786473, 3.358547],
[2.624081, -3.260715],
[-4.009299, -2.978115],
[2.493525, 1.963710],
[-2.513661, 2.642162],
[1.864375, -3.176309],
[-3.171184, -3.572452],
[2.894220, 2.489128],
[-2.562539, 2.884438],
[3.491078, -3.947487],
[-2.565729, -2.012114],
[3.332948, 3.983102],
[-1.616805, 3.573188],
[2.280615, -2.559444],
[-2.651229, -3.103198],
[2.321395, 3.154987],
[-1.685703, 2.939697],
[3.031012, -3.620252],
[-4.599622, -2.185829],
[4.196223, 1.126677],
[-2.133863, 3.093686],
[4.668892, -2.562705],
[-2.793241, -2.149706],
[2.884105, 3.043438],
[-2.967647, 2.848696],
[4.479332, -1.764772],
[-4.905566, -2.911070]
])
class cluster():
def __init__(self,data,classNum):
self.__data=np.array(data)
if self.__data.ndim==1:
self.__data = np.c_[np.ones(self.__data.shape[0]),self.__data]
self.__classNum=classNum
self.__elementNum, self.__dimension = data.shape
def __randCenter(self,data,classNum):
dimension=data.ndim
centroids=np.ones((classNum,dimension))
for i in range(dimension):
min=np.min(data[:,i])
max=np.max(data[:,i])
centroids[:,i]=(min+(max-min)*np.random.rand(classNum,1))[:,0]
return centroids
def dist(self,pA,pB):
return np.sqrt(np.sum(np.power(pA-pB,2)))
def __kMeans(self,data,classNum):
#初始化一个二维数组存储最小距离的index和最小距离
elementNum,dimension = data.shape
clusterList=np.zeros((elementNum,2))
indexList=np.array([-1]*elementNum)
centroids=self.__randCenter(data,classNum)
while True:
for i in range(elementNum):
minDist=np.inf
minIndex=-1
for j in range(classNum):
currentDist=self.dist(centroids[j],data[i])
if minDist > currentDist:
minDist=currentDist
minIndex=j
clusterList[i]=minIndex,minDist**2
for x in range(classNum):
#指定index输出self.__data[[0, 1, 2,]]
currentCluster=data[np.nonzero(clusterList[:,0]==x)]
if currentCluster.any():
centroids[x]=np.mean(currentCluster,axis=0)
#对比两个数组是否全部相同,如果相同则跳出循环
if (indexList==clusterList[:,0]).all():
break
else:
indexList = clusterList[:,0].copy()
return centroids,clusterList
def kMeans(self):
self.__centroids,self.__clusterList=self.__kMeans(self.__data,self.__classNum)
return self.__centroids,self.__clusterList
def bikMeans(self):
elementNum, dimension = data.shape
#初始化一个质心
centList=[np.mean(self.__data,axis=0)]
# 初始化一个二维数组存储最小距离的index和最小距离
clusterList = np.zeros((elementNum, 2))
#计算每个点到初始质心的距离
for i in range(elementNum):
clusterList[:,1][i]=self.dist(self.__data[i],centList[0])**2
while(len(centList)