k均值聚类和二分k均值聚类

一、参考《机器学习与实践》

K均值聚类是一种通过距离公式把属性相近对象划分到同一类的算法,此算法对异常点和质心的选取较为敏感。二分k均值聚类是K均值聚类的改进(还有一种是K中值聚类),是一种层次聚类方法,以K均值聚类为基础,通过比较和方差得到最好的分法。

K均值聚类步骤

1.得到K个数据范围类的随机质心

2.把数据划分给最近的质心得到K个簇

3.通过每个簇的平均值算出新的质心

4.重复2和3直到不再改变

二分K均值聚类

1.初始化一个质心,并算出每个点到质心的距离

2.分成两个簇后计算和方差选取最小误差为最好的分配方法

3.重新分配簇和质心

4.重复2和3,直到达到K个簇

python代码实现:

import numpy as np
from matplotlib import pyplot as plt
# Demo data set: 80 two-dimensional sample points used to exercise the
# clustering class defined below.
data=np.array( [
        [1.658985, 4.285136],
        [-3.453687, 3.424321],
        [4.838138, -1.151539],
        [-5.379713, -3.362104],
        [0.972564, 2.924086],
        [-3.567919, 1.531611],
        [0.450614, -3.302219],
        [-3.487105, -1.724432],
        [2.668759, 1.594842],
        [-3.156485, 3.191137],
        [3.165506, -3.999838],
        [-2.786837, -3.099354],
        [4.208187, 2.984927],
        [-2.123337, 2.943366],
        [0.704199, -0.479481],
        [-0.392370, -3.963704],
        [2.831667, 1.574018],
        [-0.790153, 3.343144],
        [2.943496, -3.357075],
        [-3.195883, -2.283926],
        [2.336445, 2.875106],
        [-1.786345, 2.554248],
        [2.190101, -1.906020],
        [-3.403367, -2.778288],
        [1.778124, 3.880832],
        [-1.688346, 2.230267],
        [2.592976, -2.054368],
        [-4.007257, -3.207066],
        [2.257734, 3.387564],
        [-2.679011, 0.785119],
        [0.939512, -4.023563],
        [-3.674424, -2.261084],
        [2.046259, 2.735279],
        [-3.189470, 1.780269],
        [4.372646, -0.822248],
        [-2.579316, -3.497576],
        [1.889034, 5.190400],
        [-0.798747, 2.185588],
        [2.836520, -2.658556],
        [-3.837877, -3.253815],
        [2.096701, 3.886007],
        [-2.709034, 2.923887],
        [3.367037, -3.184789],
        [-2.121479, -4.232586],
        [2.329546, 3.179764],
        [-3.284816, 3.273099],
        [3.091414, -3.815232],
        [-3.762093, -2.432191],
        [3.542056, 2.778832],
        [-1.736822, 4.241041],
        [2.127073, -2.983680],
        [-4.323818, -3.938116],
        [3.792121, 5.135768],
        [-4.786473, 3.358547],
        [2.624081, -3.260715],
        [-4.009299, -2.978115],
        [2.493525, 1.963710],
        [-2.513661, 2.642162],
        [1.864375, -3.176309],
        [-3.171184, -3.572452],
        [2.894220, 2.489128],
        [-2.562539, 2.884438],
        [3.491078, -3.947487],
        [-2.565729, -2.012114],
        [3.332948, 3.983102],
        [-1.616805, 3.573188],
        [2.280615, -2.559444],
        [-2.651229, -3.103198],
        [2.321395, 3.154987],
        [-1.685703, 2.939697],
        [3.031012, -3.620252],
        [-4.599622, -2.185829],
        [4.196223, 1.126677],
        [-2.133863, 3.093686],
        [4.668892, -2.562705],
        [-2.793241, -2.149706],
        [2.884105, 3.043438],
        [-2.967647, 2.848696],
        [4.479332, -1.764772],
        [-4.905566, -2.911070]
        ])

class cluster():
    def __init__(self,data,classNum):
        self.__data=np.array(data)
        if self.__data.ndim==1:
            self.__data = np.c_[np.ones(self.__data.shape[0]),self.__data]
        self.__classNum=classNum
        self.__elementNum, self.__dimension = data.shape
    def __randCenter(self,data,classNum):
        dimension=data.ndim
        centroids=np.ones((classNum,dimension))
        for i in range(dimension):
            min=np.min(data[:,i])
            max=np.max(data[:,i])
            centroids[:,i]=(min+(max-min)*np.random.rand(classNum,1))[:,0]
        return centroids
    def dist(self,pA,pB):
        return np.sqrt(np.sum(np.power(pA-pB,2)))
    def __kMeans(self,data,classNum):
        #初始化一个二维数组存储最小距离的index和最小距离
        elementNum,dimension = data.shape
        clusterList=np.zeros((elementNum,2))
        indexList=np.array([-1]*elementNum)
        centroids=self.__randCenter(data,classNum)
        while True:
            for i in range(elementNum):
                minDist=np.inf
                minIndex=-1
                for j in range(classNum):
                    currentDist=self.dist(centroids[j],data[i])
                    if minDist > currentDist:
                        minDist=currentDist
                        minIndex=j
                clusterList[i]=minIndex,minDist**2
            for x in range(classNum):
                #指定index输出self.__data[[0, 1, 2,]]
                currentCluster=data[np.nonzero(clusterList[:,0]==x)]
                if currentCluster.any():
                    centroids[x]=np.mean(currentCluster,axis=0)
            #对比两个数组是否全部相同,如果相同则跳出循环
            if (indexList==clusterList[:,0]).all():
                break
            else:
                indexList = clusterList[:,0].copy()
        return centroids,clusterList
    def kMeans(self):
        self.__centroids,self.__clusterList=self.__kMeans(self.__data,self.__classNum)
        return self.__centroids,self.__clusterList
    def bikMeans(self):
        elementNum, dimension = data.shape
        #初始化一个质心
        centList=[np.mean(self.__data,axis=0)]
        # 初始化一个二维数组存储最小距离的index和最小距离
        clusterList = np.zeros((elementNum, 2))
        #计算每个点到初始质心的距离
        for i in range(elementNum):
            clusterList[:,1][i]=self.dist(self.__data[i],centList[0])**2
        while(len(centList)

 

你可能感兴趣的:(机器算法)