python-使用numpy库方法实现kmeans算法

python-使用numpy库方法实现kmeans算法

在老师的带领下,重新用numpy实现了一下kmeans算法


import random
import matplotlib.pyplot as plt
import numpy as np
import time

class KMeans():
    """K-means clustering implemented with NumPy.

    Centres are seeded with ``k`` distinct rows drawn at random from the
    training data, then refined by alternating "assign every point to its
    nearest centre" and "move every centre to the mean of its points".
    Centres found by a previous ``fit`` call are kept and reused as the
    starting point of the next one.
    """

    def __init__(self, k=1):
        '''
        :param k: number of clusters (must be at least 1)
        :raises ValueError: if k is smaller than 1
        '''
        if k < 1:
            raise ValueError("k must be at least 1")
        self.__k = k
        self.__data = None         # training data as a 2-D numpy array
        self.__pointCenter = None  # current centres, shape (k, n_features)
        self.__result = [[] for _ in range(k)]  # one list of points per cluster

    def fit(self, data, threshold, times=50000):
        '''
        Train the model.

        :param data: training data, array-like of shape (n_points, n_features)
        :param threshold: stop once the mean distance moved by the centres
                          during one update is not greater than this value
        :param times: maximum number of refinement iterations
        :return: tuple ``(centres, result)`` where ``centres`` is a
                 (k, n_features) array and ``result`` is a list of k lists
                 holding the points (1-D arrays) assigned to each centre
        :raises ValueError: if data is empty, is not 2-D, or holds fewer
                            than k distinct points
        '''
        points = np.asarray(data)
        if points.ndim != 2 or len(points) == 0:
            raise ValueError("data must be a non-empty 2-D array")
        self.__data = points
        self.randomCenter()

        oldCenterPoint = self.__pointCenter
        # Rebuilt from scratch on every call so that repeated fits do not
        # accumulate points left over from an earlier run.
        self.__result = self._assign(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)

        # `times` bounds the loop so that a threshold which can never be
        # reached does not make it spin forever.
        while times > 0 and self._meanShift(oldCenterPoint, newCenterPoint) > threshold:
            times -= 1
            oldCenterPoint = newCenterPoint
            self.__result = self._assign(newCenterPoint)
            newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)

        self.__pointCenter = newCenterPoint
        return newCenterPoint, self.__result

    def _meanShift(self, old, new):
        # Average Euclidean distance between matching old and new centres.
        return np.sum(np.sum((old - new) ** 2, axis=1) ** 0.5) / self.__k

    def _assign(self, centers):
        # Group the training points by the index of their nearest centre.
        distances = self.calPointCenterDistance(centers, self.__data)
        nearest = np.argmin(distances.reshape(len(self.__data), -1), axis=1)
        return [list(self.__data[nearest == label]) for label in range(self.__k)]

    def calPointCenterDistance(self, center, data):
        '''
        Compute the Euclidean distance from every point to every centre.

        :param center: centres, array-like of shape (k, n_features)
        :param data: points, array-like of shape (n_points, n_features)
        :return: array of shape (n_points, 1, k); entry [i, 0, j] is the
                 distance from point i to centre j
        '''
        center = np.asarray(center)
        data = np.asarray(data)
        # Broadcasting builds an (n_points, k, n_features) difference array,
        # which replaces the per-point Python loop.
        difference = data[:, np.newaxis, :] - center[np.newaxis, :, :]
        distance = np.sum(difference ** 2, axis=2) ** 0.5
        # The singleton middle axis keeps the shape earlier versions returned.
        return distance[:, np.newaxis, :]

    def calNewPointCenter(self, result, fallback=None):
        '''
        Compute the new centres as the mean of each cluster.

        :param result: list of clusters, each a list of points
        :param fallback: optional (k, n_features) array; row i is used as
                         the centre of cluster i when that cluster is empty
        :return: array of shape (k, n_features), or None if result is empty
        :raises ValueError: if a cluster is empty and no fallback is given
        '''
        centers = []
        for label, members in enumerate(result):
            if len(members) > 0:
                centers.append(np.mean(np.asarray(members), axis=0))
            elif fallback is not None:
                # The mean of an empty cluster is undefined (NaN), so the
                # centre stays where it was.
                centers.append(np.asarray(fallback[label]))
            else:
                raise ValueError("cluster %d is empty" % label)
        if not centers:
            return None
        return np.vstack(centers)

    def randomCenter(self):
        '''
        Pick the initial k centres at random from the training data.

        Does nothing when k centres are already stored.

        :return: None
        :raises ValueError: if no data is stored or the data holds fewer
                            than k distinct points
        '''
        if self.__pointCenter is not None and len(self.__pointCenter) >= self.__k:
            return
        if self.__data is None:
            raise ValueError("no training data available")
        # Without this check the rejection loop below could never finish.
        if len(np.unique(self.__data, axis=0)) < self.__k:
            raise ValueError("data holds fewer than k distinct points")

        # `is None` rather than truthiness: the truth value of a
        # multi-element array is ambiguous and raises.
        if self.__pointCenter is None:
            index = random.randint(0, len(self.__data) - 1)
            self.__pointCenter = np.array([self.__data[index]])

        while len(self.__pointCenter) < self.__k:
            index = random.randint(0, len(self.__data) - 1)
            candidate = self.__data[index]
            # Compare whole rows; `candidate in centres` would be true as
            # soon as a single coordinate matched.
            duplicate = np.any(np.all(self.__pointCenter == candidate, axis=1))
            if not duplicate:
                self.__pointCenter = np.vstack((self.__pointCenter, candidate))

if __name__ == "__main__":
    # Demo: cluster 100000 random 2-D integer points into 5 groups, print
    # the elapsed time and the centres, then show a scatter plot.
    data = np.random.randint(0, 100, 200000).reshape(100000, 2)

    startTime = time.time()
    kmeans = KMeans(k=5)
    centerPoint, result = kmeans.fit(data, 0.0001)
    print(time.time() - startTime)
    print(centerPoint)

    plt.plot()
    plt.title("KMeans Classification")

    tempx, tempy, color = [], [], []
    for label, cluster in enumerate(result):
        # Split the cluster's points into one list per coordinate.
        dimensions = len(cluster[0])
        columns = [[point[axis] for point in cluster] for axis in range(dimensions)]
        tempx.extend(columns[0])
        tempy.extend(columns[1])
        # Colour values step by 2 from one cluster to the next.
        color.extend([2 * label] * len(columns[0]))

    plt.scatter(tempx, tempy, c=color, s=30)
    plt.show()


**以下为效果展示图**
python-使用numpy库方法实现kmeans算法_第1张图片

你可能感兴趣的:(python-使用numpy库方法实现kmeans算法)