kmeans原理+python实现

kmeans
给定任意个 n 维数据点,以及已知的特征点(聚类中心)个数 k:先按 n 维空间中的最小欧氏距离,把每个数据点划分给离它最近的特征点;再用每一组数据点的均值更新对应的特征点;如此反复迭代,直到特征点的移动量小于可接受的误差。最后输出计算得到的 k 个特征点的 n 维坐标。

import random
import math
# Helper class that performs the two steps of every k-means iteration
class FindList():
    """Assignment step, centroid-update step and convergence test of k-means.

    Points and centroids are plain sequences of numbers. Distances are
    delegated to ``MathService.distance``.
    """

    def getCurrentResultList(self, currentList, currentRandom):
        """Assign every point to its nearest centroid.

        Args:
            currentList: the data points.
            currentRandom: the current centroids.

        Returns:
            A sorted list of ``[centroidIndex, pointIndex]`` pairs, so that
            all points of one cluster are adjacent. Ties go to the centroid
            with the lowest index.

        Raises:
            ValueError: if ``currentRandom`` is empty (``min`` of an empty
                list).
        """
        # One helper per pass instead of one per point/centroid pair.
        distance = MathService().distance
        resultList = []
        for pointIndex, point in enumerate(currentList):
            distances = [distance(point, centroid) for centroid in currentRandom]
            resultList.append([distances.index(min(distances)), pointIndex])
        resultList.sort()
        return resultList

    def getNewPoints(self, currentResultList, firstList, currentPoint, limitCondition):
        """Recompute every centroid as the mean of the points assigned to it.

        Args:
            currentResultList: ``[centroidIndex, pointIndex]`` pairs as
                returned by ``getCurrentResultList``.
            firstList: the data points that ``pointIndex`` refers to.
            currentPoint: the centroids used to build ``currentResultList``.
            limitCondition: largest total centroid movement that still
                counts as converged.

        Returns:
            ``0`` once the centroids have converged (the previous centroids
            are printed first), otherwise the list of new centroids, one per
            entry of ``currentPoint`` and in the same order.
        """
        # Per-cluster coordinate sums and member counts.
        sums = {}
        counts = {}
        for clusterIndex, pointIndex in currentResultList:
            point = firstList[pointIndex]
            if clusterIndex in sums:
                sums[clusterIndex] = [total + x for total, x in zip(sums[clusterIndex], point)]
                counts[clusterIndex] += 1
            else:
                sums[clusterIndex] = list(point)
                counts[clusterIndex] = 1

        reultList = []
        for clusterIndex, oldCentroid in enumerate(currentPoint):
            if clusterIndex in sums:
                memberCount = counts[clusterIndex]
                reultList.append([total / memberCount for total in sums[clusterIndex]])
            else:
                # A cluster that received no points keeps its old centroid.
                # Dropping it would reduce the number of centroids and
                # misalign the old/new pairing used by isEnd.
                reultList.append(list(oldCentroid))

        if self.isEnd(reultList, currentPoint, limitCondition) == 0:
            print('最后的点是')
            print(currentPoint)
            return 0
        return reultList

    def isEnd(self, reultList, currentPoint, limitCondition):
        """Measure how far the centroids moved between two iterations.

        Args:
            reultList: the new centroids.
            currentPoint: the previous centroids, in the same order.
            limitCondition: movement threshold.

        Returns:
            The total movement (square root of the summed squared centroid
            distances) when it exceeds ``limitCondition``; that value is
            also printed. Otherwise ``0``.
        """
        distance = MathService().distance
        count = math.sqrt(sum(distance(new, old) ** 2 for new, old in zip(reultList, currentPoint)))
        if count > limitCondition:
            print('count:' + str(count))
            return count
        return 0
# Math helper class: random test data, initial centroids, Euclidean distance
class MathService():
    """Stateless helpers for generating sample data and measuring distance."""

    def getRandomNum(self, minNum, maxNum):
        """Return a random integer N with ``minNum <= N <= maxNum``.

        Raises:
            ValueError: if ``minNum > maxNum`` (raised by ``random.randint``).
        """
        # The lower bound is used as given; negating it produced values
        # below minNum for any positive minNum.
        return random.randint(minNum, maxNum)

    def creatADimensionList(self, dimensionCount, minNum, maxNum):
        """Return one random point: ``dimensionCount`` integers in range."""
        return [self.getRandomNum(minNum, maxNum) for _ in range(dimensionCount)]

    def creatFirstList(self, dimensionCount=2, listLength=100, minNum=0, maxNum=100):
        """Return ``listLength`` random points of ``dimensionCount`` dimensions."""
        return [self.creatADimensionList(dimensionCount, minNum, maxNum) for _ in range(listLength)]

    def getFirstRandom(self, listLength, randomCount, firstList):
        """Pick the initial centroids from the data.

        Args:
            listLength: number of leading entries of ``firstList`` to choose
                from.
            randomCount: how many centroids to pick.
            firstList: the data points.

        Returns:
            ``randomCount`` points taken from pairwise distinct positions of
            ``firstList``. Two picks can still have equal coordinates if the
            data itself contains duplicate points.

        Raises:
            ValueError: if ``randomCount`` is negative or larger than
                ``listLength`` (raised by ``random.sample``).
        """
        # random.sample draws without replacement, so the indexes are
        # guaranteed to be distinct.
        chosenIndexes = random.sample(range(listLength), randomCount)
        return [firstList[i] for i in chosenIndexes]

    def distance(self, pointer1, pointer2):
        """Return the Euclidean distance between two points.

        Coordinates are paired with ``zip``, so extra coordinates of the
        longer point are ignored.
        """
        return math.sqrt(sum((x1 - x2) ** 2 for x1, x2 in zip(pointer1, pointer2)))

# Experiment parameters
dimensionCount = 2      # number of dimensions of every point
listLength = 100        # number of simulated data points
minNum = 0              # smallest simulated coordinate value
maxNum = 100            # largest simulated coordinate value
randomCount = 5         # number of centroids (k)
limitCondition = 0.001  # centroid movement accepted as converged

m1 = MathService()
f1 = FindList()

# Generate the random data set, then draw the k starting centroids from it
firstList = m1.creatFirstList(dimensionCount, listLength, minNum, maxNum)
firstRandom = m1.getFirstRandom(listLength, randomCount, firstList)
currentPoint = firstRandom

# getNewPoints returns 0 once the centroids have stopped moving
while currentPoint != 0:
    # Always cluster the full data set (firstList) against the latest centroids
    currentResultList = f1.getCurrentResultList(firstList, currentPoint)
    # Replace the centroids with the means of their clusters
    currentPoint = f1.getNewPoints(currentResultList, firstList, currentPoint, limitCondition)

# Final [centroidIndex, pointIndex] assignment
print(currentResultList)

你可能感兴趣的:(算法)