kmeans实例及代码

聚类与决策树等监督学习方法不同,属于无监督学习。也就是说数据样本只有特征x,没有给定标签y。聚类的目的是找到样本特征潜在的类别,将同类别的样本放在一起。

kmeans的具体逻辑如下:
1.随机选取k个簇心;
2.对于每一个样例,计算其属于的类;
3.循环完所有的样例后,重新计算每个簇的簇心;
4.重复第二步和第三步,直到簇心不再变化或达到最大迭代次数。

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load tab-separated numeric samples from a text file.
def loaddate(filename):
    """Read a tab-separated file of numbers into a DataFrame.

    Each non-blank line becomes one row; every tab-separated field on the
    line is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        pandas.DataFrame with one row per non-blank line and default
        integer row/column labels.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    datamat = []
    # The context manager closes the file even if parsing raises.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Build a concrete list; a lazy ``map`` object is not row data
            # on Python 3.
            datamat.append([float(field) for field in stripped.split('\t')])
    return pd.DataFrame(datamat)

# Euclidean distance between two vectors.
def distance(vecA, vecB):
    """Return the Euclidean (L2) distance between ``vecA`` and ``vecB``.

    Both arguments must support element-wise subtraction with each other
    (for example two NumPy arrays or two pandas Series).
    """
    delta = vecA - vecB
    squared = np.power(delta, 2)
    total = np.sum(squared)
    return np.sqrt(total)

# Generate random initial cluster centers.
def getcenter(data, k):
    """Draw ``k`` random centers inside the bounding box of ``data``.

    For every feature column, each center coordinate is sampled uniformly
    between that column's minimum and maximum.

    Args:
        data: pandas.DataFrame of shape (samples, features).
        k: Number of centers to generate.

    Returns:
        pandas.DataFrame of shape (k, features) with default integer
        row/column labels.
    """
    _, n = data.shape
    lows = data.min(axis=0).to_numpy(dtype=float)
    spans = data.max(axis=0).to_numpy(dtype=float) - lows
    # Size the result by the real feature count instead of a hard-coded 2.
    # Drawing an (n, k) block and transposing keeps the random stream in
    # the same column-by-column order as drawing k values per feature.
    draws = np.random.rand(n, k).T
    return pd.DataFrame(lows + spans * draws)

# Assign every sample to its nearest center, then recompute the centers.
def kmeans(data, k, maxiter):
    """Cluster ``data`` into ``k`` groups with the k-means algorithm.

    Starting from random centers (``getcenter``), the loop alternates
    between assigning each sample to its nearest center (``distance``) and
    moving each center to the mean of its assigned samples. It stops when
    no assignment changes or after ``maxiter`` iterations.

    Args:
        data: pandas.DataFrame of shape (samples, features).
        k: Number of clusters.
        maxiter: Maximum number of assign/update iterations.

    Returns:
        pandas.DataFrame with one row per cluster center.

    NOTE(review): rows are compared as pandas Series, which align on
    labels, so ``data`` is expected to carry the same default integer
    column labels as the centers (as ``loaddate`` produces) - confirm for
    other callers.
    """
    ceter = getcenter(data, k)
    m = data.shape[0]
    # -1 means "not assigned yet". Starting at 0 would make a first pass
    # that puts every sample in cluster 0 look like "nothing changed" and
    # end the loop before the moved centers are ever re-evaluated.
    labels = np.full(m, -1, dtype=int)
    itercount = 0
    clusterchange = True
    while itercount < maxiter and clusterchange:
        itercount += 1
        clusterchange = False
        # Assignment step: find the nearest center for each sample.
        for i in range(m):
            point = data.iloc[i, :]
            minindex = 0
            mindist = np.inf
            for j in range(k):
                dis = distance(point, ceter.iloc[j, :])
                if dis < mindist:
                    minindex = j
                    mindist = dis
            if labels[i] != minindex:
                clusterchange = True
                labels[i] = minindex
        # Update step: move each non-empty cluster's center to the mean of
        # its members; an empty cluster keeps its previous center.
        for cent in range(k):
            ptscluster = data.iloc[labels == cent, :]
            if ptscluster.shape[0] > 0:
                ceter.iloc[cent, :] = ptscluster.mean(axis=0)
    return ceter
                

if __name__ == '__main__':
    # Demo: cluster the samples from testSet.txt into two groups (at most
    # five iterations), then plot the samples as blue stars and the
    # resulting centers as red circles.
    samples = loaddate('testSet.txt')
    centers = kmeans(samples, 2, 5)
    sample_x, sample_y = samples.iloc[:, 0], samples.iloc[:, 1]
    center_x, center_y = centers.iloc[:, 0], centers.iloc[:, 1]
    plt.scatter(sample_x, sample_y, marker='*', c='b')
    plt.scatter(center_x, center_y, marker='o', c='r')
    plt.show()

你可能感兴趣的:(kmeans实例及代码)