话不多说,直接上代码,每行代码都是人肉产出。
代码中提供了停止k-means继续聚类的条件,在__init__()函数入参指定:
def __init__(self, samples, k, loopLimit, maxDistance)
1)最大Loop次数,就是loopLimit
2)前后两次计算得到的质心集合之间的最大距离maxDist不大于(小于等于)maxDistance
注:maxDist的详细含义详见代码中的CKM类的对象接口max_distance()
代码中k-means停止的条件:loop次数达到loopLimit 或者 maxDist <= maxDistance
初始质心的初始化在此处就简单选取了样本集中的前k个样本
但是初始质心的初始化很重要,初始值不同会影响聚类的结果,后面会用一篇文章详解:初始质心的最佳初始化方法
# -*- coding: utf-8 -*-
"""
Author:蔚蓝的天空Tom
Talk is cheap, show me the code
Aim:实现k-means算法, 此处用python class机制来实现
"""
import numpy as np
import matplotlib.pyplot as plt
class CKM(object):
    """Minimal k-means clusterer that runs as soon as it is constructed.

    The first ``k`` samples seed the centroids. Each pass assigns every
    sample to its nearest centroid and then recomputes each centroid as the
    mean of its cluster. Iteration stops once ``loopLimit`` passes have run,
    or once no centroid moved farther than ``maxDistance`` in the last pass.

    Attributes:
        samples: the sample set as passed in, one sample per row.
        k: number of clusters.
        loopLimit: maximum number of passes.
        maxDist: centroid-movement threshold that ends the iteration.
        cents: list of centroids; ``cents[i]`` belongs to cluster id ``i``.
        clusters: dict mapping cluster id -> list of member samples. Ids of
            clusters that received no sample are absent.
    """

    def __init__(self, samples, k, loopLimit, maxDistance):
        """Store the parameters and run the clustering immediately.

        :param samples: 2-D array-like, one sample per row
        :param k: number of clusters, 1 <= k <= number of samples
        :param loopLimit: maximum number of assign/update passes
        :param maxDistance: stop once the largest centroid movement of a
            pass is <= this value
        :raises ValueError: if k is outside 1..len(samples)
        """
        self.samples = samples
        self.k = k
        self.loopLimit = loopLimit
        self.maxDist = maxDistance
        self.cents = []
        self.clusters = {}
        self.work()

    @staticmethod
    def _euclidean(p1, p2):
        """Return the unrounded Euclidean distance between two points."""
        diff = np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float)
        return float(np.sqrt(np.sum(diff ** 2)))

    @staticmethod
    def distance(p1, p2):
        """Return the Euclidean distance between p1 and p2, rounded to 2 decimals.

        Declared static so it can be called both as ``CKM.distance(a, b)``
        and on an instance.

        :param p1: point such as [x1, y1]
        :param p2: point such as [x2, y2]
        :return: Euclidean distance as a float with two decimal places
        """
        dist = CKM._euclidean(p1, p2)
        return float('%.2f' % dist)

    def max_distance(self, A, B):
        """Return the largest distance between same-index points of A and B.

        :param A: e.g. [[1, 1], [2, 2]]
        :param B: e.g. [[3, 5], [6, 8]]
        :return: the largest pairwise distance
        :Note: [1, 1]-[3, 5] is 4.47 and [2, 2]-[6, 8] is 7.21, so the
            result for the example above is 7.21.
        """
        distances = []
        for i, a in enumerate(A):
            p1, p2 = np.array(a), np.array(B[i])
            print('p1:', p1, 'p2:', p2)
            distances.append(self._euclidean(p1, p2))
        print('distances:', distances, 'maxDistance:', max(distances))
        return max(distances)

    def initCentroids(self):
        """Use the first k samples as the initial centroids.

        :raises ValueError: if k is outside 1..len(samples)
        """
        if not 0 < self.k <= len(self.samples):
            raise ValueError(
                'k must be between 1 and the number of samples (%d), got %r'
                % (len(self.samples), self.k))
        self.cents = [self.samples[i] for i in range(self.k)]
        print('init centroids:\n', np.array(self.cents))

    def calc_centroids(self):
        """Recompute every centroid as the per-attribute mean of its cluster.

        :Note: centroids are stored as ``[[1, 1], [2, 2]]`` where the first
            entry is the centroid of cluster 0, the second of cluster 1, ...
        """
        print('\n\nold cents:', self.cents)
        previous = self.cents
        self.cents = []
        # Walk the ids 0..k-1 rather than the dict keys: the dict is filled
        # in first-seen order, which would otherwise misalign cents[i] with
        # cluster id i.
        for cid in range(self.k):
            members = self.clusters.get(cid)
            if members:
                cent = np.mean(np.array(members, dtype=float), axis=0).tolist()
            else:
                # An empty cluster keeps its previous centroid so that the
                # centroid list always has exactly k entries.
                cent = np.asarray(previous[cid], dtype=float).tolist()
            self.cents.append(cent)
        print('new cents:', self.cents)

    def build_Clusters(self):
        """Assign every sample to the cluster of its nearest centroid.

        :Note: clusters are a dict, key = cluster id, value = member list:
            {0: [[1, 1]],
             1: [[2, 2], [3, 1], [6, 4], [7, 5], [8, 4]]}
        """
        print('old clusters:', self.clusters)
        self.clusters = {}
        for e in self.samples:
            distances = [self._euclidean(e, c) for c in self.cents]
            print('distances:', distances)
            # argmin returns the first minimum, so ties go to the lower id.
            cid = int(np.argmin(distances))
            self.clusters.setdefault(cid, []).append(np.asarray(e).tolist())
        print('new clusters:\n', self.clusters)

    def work(self):
        """Run the assign/update loop until a stop condition is met."""
        self.initCentroids()
        # Seed maxDist above the threshold so the first pass always runs
        # (provided loopLimit > 0). Afterwards the loop ends when the pass
        # budget is used up or no centroid moved farther than self.maxDist.
        times, maxDist = self.loopLimit, self.maxDist + 1
        while times > 0 and maxDist > self.maxDist:
            self.build_Clusters()
            old_cents = self.cents
            self.calc_centroids()
            times -= 1
            maxDist = self.max_distance(old_cents, self.cents)

    def GetClusters(self):
        """Return the dict mapping cluster id -> list of member samples."""
        return self.clusters

    def GetCentroids(self):
        """Return the list of centroids, indexed by cluster id."""
        return self.cents
def CKM_test():
    """Cluster six 2-D demo points into two groups and plot the result.

    Cluster members are drawn as dots and centroids as '+' markers, each in
    the colour assigned to its cluster id.
    """
    samples = np.array([[1, 1],  # sample 1
                        [2, 2],  # sample 2
                        [3, 1],  # sample 3
                        [6, 4],
                        [7, 5],
                        [8, 4]])
    km = CKM(samples, k=2, loopLimit=5, maxDistance=1)

    # Plot the clustered sample set.
    plt.figure(0)
    color = ['red', 'blue']  # one colour per cluster id

    # Draw the members of every cluster.
    for cid, members in km.clusters.items():  # cid is the cluster id
        points = np.array(members)
        plt.scatter(points[:, 0], points[:, 1], c=color[cid])

    # Draw the centroids. Iterate over the centroids themselves: the number
    # of centroids is the first dimension of km.cents, not the second (which
    # is the number of attributes per sample).
    for cid, cent in enumerate(km.cents):
        plt.scatter(cent[0], cent[1], c=color[cid], marker='+')
    plt.show()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    CKM_test()
runfile('C:/Program Files/Windows NT/tom/km-tom.py', wdir='C:/Program Files/Windows NT/tom')
init centroids:
[[1 1]
[2 2]]
old clusters: {}
distances: [0.0, 1.4142135623730951]
distances: [1.4142135623730951, 0.0]
distances: [2.0, 1.4142135623730951]
distances: [5.8309518948453007, 4.4721359549995796]
distances: [7.2111025509279782, 5.8309518948453007]
distances: [7.6157731058639087, 6.324555320336759]
new clusters:
{0: [[1, 1]], 1: [[2, 2], [3, 1], [6, 4], [7, 5], [8, 4]]}
old cents: [array([1, 1]), array([2, 2])]
new cents: [[1.0, 1.0], [5.2000000000000002, 3.2000000000000002]]
p1: [1 1] p2: [ 1. 1.]
p1: [2 2] p2: [ 5.2 3.2]
distances: [0.0, 3.4176014981270129] maxDistance: 3.41760149813
old clusters: {0: [[1, 1]], 1: [[2, 2], [3, 1], [6, 4], [7, 5], [8, 4]]}
distances: [0.0, 4.7413078364518793]
distances: [1.4142135623730951, 3.4176014981270129]
distances: [2.0, 3.1112698372208092]
distances: [5.8309518948453007, 1.1313708498984758]
distances: [7.2111025509279782, 2.545584412271571]
distances: [7.6157731058639087, 2.9120439557122069]
new clusters:
{0: [[1, 1], [2, 2], [3, 1]], 1: [[6, 4], [7, 5], [8, 4]]}
old cents: [[1.0, 1.0], [5.2000000000000002, 3.2000000000000002]]
new cents: [[2.0, 1.3333333333333333], [7.0, 4.333333333333333]]
p1: [ 1. 1.] p2: [ 2. 1.33333333]
p1: [ 5.2 3.2] p2: [ 7. 4.33333333]
distances: [1.0540925533894598, 2.1270741511391753] maxDistance: 2.12707415114
old clusters: {0: [[1, 1], [2, 2], [3, 1]], 1: [[6, 4], [7, 5], [8, 4]]}
distances: [1.0540925533894598, 6.8637534273246663]
distances: [0.66666666666666674, 5.5176484524156164]
distances: [1.0540925533894598, 5.2068331172711027]
distances: [4.8074017006186525, 1.0540925533894596]
distances: [6.2003584125794244, 0.66666666666666696]
distances: [6.5659052011974035, 1.0540925533894596]
new clusters:
{0: [[1, 1], [2, 2], [3, 1]], 1: [[6, 4], [7, 5], [8, 4]]}
old cents: [[2.0, 1.3333333333333333], [7.0, 4.333333333333333]]
new cents: [[2.0, 1.3333333333333333], [7.0, 4.333333333333333]]
p1: [ 2. 1.33333333] p2: [ 2. 1.33333333]
p1: [ 7. 4.33333333] p2: [ 7. 4.33333333]
distances: [0.0, 0.0] maxDistance: 0.0
(end)