1.层次聚类的简单介绍
层次聚类方法对给定的数据集进行层次的分解,直到某种条件满足或者达到最大迭代次数。具体又可分为:
凝聚的层次聚类(AGNES算法):一种自底向上的策略,首先将每个对象作为一个簇,然后合并这些原子簇为越来越大的簇(一般是计算所有簇的中心之间的距离,选取距离最小的两个簇合并),直到某个终结条件被满足或者达到最大迭代次数。
分裂的层次聚类(DIANA算法):采用自顶向下的策略,它首先将所有对象置于一个簇中,然后逐渐细分为越来越小的簇(一般是每次迭代分裂一个簇为两个),直到达到了某个终结条件或者达到最大迭代次数。
2.凝聚型层次聚类的代码如下:
本例中,两个簇合并之后,会把它们中心的平均值作为新簇的中心。由于每次合并都要遍历所有的簇对来寻找距离最近的两个簇,因此计算量比较大。可以说本例是层次聚类最朴素的一种 Python 实现。谁有更好的方法,欢迎一起讨论!
代码如下:
import sys, os
import math
class Hierarchical:
    """One node of the binary merge tree built by ``hcluster``.

    Attributes:
        center: Coordinate vector of the node. For a merged node this is
            the center computed from its two children.
        left: First child node, or ``None`` for a leaf.
        right: Second child node, or ``None`` for a leaf.
        flag: Identifier of the node. ``hcluster`` uses it as part of the
            key of its pairwise-distance cache.
        distance: Distance between the two children at the time they were
            merged; ``0.0`` by default.
    """

    def __init__(self, center, left=None, right=None, flag=None, distance=0.0):
        self.center, self.flag = center, flag
        self.left, self.right = left, right
        self.distance = distance
def traverse(node):
    """Return the centers of all leaf nodes below ``node``, left to right.

    A node is a leaf when both its ``left`` and ``right`` attributes are
    ``None``. The walk uses an explicit stack instead of recursion so that
    a deep, chain-shaped merge tree cannot exceed the interpreter's
    recursion limit, and so that the result is built without repeated
    list concatenation.

    Args:
        node: Root of the (sub)tree; must expose ``left``, ``right`` and
            ``center`` attributes.

    Returns:
        A list with the ``center`` of every leaf, in left-to-right order.
    """
    leaves = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.left is None and current.right is None:
            leaves.append(current.center)
        else:
            # Push the right child first so the left subtree is popped
            # (and therefore emitted) before the right one.
            pending.append(current.right)
            pending.append(current.left)
    return leaves
def distance(v1, v2):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers with the same length as ``v1``.

    Returns:
        The square root of the sum of squared coordinate differences, as
        a float.

    Raises:
        SystemExit: With exit code 1 when the two vectors differ in
            length; a diagnostic is written to standard error first.
    """
    if len(v1) != len(v2):
        # The original ``print sys.stderr, ...`` printed the stream object
        # itself to stdout; write the message to stderr explicitly.
        sys.stderr.write("invalid v1 and v2 !\n")
        sys.exit(1)
    squared = sum((a - b) ** 2 for a, b in zip(v1, v2))
    return math.sqrt(squared)
def hcluster(data, n):
    """Agglomeratively cluster ``data`` until at most ``n`` clusters remain.

    Every point starts as its own cluster. On each pass the two clusters
    whose centers are closest are merged into a new cluster whose center
    is the element-wise mean of the two merged centers. Every pass scans
    all cluster pairs; distances already computed are cached by the pair
    of cluster flags.

    Args:
        data: Non-empty sequence of equal-length numeric vectors.
        n: Target number of clusters; must be at least 1.

    Returns:
        A list with one entry per remaining cluster; each entry is the
        list of original points (leaf centers) belonging to that cluster.

    Raises:
        SystemExit: With exit code 1 when ``data`` is empty or ``n`` is
            smaller than 1; a diagnostic is written to standard error.
    """
    if len(data) <= 0:
        sys.stderr.write("invalid data\n")
        sys.exit(1)
    if n < 1:
        # Without this guard the merge loop below could never terminate:
        # a single remaining cluster has no pair left to merge.
        sys.stderr.write("invalid n\n")
        sys.exit(1)

    clusters = [Hierarchical(point, flag=index) for index, point in enumerate(data)]
    distances = {}
    # Merged clusters get negative flags so they never collide with the
    # non-negative flags of the initial single-point clusters.
    currentCluster = -100

    while len(clusters) > n:
        closest = None
        minDist = None
        for i in range(len(clusters) - 1):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].flag, clusters[j].flag)
                if key not in distances:
                    distances[key] = distance(clusters[i].center, clusters[j].center)
                candidate = distances[key]
                # "<=" keeps the original tie-breaking: among equally
                # close pairs the last one scanned wins. Starting from
                # None (rather than a fixed large number) guarantees a
                # pair is always selected, however large the distances.
                if minDist is None or candidate <= minDist:
                    closest = (i, j)
                    minDist = candidate

        first, second = closest
        # Divide by 2.0 so integer coordinates are not truncated under
        # Python 2 integer division.
        newCenter = [
            (a + b) / 2.0
            for a, b in zip(clusters[first].center, clusters[second].center)
        ]
        newCluster = Hierarchical(
            newCenter, clusters[first], clusters[second], currentCluster, minDist
        )
        currentCluster -= 1
        # second > first, so delete the higher index first to keep the
        # lower index valid.
        del clusters[second]
        del clusters[first]
        clusters.append(newCluster)

    return [traverse(cluster) for cluster in clusters]
def loadData(filename):
    """Read whitespace-separated numeric rows from a text file.

    Lines with fewer than two whitespace-separated fields (including
    blank lines) are skipped. Every field of a kept line is converted
    with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of rows, each a list of floats.

    Raises:
        ValueError: If a field of a kept line cannot be parsed as a float.
    """
    dataList = []
    # The context manager closes the file even if float() raises.
    with open(filename, 'r') as infile:
        for line in infile:
            fields = line.strip().split()
            if len(fields) < 2:
                continue
            dataList.append([float(field) for field in fields])
    return dataList
if __name__ == '__main__':
    # Demo 1: cluster eight hard-coded 5-dimensional points into 3 groups.
    data = [
        [123, 321, 434, 4325, 345345],
        [23124, 141241, 434234, 9837489, 34743],
        [128937, 127, 12381, 424, 8945],
        [322, 4348, 5040, 8189, 2348],
        [51249, 42190, 2713, 2319, 4328],
        [13957, 1871829, 8712847, 34589, 30945],
        [1234, 45094, 23409, 13495, 348052],
        [49853, 3847, 4728, 4059, 5389],
    ]
    finalCluster = hcluster(data, 3)
    print(finalCluster)

    # Demo 2: cluster the rows read from testSet2.txt into 3 groups.
    data = loadData("testSet2.txt")
    finalCluster = hcluster(data, 3)
    print(finalCluster)
本文参考了博客 http://www.cnblogs.com/coser/archive/2013/04/10/3013044.html ,并根据自己的理解重新实现了一遍。