Hierarchical clustering (层次聚类,single/complete linkage)

网上似乎没有只用 numpy 实现 single/complete linkage 层次聚类的轮子,于是根据作业需求造了一个。

虽然都是层次聚类,但是基于 single/complete linkage 的做法和基于 average linkage 的做法着实不太一样。首先从直观角度来讲,后者每一次合并后都得重新算一次新的簇的中心结点是什么,复杂度非常高。而前者只用初始叶结点两两之间的距离作为最终所有簇间距的评估标准。


# Excerpt from Hierarchical.train: compute the distance between every pair of
# leaf nodes and rank the pairs by distance.
        for i in range(nodes_len-1):
            for j in range(i+1,nodes_len):
                # Key is the pair of leaf indices: each leaf's id is an
                # (index, point) tuple produced by enumerate(x).
                d_key = (nodes[i].id[0],nodes[j].id[0])
                # print nodes[i].id[1]
                # The second element of each id tuple is the point passed to
                # euler_distance.
                distance_list[d_key] = euler_distance(nodes[i].id[1],nodes[j].id[1])
                # Sort the (pair, distance) items by ascending distance.
                # NOTE(review): at this indentation the sort is repeated for
                # every pair; only the result of the last run is needed.
                rank_list = sorted(distance_list.items(),key = lambda item:item[1])


# Excerpt from Hierarchical.train: walk the ranked pairs and merge clusters.
# NOTE(review): this excerpt is incomplete -- the bodies of the two bare `if`
# statements below are missing, and its indentation is inconsistent.
loop_times = 0
        # NOTE(review): 91 is presumably the number of pairs among 14 points
        # (14*13/2) -- confirm; it is not derived from the data here.
        while loop_times < 91:
            # 14(leaf nodes) + 13(no-leaf nodes) = 27
            # NOTE(review): body missing; presumably stops the loop once
            # `nodes` holds self.k entries -- confirm.
            if len(nodes)>=self.k:
            # rank_list[loop_times] is a ((id1, id2), distance) item.
            nodes_id1,nodes_id2 = rank_list[loop_times][0]
            nodes1,nodes2 = nodes[nodes_id1],nodes[nodes_id2]
            # Follow the father links from each node up to the root of its tree.
            nodeptr1 = nodes1
            nodeptr2 = nodes2
            while nodeptr1.father!=None:
                nodeptr1 = nodeptr1.father
            while nodeptr2.father!=None:
                nodeptr2 = nodeptr2.father
            # If the two nodes share a root they are already in the same
            # cluster and need no merge.
            # NOTE(review): body missing; presumably advances loop_times and
            # skips to the next pair -- confirm.
            if nodeptr1==nodeptr2:
            # Merge: create a parent node over the two roots, recording the
            # pair distance and the summed counts.

            new_node = ClusterNode(left=nodeptr1,right=nodeptr2,distance=rank_list[loop_times][1],count=nodeptr1.count+nodeptr2.count,id=newnode_id_num)
            newnode_id_num +=1
            nodeptr1.father = new_node
            nodeptr2.father = new_node
            # print new_node.distance
            print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
            loop_times +=1


# -*- coding: utf-8 -*
import numpy as np
import math

# Euclidean (L2) distance between two vectors.
def euler_distance(a,b):
    """Return the Euclidean distance between two equal-shaped vectors.

    Both arguments may be any array-like (lists, tuples, numpy arrays).
    They are converted to floating point before subtracting, so unsigned
    or narrow integer inputs cannot wrap around or overflow.

    Args:
        a: first vector.
        b: second vector, same shape as ``a``.

    Returns:
        The distance sqrt(sum((a - b) ** 2)) as a numpy float.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))

# Node of the binary cluster tree.
class ClusterNode(object):
    """A single node in the cluster tree.

    Attributes:
        left, right: child nodes; both are None on a leaf.
        father: parent node, or None while the node is still a root.
        distance: distance recorded when the node was created by a merge
            (-1 by default).
        count: value stored by the caller; merges pass the sum of the two
            children's counts (1 by default).
        id: identifier supplied by the caller.
    """

    def __init__(self,left=None,right=None,distance=-1,count=1,id=None,father=None):
        # Tree links.
        self.father = father
        self.left, self.right = left, right
        # Payload.
        self.id = id
        self.count = count
        self.distance = distance

class Hierarchical(object):
    """Agglomerative single-linkage clustering over a tree of ClusterNode.

    All pairwise distances between the input points are ranked once. The
    pairs are then visited from closest to farthest, and whenever the two
    points of a pair still belong to different trees, the roots of those
    trees are joined under a new parent node.

    NOTE(review): the reviewed copy of this class had lost several statement
    bodies (the stop/skip branches of the merge loop, the append of merged
    nodes and the recursive labelling calls). They are restored here from
    the surrounding logic and comments -- confirm against the original.

    Attributes:
        k: stop threshold on the total number of tree nodes (leaves plus
            merged nodes). With n points, merging stops once the tree holds
            k nodes, which leaves max(2 * n - k, 1) clusters; k = 2 * n - 1
            merges everything into a single cluster.
        labels: per-point cluster labels, filled in by Label(); -1 means
            "not labelled yet". None before train() is called.
        nodes: all tree nodes, leaves first (in input order) followed by
            merged nodes in creation order.
    """

    def __init__(self,k=1):
        assert k>0
        self.k = k
        self.labels = None
        # Defined up front so Label() is a harmless no-op before train().
        self.nodes = []

    @staticmethod
    def _find_root(node):
        # Follow father links up to the root of the tree containing node.
        while node.father is not None:
            node = node.father
        return node

    def train(self,x):
        """Build the cluster tree for the points in x.

        Args:
            x: 2-D array-like of shape (points, features).

        Raises:
            ValueError: if x is not two-dimensional.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError('x must be 2-D (points, features), got shape %r' % (x.shape,))
        points_num = x.shape[0]

        # Each leaf keeps id = (index, point), as produced by enumerate.
        nodes = [ClusterNode(id=item) for item in enumerate(x)]
        # initialize the labels
        self.labels = [-1]*points_num

        # Distance of every unordered pair of leaves, sorted ascending once
        # (the sort is stable, so ties keep their (i, j) order).
        pair_distances = [((i, j), euler_distance(x[i], x[j]))
                          for i in range(points_num-1)
                          for j in range(i+1, points_num)]
        rank_list = sorted(pair_distances, key=lambda item: item[1])

        # Merged nodes are numbered after the leaves so that ids never
        # collide with a leaf index.
        newnode_id_num = points_num
        for loop_times, ((nodes_id1, nodes_id2), distance) in enumerate(rank_list):
            # Stop once the tree holds k nodes (leaves + merged nodes).
            if len(nodes) >= self.k:
                break
            nodeptr1 = self._find_root(nodes[nodes_id1])
            nodeptr2 = self._find_root(nodes[nodes_id2])
            # Same root: the two points are already in one cluster.
            if nodeptr1 is nodeptr2:
                continue
            # merge
            new_node = ClusterNode(left=nodeptr1, right=nodeptr2,
                                   distance=distance,
                                   count=nodeptr1.count+nodeptr2.count,
                                   id=newnode_id_num)
            newnode_id_num += 1
            nodeptr1.father = new_node
            nodeptr2.father = new_node
            nodes.append(new_node)
            print('In loop ', loop_times, 'The single linkage is: ', new_node.distance)
        self.nodes = nodes

    def Label(self):
        """Assign a cluster label to every point, one label per tree root.

        Roots are visited from the most recently created node backwards and
        receive the labels 0, 1, 2, ... in that order.
        """
        label = 0
        for node in reversed(self.nodes):
            # Only roots define clusters; inner nodes are reached via
            # their root.
            if node.father is not None:
                continue
            self.leaf_traversal(node, label)
            label += 1

    # traversal the leaf nodes to label
    def leaf_traversal(self,node,label):
        """Give label to every still-unlabelled leaf in the subtree of node.

        Implemented with an explicit stack, so long merge chains cannot
        exceed the recursion limit.
        """
        stack = [node]
        while stack:
            current = stack.pop()
            if current.left is None and current.right is None:
                index = current.id[0]
                if self.labels[index] == -1:
                    self.labels[index] = label
                continue
            # Push right first so the left subtree is visited first.
            if current.right is not None:
                stack.append(current.right)
            if current.left is not None:
                stack.append(current.left)
def loadDataSet(fileName):
    """Load a whitespace-separated numeric data file.

    Every non-blank line is split on whitespace. All columns except the
    last are parsed as floats and form one feature row; the last column is
    parsed as a float and used as that row's target value.

    NOTE(review): the row-parsing statements were missing from the reviewed
    copy and are restored here; float targets match the posted output.

    Args:
        fileName: path of the text file to read.

    Returns:
        (xArr, yArr): list of feature rows (lists of floats) and the list
        of target values, in file order.

    Raises:
        ValueError: if a column cannot be parsed as a float.
    """
    xArr = []
    yArr = []
    # The context manager closes the file even if parsing fails.
    with open(fileName) as dataFile:
        for line in dataFile:
            curLine = line.strip().split()
            # curLine = line.strip().split('\t')
            if not curLine:
                # Skip blank lines instead of failing on curLine[-1].
                continue
            xArr.append([float(value) for value in curLine[:-1]])
            yArr.append(float(curLine[-1]))
    return xArr, yArr

if __name__ =="__main__":
    # Raw string: the backslashes in the Windows path are kept literally
    # ('\u' would otherwise start a unicode escape on Python 3).
    train_x,train_y = loadDataSet(r'D:\untitled\Hierarchical.txt')
    Hierarchy = Hierarchical(k=27)
    # Number of loaded points.
    print(np.array(train_x).shape[0])

    # Build the cluster tree, then assign a label to every point;
    # without these two calls Hierarchy.labels would still be None.
    Hierarchy.train(np.array(train_x))
    Hierarchy.Label()

    # Predicted cluster labels, followed by the targets read from the file.
    print(np.array(Hierarchy.labels))
    print(train_y)


('In loop ', 0, 'The single linkage is: ', 1.0)
('In loop ', 1, 'The single linkage is: ', 3.605551275463989)
('In loop ', 2, 'The single linkage is: ', 3.605551275463989)
('In loop ', 3, 'The single linkage is: ', 5.0990195135927845)
('In loop ', 4, 'The single linkage is: ', 5.385164807134504)
('In loop ', 5, 'The single linkage is: ', 5.916079783099616)
('In loop ', 6, 'The single linkage is: ', 6.4031242374328485)
('In loop ', 8, 'The single linkage is: ', 7.810249675906654)
('In loop ', 9, 'The single linkage is: ', 8.54400374531753)
('In loop ', 13, 'The single linkage is: ', 8.831760866327848)
('In loop ', 15, 'The single linkage is: ', 9.486832980505138)
('In loop ', 18, 'The single linkage is: ', 10.392304845413264)


[0 1 0 0 0 1 0 0 0 0 0 0 0 0]
[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]

