简易Hierarchical Clustering(Python)

周志华《机器学习》中的层次聚类算法比较简单,每次只合并距离最近的两个簇;本文的实现则额外考虑了多个子类同时聚成同一个父类的情况。
但是由于时间精力有限,暂时没有实现完美的树状输出,后续我会在 Bonus 部分中改进。

# -*- coding: utf-8 -*-
__author__ = 'Zhao'

import math
import numpy as np


def judge_element_append(list_input):
    """Add ``list_input`` to the module-level ``temp_class`` accumulator.

    A ``list`` argument is flattened one level: each of its items is appended
    individually. Any other value (including tuples) is appended as a single
    item.

    Args:
        list_input: Either one member or a list of members.

    Returns:
        None. The global ``temp_class`` list is extended in place.
    """
    # Normalise to "a sequence of members" so one loop serves both cases.
    members = list_input if isinstance(list_input, list) else [list_input]
    for member in members:
        temp_class.append(member)


def judge_element_delete(list_input, aim_list, match_num):
    """Delete from ``aim_list`` the entry paired with the group holding ``match_num``.

    ``list_input`` is scanned in order. Each entry is either a single member
    or a list of members. For the first entry that equals or contains
    ``match_num``, the element at the same position in ``aim_list`` is deleted
    and the scan stops. If nothing matches, ``aim_list`` is left untouched.

    Exactly one deletion happens per call. The scan stops after it because:
      * continuing would delete further entries when a value occurs more than
        once, desynchronising parallel lists;
      * ``aim_list`` may be ``list_input`` itself, and a list must not keep
        being iterated after it has been mutated.

    Args:
        list_input: List of groups (single members or lists of members).
        aim_list: List to delete from; may be the same object as
            ``list_input``.
        match_num: Member to look for (compared with ``==``).

    Returns:
        None. ``aim_list`` is modified in place.

    Raises:
        IndexError: If the matching position does not exist in ``aim_list``.
    """
    # enumerate() gives the real position; list.index() would return the first
    # *equal* entry, which is wrong when values repeat.
    for position, entry in enumerate(list_input):
        members = entry if isinstance(entry, list) else [entry]
        if any(member == match_num for member in members):
            del aim_list[position]
            return


def _centroid_value(value):
    """Return the numeric value of one ``centroid`` entry.

    Entries are either raw input tokens (strings holding integers) or the
    float means stored after a merge. Floats are returned unchanged so that a
    merged centroid such as 1.5 is not truncated to 1 before distances are
    computed.
    """
    return value if isinstance(value, float) else int(value)


def _merge_groups(indices):
    """Replace the groups at ``indices`` with one merged group.

    The members of the selected groups are flattened, in the given order, into
    a fresh module-level ``temp_class``. The selected entries are then removed
    from both ``group`` and ``centroid``, and the merged group and its mean
    are appended to the end of each list.

    Entries are removed by position rather than by value, so repeated input
    values cannot make ``group`` and ``centroid`` drift out of step.
    """
    global temp_class
    temp_class = []
    for index in indices:
        judge_element_append(group[index])

    merged = set(indices)
    group[:] = [g for k, g in enumerate(group) if k not in merged]
    centroid[:] = [c for k, c in enumerate(centroid) if k not in merged]

    group.append(temp_class)
    centroid.append(sum(int(member) for member in temp_class) / len(temp_class))


# Each whitespace-separated token starts as its own group. split() without an
# argument tolerates repeated, leading and trailing blanks.
group = input("Please input some numbers spit as blank:\n").split()
group_num = len(group)

# centroid[k] is the centre of group[k]; both lists are kept in lockstep.
centroid = list(group)

print("centroid is ", centroid, "\n")

# Number of completed merge rounds, used only in the progress message.
times = 0

# Re-evaluate the real group count on every pass; a cached count would send
# the loop round once more after everything has already merged.
while len(group) > 1:
    group_num = len(group)
    print("the numbers of groups now is ", group_num, "\n")

    # Pairwise absolute distance between centroids.
    values = [_centroid_value(c) for c in centroid]
    matrix = [[abs(a - b) for b in values] for a in values]

    print("distance matrix :")
    for row in matrix:
        print(row)

    print("------------")

    max_distance = max(max(row) for row in matrix)

    # Turn distances into similarities in [0, 1]: 1 - distance / max_distance,
    # rounded to 3 decimals. The diagonal is forced to 0 by position so a
    # group never matches itself, while distinct groups that coincide keep
    # their similarity of 1.
    for i in range(group_num):
        for j in range(group_num):
            if i == j:
                matrix[i][j] = 0
            elif max_distance == 0:
                # All centroids coincide: nothing to divide by, every pair is
                # maximally similar.
                matrix[i][j] = 1.0
            else:
                matrix[i][j] = round(1 - matrix[i][j] / max_distance, 3)

    print("standard matrix :")
    for row in matrix:
        print(row)

    print("------------")

    max_in_matrix = max(max(row) for row in matrix)

    print("max similarity in the matrix: ", max_in_matrix, "\n")

    if max_in_matrix == 0:
        # No pair is closer than any other (e.g. only two groups remain):
        # merge everything that is left and finish.
        _merge_groups(range(group_num))

        print("[CONCLUSION]: ", group)
        break

    # index1: first row containing the maximum similarity.
    # index2: last column in that row holding the maximum.
    index1 = next(i for i in range(group_num) if max_in_matrix in matrix[i])
    row_matches = [j for j in range(group_num)
                   if matrix[index1][j] == max_in_matrix]
    index2 = row_matches[-1]

    # Groups to merge: index1, everything tied with it, and everything tied
    # with index2. Each position is listed once so no member is duplicated in
    # the merged group or counted twice in its mean.
    to_merge = [index1] + row_matches
    for i in range(group_num):
        if matrix[index2][i] == max_in_matrix and i not in to_merge:
            to_merge.append(i)

    times += 1
    print("after %dth clustering: " % times)

    _merge_groups(to_merge)

    print("the new group is ", group)

    print("the new centroid is ", centroid, "\n")

    print("==========EHD OF ONE CLUSTERING==========\n")

    if len(group) == 1:
        # Everything collapsed in this step, so the zero-similarity branch
        # above will never run; report the final result here instead.
        print("[CONCLUSION]: ", group)

你可能感兴趣的:(简易Hierarchical Clustering(Python))