【机器学习】AGNES层次聚类算法

算法思想:

  1. 初始每个数据都是一个簇;
  2. 寻找每个簇之间的距离,获取距离列表;
  3. 合并距离最近的两个簇Ci,Cj,并删除Cj,更新距离列表,使簇个数减1;
  4. 重复第3步,直到簇个数等于所需个数k。

数据集来源:周志华西瓜书数据集4.0

1.读取文件:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number')
data = data.values.tolist()

2.使用matplotlib.pyplot绘出原始数据

# 画出原始图像
fig, ax = plt.subplots()
plt.scatter([i[0] for i in data], [i[1] for i in data])
plt.show()

【机器学习】AGNES层次聚类算法_第1张图片

3.计算欧氏距离

'''
计算欧氏距离
'''

def calDist(a, b):
    a = np.array(a)
    b = np.array(b)
    dist = np.sqrt(np.dot((a - b), (a - b).T))

    return dist

4.使用最小距离法计算簇之间的最小距离

'''
计算簇之间的最小距离
'''


def calClusterMinDist(c1, c2):
    minDist = 1e5
    for vec1 in c1:
        for vec2 in c2:
            dist = calDist(vec1, vec2)
            if dist < minDist:
                minDist = dist

    return minDist

5.使用平均距离法计算簇之间的平均距离

'''
计算簇之间的平均距离
'''


def calClusterAvgDist(c1, c2):
    num = len(c1) * len(c2)
    sum_dist = 0
    for vec1 in c1:
        for vec2 in c2:
            dist = calDist(vec1, vec2)
            sum_dist += dist
    return sum_dist

6.获取簇之间的距离列表

'''
获取最小距离列表
'''


def getMinDistList(data, method):
    cluster_num = len(data)
    #     print("cluster_num",cluster_num)
    minDistList = [[0 for i in range(cluster_num)] for j in range(cluster_num)]
    for i in range(cluster_num):
        j = i + 1
        while j < cluster_num:
            #             print("data[i]:",data[i])
            #             print("data[j]:",data[j])
            if method == "minDist":  # 使用最小距离计算
                minDistList[i][j] = calClusterMinDist(data[i], data[j])
                minDistList[j][i] = minDistList[i][j]
            elif method == "avgDist":  # 使用平均距离计算
                minDistList[i][j] = calClusterAvgDist(data[i], data[j])
                minDistList[j][i] = minDistList[i][j]
            j += 1

    return minDistList

7.寻找最小值

'''
寻找距离列表中的最小值,用于合并簇以及删除
'''


def findMin(minDistList):
    row = len(minDistList)
    minDist = 1e5
    min_i = 0
    min_j = 0
    for i in range(row):
        for j in range(row):
            dist = minDistList[i][j]
            if dist < minDist and dist != 0:
                minDist = minDistList[i][j]
                min_i = i
                min_j = j

    return min_i, min_j, minDist

8.实现

'''
AGNES算法实现
'''


def AGNES(data, k, method):
    cluster_num = len(data)
    C = []
    for i in data:  # 添加数据
        tmp = [i]
        C.append(tmp)
    minDistList = getMinDistList(C, method)
    while cluster_num > k:
        i, j, minDist = findMin(minDistList)
        #         print(len(minDistList))
        #         print(i,j,minDist)
        C[i].extend(C[j])  # 合并
        del C[j]  # 删除
        minDistList = getMinDistList(C, method)
        cluster_num -= 1

    return C

9.程序入口

'''
程序入口
'''
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')
    fig, ax = plt.subplots(nrows=2, ncols=1)
    ax[0].scatter([i[0] for i in C_min[0]], [i[1] for i in C_min[0]], c='r')
    ax[0].scatter([i[0] for i in C_min[1]], [i[1] for i in C_min[1]], c='g')
    ax[0].scatter([i[0] for i in C_min[2]], [i[1] for i in C_min[2]], c='b')
    ax[0].set_title("使用最小距离进行聚类")

    ax[1].scatter([i[0] for i in C_avg[0]], [i[1] for i in C_avg[0]], c='r')
    ax[1].scatter([i[0] for i in C_avg[1]], [i[1] for i in C_avg[1]], c='g')
    ax[1].scatter([i[0] for i in C_avg[2]], [i[1] for i in C_avg[2]], c='b')
    ax[1].set_title("使用平均距离进行聚类")

    fig.tight_layout()
    plt.show()

完整代码如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number')
data = data.values.tolist()

# 画出原始图像
fig, ax = plt.subplots()
plt.scatter([i[0] for i in data], [i[1] for i in data])
plt.show()

'''
计算欧氏距离
'''

def calDist(a, b):
    a = np.array(a)
    b = np.array(b)
    dist = np.sqrt(np.dot((a - b), (a - b).T))

    return dist


'''
计算簇之间的最小距离
'''


def calClusterMinDist(c1, c2):
    minDist = 1e5
    for vec1 in c1:
        for vec2 in c2:
            dist = calDist(vec1, vec2)
            if dist < minDist:
                minDist = dist

    return minDist


'''
计算簇之间的平均距离
'''


def calClusterAvgDist(c1, c2):
    num = len(c1) * len(c2)
    sum_dist = 0
    for vec1 in c1:
        for vec2 in c2:
            dist = calDist(vec1, vec2)
            sum_dist += dist
    return sum_dist


'''
获取最小距离列表
'''


def getMinDistList(data, method):
    cluster_num = len(data)
    #     print("cluster_num",cluster_num)
    minDistList = [[0 for i in range(cluster_num)] for j in range(cluster_num)]
    for i in range(cluster_num):
        j = i + 1
        while j < cluster_num:
            #             print("data[i]:",data[i])
            #             print("data[j]:",data[j])
            if method == "minDist":  # 使用最小距离计算
                minDistList[i][j] = calClusterMinDist(data[i], data[j])
                minDistList[j][i] = minDistList[i][j]
            elif method == "avgDist":  # 使用平均距离计算
                minDistList[i][j] = calClusterAvgDist(data[i], data[j])
                minDistList[j][i] = minDistList[i][j]
            j += 1

    return minDistList


'''
寻找距离列表中的最小值,用于合并簇以及删除
'''


def findMin(minDistList):
    row = len(minDistList)
    minDist = 1e5
    min_i = 0
    min_j = 0
    for i in range(row):
        for j in range(row):
            dist = minDistList[i][j]
            if dist < minDist and dist != 0:
                minDist = minDistList[i][j]
                min_i = i
                min_j = j

    return min_i, min_j, minDist


'''
AGNES算法实现
'''


def AGNES(data, k, method):
    cluster_num = len(data)
    C = []
    for i in data:  # 添加数据
        tmp = [i]
        C.append(tmp)
    minDistList = getMinDistList(C, method)
    while cluster_num > k:
        i, j, minDist = findMin(minDistList)
        #         print(len(minDistList))
        #         print(i,j,minDist)
        C[i].extend(C[j])  # 合并
        del C[j]  # 删除
        minDistList = getMinDistList(C, method)
        cluster_num -= 1

    return C


'''
程序入口
'''
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')
    fig, ax = plt.subplots(nrows=2, ncols=1)
    ax[0].scatter([i[0] for i in C_min[0]], [i[1] for i in C_min[0]], c='r')
    ax[0].scatter([i[0] for i in C_min[1]], [i[1] for i in C_min[1]], c='g')
    ax[0].scatter([i[0] for i in C_min[2]], [i[1] for i in C_min[2]], c='b')
    ax[0].set_title("使用最小距离进行聚类")

    ax[1].scatter([i[0] for i in C_avg[0]], [i[1] for i in C_avg[0]], c='r')
    ax[1].scatter([i[0] for i in C_avg[1]], [i[1] for i in C_avg[1]], c='g')
    ax[1].scatter([i[0] for i in C_avg[2]], [i[1] for i in C_avg[2]], c='b')
    ax[1].set_title("使用平均距离进行聚类")

    fig.tight_layout()
    plt.show()

【机器学习】AGNES层次聚类算法_第2张图片

你可能感兴趣的:(机器学习,聚类,算法,python)