层次聚类 AGNES算法及demo

层次聚类 AGNES算法

1 简介

层次聚类试图在不同层次对数据集进行划分,从而形成树形的聚类结构。数据集的划分可采用“自底向上”的聚合策略,也可采用“自顶向下”的分拆策略。
AGNES(AGglomerative NESting)是一种自底向上聚合策略的层次聚类算法。它先将数据集中的每一个样本看作一个初始聚类簇,然后在算法运行的每一步找出距离最近的两个聚类簇进行合并,该过程不断重复,直至达到预设的聚类簇的个数。
来源: 知乎 - yyHaker - 层次聚类AGENS算法及其流程

2 算法流程

层次聚类 AGENS算法及demo_第1张图片
图片来源: 知乎 - yyHaker - 层次聚类AGENS算法及其流程

3 距离度量方式

3.1 最小距离

两个簇的最近样本的距离
$$d_{\min}(C_i, C_j) = \min_{x \in C_i,\, z \in C_j} \mathrm{dist}(x, z)$$

3.2 最大距离

两个簇的最远样本的距离
$$d_{\max}(C_i, C_j) = \max_{x \in C_i,\, z \in C_j} \mathrm{dist}(x, z)$$

3.3 平均距离

两个簇的所有样本对的距离平均
$$d_{avg}(C_i, C_j) = \frac{1}{|C_i| \cdot |C_j|} \sum_{x \in C_i} \sum_{z \in C_j} \mathrm{dist}(x, z)$$

3.4 质心距离

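也称为重心距离, 即两个簇的质心(簇内所有样本坐标的均值)之间的距离; 质心的具体求法这里不再赘述
$$d_{cen}(C_i, C_j) = \mathrm{dist}(\mu_i, \mu_j), \quad \mu_i = \frac{1}{|C_i|} \sum_{x \in C_i} x$$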

4 实验结果

假设k取5, 也就是我们假定有5个种类
层次聚类 AGENS算法及demo_第2张图片
而eps取100, minPts取2的DBSCAN对相同的数据聚类结果如下:

DBSCAN 可以参考我的另一篇博文 CSDN - Lin Shengfeng - DB SCAN

层次聚类 AGENS算法及demo_第3张图片
这里只是粗略贴了两个方法的结果. 不代表哪个方法更好(毕竟参数的调节也至关重要).
当然, 还有更加科学的评价方法, 挖个坑, 以后填.

附 代码

main.py

from dbscan_cluster import dbScanCluster
from config import *
from agens_cluster import agensCluster


if __name__ == '__main__':
    # Both figures share one palette and one marker set.
    palette = ['#8E05C2', '#A9333A', '#3E7C17', 'blue', '#F4A442', '#FF9292', '#1DB9C3', '#6D9886']
    markers = ['^', 'o', '1', 'p', 's']

    # DBSCAN clustering; it also hands back the points it clustered.
    x, y, label, maxCluster = dbScanCluster(total_size=100, eps=100, min_pts=2)
    scatterDifferentForDB(x, y, label, maxCluster, color_list=palette, marker_list=markers)

    # Agglomerative hierarchical clustering of the same points into 5 clusters.
    agensClusterResult = agensCluster(5, x, y)
    scatterDifferentForAgens(agensClusterResult, color_list=palette, marker_list=markers)

agens_cluster.py

from Cluster import Cluster
from point import point
from generate_random_points import *


class agens_cluster:
    """Agglomerative (bottom-up) hierarchical clustering of 2-D points.

    Every point starts out as its own cluster; ``mainProcess`` then merges the
    two closest clusters again and again until only ``k`` clusters are left.
    """

    def __init__(self, x_total, y_total, k_limit, method='g'):
        """
        Build one single-point cluster per input coordinate pair.

        :param x_total: list of the x coordinates of all points
        :param y_total: list of the y coordinates of all points (same indexing as x_total)
        :param k_limit: hyper-parameter k, the number of clusters to end up with
        :param method: inter-cluster distance, one of 'min', 'max', 'avg', 'g'
        """
        self.points = [point(x_total[i], y_total[i]) for i in range(len(x_total))]
        self.k = k_limit
        self.d_method = method  # the default 'g' selects the centroid distance
        # Initially every point is a cluster of its own, named by its index.
        self.clusters = [Cluster(pt, i) for i, pt in enumerate(self.points)]

    def setDistanceMethod(self, method):
        """
        Change the inter-cluster distance measure.

        :param method: valid values are 'min', 'max', 'avg', 'g'
        :return: None
        """
        self.d_method = method

    def refreshClusterName(self):
        """Rename every remaining cluster to its current index in self.clusters."""
        for index, cluster in enumerate(self.clusters):
            cluster.setName(index)

    def mainProcess(self):
        """
        Merge the two closest clusters until only k clusters remain.

        https://zhuanlan.zhihu.com/p/50113029 describes the intermediate steps
        in detail, but it maintains a distance matrix, which this demo does
        not need: all pairwise distances are recomputed on every round.

        :return: the cluster partition C = {C1, C2, ..., Ck} (the list self.clusters)
        :raises ValueError: if k is smaller than 1
        """
        # Without this guard k == 0 would merge the last cluster with itself
        # (duplicating its points) and k < 0 would end in an IndexError.
        if self.k < 1:
            raise ValueError('k must be at least 1, got {0!r}'.format(self.k))

        while len(self.clusters) > self.k:
            cluster_count = len(self.clusters)
            # Fallback pair, used only if no distance compares below infinity.
            closest_cluster_i = 0
            closest_cluster_j = cluster_count - 1
            minimum_distance = float('inf')
            # Scan every unordered pair (i < j); strict '<' keeps the first
            # pair found when several pairs are equally close.
            for i in range(cluster_count):
                for j in range(cluster_count - 1, i, -1):
                    temp_distance = self.clusters[i].calculateDistance(self.clusters[j], self.d_method)
                    if temp_distance < minimum_distance:
                        closest_cluster_i = i
                        closest_cluster_j = j
                        minimum_distance = temp_distance
            # Merge: move every point of j into i, then drop j from the list.
            self.clusters[closest_cluster_i].union(self.clusters[closest_cluster_j])
            self.clusters.pop(closest_cluster_j)
        self.refreshClusterName()  # renumber the surviving clusters 0..len-1
        return self.clusters


def agensCluster(k, x, y, method='g', total_size=200, xScale=1000, yScale=1000):
    """
    Cluster the given points into k clusters with agens_cluster.

    :param k: number of clusters to end up with
    :param x: x coordinates; when empty, random points are generated instead
    :param y: y coordinates
    :param method: inter-cluster distance passed on to agens_cluster
    :param total_size: number of random points to generate when x is empty
    :param xScale: x_scale argument for the random generator
    :param yScale: y_scale argument for the random generator
    :return: whatever agens_cluster.mainProcess() returns
    """
    no_points_given = len(x) == 0
    if no_points_given:
        # Fall back to randomly generated points.
        x, y = generate(total=total_size, x_scale=xScale, y_scale=yScale)
    return agens_cluster(x, y, k, method).mainProcess()

dbscan_cluster.py

from point import *
import random
import queue
from generate_random_points import *
import matplotlib.pyplot as plt


class dbscan_cluster:
    """Density-based clustering (DBSCAN-style) over a fixed set of 2-D points."""

    def __init__(self, x_total, y_total, eps: int, min_pts: int) -> None:
        """
        :param x_total: x coordinates of all points
        :param y_total: y coordinates of all points (same indexing as x_total)
        :param eps: int, the neighbourhood radius
        :param min_pts: int, neighbour threshold; a point counts as a core point
                        when it has MORE than min_pts neighbours (itself excluded)
        """
        self.points = [point(x_total[i], y_total[i]) for i in range(len(x_total))]
        self.wait_for_pick = list(range(len(self.points)))  # indices still available for picking
        self.eps = eps
        self.min_pts = min_pts
        self.cur_cluster_num = 0  # cluster ids start at 0

    def printAllPoints(self):
        """Print the coordinates of every point."""
        for p in self.points:
            print('({0}, {1})'.format(p.x_pos, p.y_pos))

    def getClusterNum(self):
        """Return the next unused cluster id, i.e. the number of clusters created so far."""
        return self.cur_cluster_num

    def initPick(self):
        """
        :return: the index (into points) of a point chosen at random from
                 wait_for_pick; the index is removed from wait_for_pick
        """
        cur_order = random.randint(0, len(self.wait_for_pick) - 1)  # position inside wait_for_pick
        return self.wait_for_pick.pop(cur_order)

    def findNearPoint(self, cur_pick):
        """
        :param cur_pick: int, index of the currently selected point
        :return: indices of all OTHER points whose distance to the current
                 point is at most eps (compared on squared distances)
        """
        radius_square = self.eps ** 2
        return [
            idx for idx, other in enumerate(self.points)
            if idx != cur_pick
            and self.points[cur_pick].calculateDistanceSquare(other) <= radius_square
        ]

    def _discardPending(self, index):
        """Remove index from wait_for_pick if it is still waiting there."""
        if index in self.wait_for_pick:
            self.wait_for_pick.remove(index)

    def mainProcess(self):
        """
        Run the clustering: label every point with a type and, where it
        belongs to one, a cluster id.

        Repeatedly picks a random unprocessed point. A point with more than
        min_pts neighbours seeds a new cluster, which is grown breadth-first
        through the neighbours of every further core point that is reached.

        :return: None (results are stored on the point objects)
        """
        while self.wait_for_pick:
            cur_pick = self.initPick()  # random pick, already removed from wait_for_pick
            near_points = self.findNearPoint(cur_pick)

            # '<=' (not '<') so that a point with exactly min_pts neighbours is
            # classified as noise instead of being left UNVISITED; this matches
            # the strict '>' core test used while expanding a cluster below.
            # NOTE(review): textbook DBSCAN uses '>= minPts' counting the point
            # itself - the strict test is kept here so cluster labels do not change.
            if len(near_points) <= self.min_pts:
                self.points[cur_pick].setPointType(NOISE)
                continue

            self.points[cur_pick].setPointType(CORE)
            self.points[cur_pick].setClusterNum(self.cur_cluster_num)
            queued = {cur_pick}  # every index that has ever entered this cluster's queue
            seeds = queue.Queue()
            for near_point_index in near_points:
                seeds.put(near_point_index)
                queued.add(near_point_index)
                self._discardPending(near_point_index)

            while not seeds.empty():
                head_point_index = seeds.get()  # get() also removes the element
                self._discardPending(head_point_index)
                head_point = self.points[head_point_index]
                # Anything reachable through the seed queue joins the current cluster.
                head_point.setClusterNum(self.cur_cluster_num)
                if head_point.getPointType() == UNVISITED:
                    cur_near_pts = self.findNearPoint(head_point_index)
                    if len(cur_near_pts) > self.min_pts:
                        head_point.setPointType(CORE)
                        # Enqueue each neighbour at most once per cluster.
                        for p in cur_near_pts:
                            if p not in queued:
                                queued.add(p)
                                seeds.put(p)
                    else:
                        head_point.setPointType(BOARD)  # border point
                elif head_point.getPointType() == NOISE:
                    # Previously judged noise, but it lies within reach of a core point.
                    head_point.setPointType(BOARD)
            self.cur_cluster_num = self.cur_cluster_num + 1  # id for the next cluster

    def getAllCluster(self):
        """
        :return: the label of every point, shifted by +1 so that points with
                 cluster id -1 get label 0 and therefore still map to a colour
        """
        return [p.getClusterNum() + 1 for p in self.points]

    def printPointAndLabel(self):
        """
        Print the points grouped by cluster; mainly meant for testing.

        :return: no return value
        """
        cluster_list = [[] for _ in range(self.cur_cluster_num)]
        for pot, pt in enumerate(self.points):
            print(pt.getClusterNum(), self.cur_cluster_num)
            if pt.getClusterNum() > -1:
                cluster_list[pt.getClusterNum()].append(pot)
        for li, members in enumerate(cluster_list):
            if len(members) > 0:
                print('cluster #{0}'.format(li))
                for i in members:
                    print('({0}, {1})'.format(self.points[i].x, self.points[i].y))
                print('--------------')


def dbScanCluster(total_size=200, xScale=1000, yScale=1000, eps=80, min_pts=3):
    """
    Generate random points and cluster them with dbscan_cluster.

    Only randomly generated points are supported for now.

    :param total_size: number of points to generate
    :param xScale: x_scale argument for the random generator
    :param yScale: y_scale argument for the random generator
    :param eps: neighbourhood radius handed to dbscan_cluster
    :param min_pts: neighbour threshold handed to dbscan_cluster
    :return: (x, y, label, maxCluster) - the generated coordinates, the
             per-point labels from getAllCluster() and the value of getClusterNum()
    """
    xs, ys = generate(total=total_size, x_scale=xScale, y_scale=yScale)
    clusterer = dbscan_cluster(xs, ys, eps, min_pts)
    clusterer.mainProcess()
    labels = clusterer.getAllCluster()
    cluster_total = clusterer.getClusterNum()
    return xs, ys, labels, cluster_total

Cluster.py

"""
Cluster类:
属性:
- 簇内点集合
- 簇名
方法:
- 合并另一个簇
- 计算质心
- 计算与另一个簇的距离
- 提供点集合
- 计算簇与簇间的最大点距离
- 计算簇与簇间的最小点距离
- 计算簇与簇间的质心距离
- 计算簇与簇间的平均点距离(与质心点距离不一样!!!)
"""
import point


class Cluster:
    """A named, growing collection of points with several inter-cluster distances.

    Points are expected to expose ``x_pos`` / ``y_pos`` and a
    ``calculateDistanceSquare(other)`` method.
    """

    def __init__(self, pt, name):
        """
        :param pt: the single point the cluster starts with
        :param name: the cluster's name / index
        """
        self.points = [pt]
        self.name = name

    def setName(self, name):
        """Replace the cluster's name."""
        self.name = name

    def getName(self):
        """Return the cluster's name."""
        return self.name

    def getPoints(self):
        """Return the list of points (the live list, not a copy)."""
        return self.points

    def union(self, another_cluster):
        """Append every point of another_cluster to this cluster."""
        self.points.extend(another_cluster.getPoints())

    def getCore(self):
        """
        Compute the centroid of the cluster.

        :return: centroid coordinates as an (x, y) tuple
        """
        total_x = 0
        total_y = 0
        # Plain left-to-right accumulation on purpose: it keeps the floating
        # point result independent of how the built-in sum() adds floats.
        for p in self.points:
            total_x = total_x + p.x_pos
            total_y = total_y + p.y_pos
        size = len(self.points)
        return (total_x / size, total_y / size)

    def calcCoreDistance(self, another_cluster):
        """
        Compute the centroid distance between two clusters.

        :param another_cluster: the other cluster
        :return: Euclidean distance between the two centroids
        """
        my_x, my_y = self.getCore()
        other_x, other_y = another_cluster.getCore()
        return ((my_x - other_x) ** 2 + (my_y - other_y) ** 2) ** 0.5

    def calcMinDistance(self, another_cluster):
        """
        Compute the minimum distance between two clusters, i.e. the distance
        between their two closest points.

        :param another_cluster: the other cluster
        :return: the minimum distance (square root already taken)
        :raises ValueError: if either cluster contains no points
        """
        others = another_cluster.getPoints()
        # Compare squared distances and take a single square root at the end.
        smallest_square = min(
            p.calculateDistanceSquare(q) for p in self.getPoints() for q in others
        )
        return smallest_square ** 0.5

    def calcMaxDistance(self, another_cluster):
        """
        Compute the maximum distance between two clusters, i.e. the distance
        between their two farthest points.

        :param another_cluster: the other cluster
        :return: the maximum distance (square root already taken)
        :raises ValueError: if either cluster contains no points
        """
        others = another_cluster.getPoints()
        # Compare squared distances and take a single square root at the end.
        largest_square = max(
            p.calculateDistanceSquare(q) for p in self.getPoints() for q in others
        )
        return largest_square ** 0.5

    def calcAvgDistance(self, another_cluster):
        """
        Compute the average distance between two clusters: the mean Euclidean
        distance over all point pairs (not the same as the centroid distance).

        :param another_cluster: the other cluster
        :return: the average pairwise distance
        """
        myPoints = self.getPoints()
        anotherPoints = another_cluster.getPoints()
        totalDis = 0
        for p in myPoints:
            for q in anotherPoints:
                totalDis = totalDis + p.calculateDistanceSquare(q) ** 0.5  # Euclidean distance
        return totalDis / (len(myPoints) * len(anotherPoints))

    def calculateDistance(self, another_cluster, method='g'):
        """
        Compute the distance to another cluster with the chosen measure.

        :param another_cluster: the other cluster
        :param method: 'avg', 'min' or 'max'; any other value (including the
                       default 'g') selects the centroid distance
        :return: the distance under the chosen measure
        """
        if method == 'avg':
            return self.calcAvgDistance(another_cluster)
        if method == 'min':
            return self.calcMinDistance(another_cluster)
        if method == 'max':
            return self.calcMaxDistance(another_cluster)
        return self.calcCoreDistance(another_cluster)  # method == 'g'

point.py

from config import *


class point:
    """A 2-D point together with its clustering state (point type and cluster id)."""

    def __init__(self, x_pos, y_pos):
        """
        :param x_pos: x coordinate
        :param y_pos: y coordinate
        """
        self.x, self.y = x_pos, y_pos
        self.cluster = NONE  # does not belong to any cluster yet
        self.point_type = UNVISITED  # has not been visited yet

    @property
    def x_pos(self):
        """Read-only alias for the x coordinate."""
        return self.x

    @property
    def y_pos(self):
        """Read-only alias for the y coordinate."""
        return self.y

    def getPointType(self):
        """Return the stored point type."""
        return self.point_type

    def setPointType(self, t):
        """Store t as the point type."""
        self.point_type = t

    def getClusterNum(self):
        """Return the stored cluster id."""
        return self.cluster

    def setClusterNum(self, num):
        """Store num as the cluster id."""
        self.cluster = num

    def calculateDistanceSquare(self, another_point):
        """Return the squared Euclidean distance to another_point."""
        dx = self.x - another_point.x
        dy = self.y - another_point.y
        return dx ** 2 + dy ** 2

config.py

import matplotlib.pyplot as plt
import numpy as np


# Point-type values:
UNVISITED: int = -1  # not visited yet
NOISE: int = 0  # noise point
CORE: int = 1  # core point
BOARD: int = 2  # border (edge) point

# Cluster-membership value: the point does not belong to any cluster
NONE: int = -1


# Scatter plot with one style per group (label 0 is drawn as noise, so this one is for DBSCAN results)
def scatterDifferentForDB(x, y, label, cluster_num, color_list, marker_list):
    """
    Plot the points grouped by label and show the figure.

    :param x: x coordinates of all points
    :param y: y coordinates of all points
    :param label: per-point group index; group 0 is plotted in black as 'NOISE'
    :param cluster_num: highest group index; cluster_num + 1 groups are allocated
    :param color_list: colours, cycled through by group index
    :param marker_list: markers, cycled through by group index
    """
    group_count = cluster_num + 1
    # Bucket the coordinates by their label first.
    x_group = [[] for _ in range(group_count)]
    y_group = [[] for _ in range(group_count)]
    for i, group_id in enumerate(label):
        x_group[group_id].append(x[i])
        y_group[group_id].append(y[i])

    fig, ax = plt.subplots()
    plt.scatter(np.array(x_group[0]), np.array(y_group[0]), c='black', label='NOISE')
    for cluster_order in range(1, group_count):
        colour = color_list[cluster_order % len(color_list)]
        marker = marker_list[cluster_order % len(marker_list)]
        plt.scatter(
            np.array(x_group[cluster_order]),
            np.array(y_group[cluster_order]),
            c=colour,
            marker=marker,
            label=cluster_order,
        )
    plt.legend(bbox_to_anchor=(1.05, 0), loc=3, borderaxespad=0)
    # Leave room on the right so the legend is fully visible.
    fig.subplots_adjust(right=0.8)
    plt.show()


# Scatter plot with one style per cluster (no noise group, so this one is for hierarchical clustering)
def scatterDifferentForAgens(clusters, color_list, marker_list):
    """
    Plot every cluster in its own colour/marker and show the figure.

    :param clusters: cluster objects; getName() is used as the group index and
                     getPoints() supplies points exposing x_pos / y_pos
    :param color_list: colours, cycled through by group index
    :param marker_list: markers, cycled through by group index
    """
    x_group = [[] for _ in range(len(clusters))]
    y_group = [[] for _ in range(len(clusters))]
    for c in clusters:
        for pt in c.getPoints():
            x_group[c.getName()].append(pt.x_pos)
            y_group[c.getName()].append(pt.y_pos)

    fig, ax = plt.subplots()
    for cluster_order, (xs, ys) in enumerate(zip(x_group, y_group)):
        colour = color_list[cluster_order % len(color_list)]
        marker = marker_list[cluster_order % len(marker_list)]
        plt.scatter(np.array(xs), np.array(ys), c=colour, marker=marker, label=cluster_order)
    plt.legend(bbox_to_anchor=(1.05, 0), loc=3, borderaxespad=0)
    # Leave room on the right so the legend is fully visible.
    fig.subplots_adjust(right=0.8)
    plt.show()

你可能感兴趣的:(机器学习,聚类,算法,数据挖掘)