几种聚类算法的对比实验

几种聚类算法的对比实验

聚类属于无标签的无监督学习方法,其他常见的无监督学习还有密度估计、异常检测等。聚类就是对大量未知标注的数据集,按照数据的内在相似性将其划分为多个类别(在聚类算法中称为簇),使类别内的数据相似度高,而类别间的数据相似度低。我们可以使用聚类分析从数据中获得一些有价值的见解。本文将研究几种常见的聚类算法,并讨论它们的优缺点。

kmeans

from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt

# Global plotting defaults: large canvas, ggplot look.
plt.rcParams.update({'figure.figsize': (16, 9)})
plt.style.use('ggplot')

# Load the dataset from disk.
data = pd.read_csv('xclara.csv')

# Pull the two feature columns out of the frame as NumPy arrays.
f1, f2 = (data[column].values for column in ('V1', 'V2'))

def dist(a, b, ax=1):
    """Return the Euclidean norm of ``a - b`` reduced along axis ``ax``.

    With the default ``ax=1`` and 2-D inputs this yields one distance per row.
    """
    difference = a - b
    return np.linalg.norm(difference, axis=ax)
# Stack the two feature columns into an (n_samples, 2) array.
X = np.array(list(zip(f1, f2)))

# Number of clusters.
k = 4
# Random integer x-coordinates for the initial centroids.
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Random integer y-coordinates for the initial centroids.
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
# Centroid positions from the previous iteration.
C_old = np.zeros(C.shape)
print(C)
# Index of the nearest centroid for every sample.
clusters = np.zeros(len(X))
# Per-centroid displacement between two iterations; all zeros means converged.
iteration_flag = dist(C, C_old, 1)

# Initial state: raw samples in black, starting centroids as blue stars.
fig, ax = plt.subplots()
plt.scatter(f1, f2, c='black', s=6)
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='blue')

tmp = 1
# Stop once no centroid moves any more, or once the iteration counter reaches
# 20 (the cap is optional and only guards against slow convergence).
while iteration_flag.any() and tmp < 20:
    # Assignment step: distance of every sample to every centroid in one
    # broadcasted call, then the index of the closest centroid per sample.
    clusters = np.argmin(dist(X[:, np.newaxis, :], C, 2), axis=1)

    # Keep a copy of the current centroids to measure how far they move.
    C_old = deepcopy(C)
    # Update step: move each centroid to the mean of its members.
    for i in range(k):
        members = X[clusters == i]
        # An empty cluster has no mean (it would become NaN and poison the
        # convergence test), so its centroid simply stays where it is.
        if len(members) > 0:
            C[i] = members.mean(axis=0)

    # Distance between the new and the old centroids.
    print('循环第%d次' % tmp)
    tmp = tmp + 1
    iteration_flag = dist(C, C_old, 1)
    print("新中心点与旧点的距离:", iteration_flag)

# Final result, drawn once after the loop has finished (drawing it inside the
# loop opened a new figure on every iteration).
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
# One colour per cluster.
for i in range(k):
    points = X[clusters == i]
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='black')

plt.show()



优点:

  1. 简单直观,易于理解和实现;
  2. 复杂度相对比较低,在K不是很大的情况下,Kmeans的计算时间相对很短;
  3. Kmeans会产生紧密度比较高的簇,能够反映簇内样本围绕质心的紧密程度。

缺点:

  1. 很难预先确定准确的簇的数目;
  2. 对初始值设置很敏感(可参考Kmeans++);
  3. Kmeans主要发现圆形或者球形簇,对不同形状和密度的簇效果不好;
  4. Kmeans对噪声和离群值非常敏感(K-medians对噪声和离群值不敏感)。

LVQ

# -*- coding: utf-8 -*-
"""
Created on Tue Jan 29 20:22:18 2019

@author: zmddzf
"""
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt


class LVQ:
    """
    Learning Vector Quantization (LVQ) implementation.

    methods:
        train: fit the prototype vectors on the training set
        predict: return the label of the prototype nearest to a sample
    """

    def __init__(self, D, T, lr, maxEpoch):
        """
        Store the hyper-parameters and draw one initial prototype per label.

        :param D: training set, formatted as [[array, label], ...]
        :param T: class label of each prototype vector
        :param lr: learning rate, between 0 and 1
        :param maxEpoch: number of training iterations
        :raises ValueError: if a label in T has no sample in D
        """
        self.D = D
        self.T = T
        self.lr = lr
        self.maxEpoch = maxEpoch
        self.P = []
        # Initialise each prototype from a random training sample of its class.
        for t in T:
            candidates = [sample for sample in self.D if sample[1] == t]
            if not candidates:
                # Rejection sampling would spin forever here; fail loudly.
                raise ValueError("no training sample with label %r" % (t,))
            chosen = random.choice(candidates)
            # Store a fresh [vector, label] pair: appending the sample itself
            # would alias an entry of D, and train() would then overwrite
            # that training sample with the moving prototype.
            self.P.append([np.array(chosen[0]), chosen[1]])

    def __dist(self, p1, p2):
        """
        Private helper: Euclidean distance between two vectors.

        :param p1: vector 1
        :param p2: vector 2
        :return dist: distance
        """
        dist = np.linalg.norm(p1 - p2)
        return dist

    def train(self):
        """
        Train the LVQ prototypes.

        Each iteration draws one random sample, finds the nearest prototype
        and moves it towards the sample when the labels match, away otherwise.

        :return self.P: prototype vectors after training
        """
        for _ in tqdm(range(self.maxEpoch)):
            x = random.choice(self.D)  # random training sample
            dists = [self.__dist(p[0], x[0]) for p in self.P]

            # Look the winning prototype up once instead of on every access.
            winner = self.P[dists.index(min(dists))]
            step = self.lr * (x[0] - winner[0])
            if winner[1] == x[1]:
                # Same class: pull the prototype towards the sample.
                winner[0] = winner[0] + step
            else:
                # Different class: push the prototype away from the sample.
                winner[0] = winner[0] - step
        return self.P

    def predict(self, x):
        """
        Predict the cluster a sample belongs to.

        :param x: sample vector
        :return label: label of the nearest prototype
        """
        dists = [self.__dist(p[0], x) for p in self.P]
        label = self.P[dists.index(min(dists))][1]
        return label


# Build the experimental dataset: two 2-D point clouds drawn from normal
# distributions.
mu1, sigma1 = 2, 1
mu2, sigma2 = 4, 1
# First cloud, labelled 1.
samples1 = np.array([np.random.normal(mu1, sigma1, 50) for _ in range(2)]).T.tolist()
label1 = [1] * 50
# Second cloud, labelled 0.
samples2 = np.array([np.random.normal(mu2, sigma2, 50) for _ in range(2)]).T.tolist()
label2 = [0] * 50
# Concatenate both clouds into one dataset.
samples = samples1 + samples2
labels = label1 + label2

# Reshape into the [[array, label], ...] format LVQ expects.
data = [[np.array(point), tag] for point, tag in zip(samples, labels)]

# Train the model.
lvq = LVQ(data, [0, 1], 0.1, 5000)
vector = lvq.train()

# Classify every training sample with the fitted prototypes.
prediction = [lvq.predict(entry[0]) for entry in data]

# Fraction of samples whose predicted label matches the true label.
accuracy = sum(1 for guess, truth in zip(prediction, labels) if guess == truth)
accuracy = accuracy / len(data)
print("accuracy of LVQ:", accuracy)

# Scatter the samples coloured by label and mark both prototypes with stars.
plt.figure(figsize=(15, 10))
coords = np.array(samples).T
plt.scatter(coords[0], coords[1], c=labels)
for prototype in (vector[0], vector[1]):
    plt.scatter(prototype[0][0], prototype[0][1], marker='*', s=300)

plt.show()

LVQ算法其实也是一种基于竞争的学习,这点和无监督的SOM算法挺像的。LVQ算法可以被视为一种网络,由输入层、竞争层、输出层组成。输入层很容易理解,就是接受样本的输入;竞争层可以被视为神经元之间的竞争,也就是原型向量之间的竞争,离得最近的神经元(原型向量)获胜,赢者通吃(winner-take-all);输出层负责输出分类结果。不论如何理解这个算法,其本质都是一样的,也就是同类靠拢、异类远离。

高斯混合聚类

import matplotlib.pyplot as plt
import numpy as np
import math

# Raw data: 30 two-dimensional samples, split into parallel coordinate lists
# (x[i], y[i]) is sample i.
# NOTE(review): presumably density / sugar content of a watermelon dataset — confirm.
x = [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243,
     0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719, 0.359, 0.339, 0.282,
     0.748, 0.714, 0.483, 0.478, 0.525, 0.751, 0.532, 0.473, 0.725, 0.446]

y = [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267,
     0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103, 0.188, 0.241, 0.257,
     0.232, 0.346, 0.312, 0.437, 0.369, 0.489, 0.472, 0.376, 0.445, 0.459]


# Matrix smoke test
def test_matrix():
    """Print a sample 2x2 matrix together with its inverse and transpose.

    Uses ``np.matrix`` directly: the ``np.mat`` alias was removed from the
    main namespace in NumPy 2.0, while ``np.matrix`` exists in every version.
    """
    sigma = np.matrix([[0.2, 0.1], [0.0, 0.1]])
    sigma_Trans = sigma.T
    sigma_inverse = sigma.I
    print("sigma: {}".format(sigma))
    print("sigma Inverse: {}".format(sigma_inverse))
    print("sigma Transform: {}".format(sigma_Trans))

def gauss_density_probability(n, x, mu, sigma):
    """Evaluate an n-dimensional Gaussian density at ``x``.

    The arguments are used with matrix semantics (``*`` as matrix product,
    ``.I`` as inverse, ``.T`` as transpose), so ``x``, ``mu`` and ``sigma``
    are expected to be ``np.matrix`` objects.

    :param n: dimensionality used in the normalisation constant
    :param x: sample as a row matrix
    :param mu: mean as a row matrix
    :param sigma: covariance matrix
    :return: the density, as the matrix produced by the quadratic form
    """
    offset = x - mu
    normaliser = pow(2 * np.pi, n / 2) * np.sqrt(np.linalg.det(sigma))
    kernel = np.exp(-0.5 * offset * sigma.I * offset.T)
    return kernel / normaliser


# Posterior probability smoke test
def test_posterior_probability():
    """Print the posterior of sample 0 under three equally weighted Gaussians.

    The component means are samples 5, 21 and 26 and every component shares
    the covariance 0.1 * I. ``np.matrix`` replaces the ``np.mat`` alias that
    was removed in NumPy 2.0.
    """
    xx = np.matrix([[x[0], y[0]]])
    mu_datasets = [np.matrix([[x[i], y[i]]]) for i in (5, 21, 26)]
    sigma = np.matrix([[0.1, 0.0], [0.0, 0.1]])
    det = np.linalg.det(sigma)
    print("det: {}".format(det))
    # Weighted likelihood of the sample under each component (mixing weight 1/3).
    p_all = [gauss_density_probability(2, xx, mu, sigma) / 3 for mu in mu_datasets]
    # The normaliser is the same for every component, so compute it once.
    p_sum = np.sum(p_all)
    p_mean = [p / p_sum for p in p_all]
    print("probability: {}".format(p_mean[0]))


def posterior_probability(k, steps):
    """Fit a k-component Gaussian mixture to the (x, y) samples.

    Each of the ``steps`` iterations computes the posterior of every sample
    under every component, assigns the sample to the component with the
    largest posterior, and then re-estimates the means, covariances and
    mixing weights from the posteriors.

    :param k: number of mixture components
    :param steps: number of iterations to run
    :return: tuple ``(p_all, mu_datasets, sigma_datasets, alpha_datasets,
        classification_temp)`` where ``p_all[j][i]`` is the posterior of
        sample j under component i from the last iteration, and
        ``classification_temp['cluster_<i>']`` lists the sample indices
        assigned to component i. With ``steps == 0`` the posteriors are empty
        and every cluster list is empty.
    """
    n_samples = len(x)
    # np.matrix is used throughout: the np.mat alias was removed in NumPy 2.0.
    alpha_datasets = [np.matrix([1 / k]) for _ in range(k)]
    xx = [np.matrix([[x[i], y[i]]]) for i in range(n_samples)]
    mu_rand = np.random.randint(0, 30, (1, k))
    print("random: {}".format(mu_rand[0]))
    # The first three components start from the fixed samples 5, 21 and 26;
    # any further component (k > 3) starts from one of the random indices so
    # that larger k no longer runs off the end of the initial mean list.
    seed_indices = [5, 21, 26][:k] + [int(idx) for idx in mu_rand[0][3:]]
    mu_datasets = [np.matrix([[x[i], y[i]]]) for i in seed_indices]
    sigma_datasets = [np.matrix([[0.1, 0.0], [0.0, 0.1]]) for _ in range(k)]

    # Defined up front so the return statement is valid even when steps == 0.
    p_all = []
    classification_temp = {'cluster_' + str(i): [] for i in range(k)}

    for _ in range(steps):
        p_all = []
        # Fresh, empty cluster membership lists for this iteration.
        classification_temp = {'cluster_' + str(i): [] for i in range(k)}

        # E-step: posterior of every sample under every component.
        for j in range(n_samples):
            p_group = [
                gauss_density_probability(2, xx[j], mu_datasets[i], sigma_datasets[i])
                * alpha_datasets[i].getA()[0][0]
                for i in range(k)
            ]
            p_sum = np.sum(p_group)
            # Hard assignment to the component with the largest posterior
            # (first one wins on ties).
            max_index = int(np.argmax([p[0, 0] for p in p_group]))
            classification_temp['cluster_' + str(max_index)].append(j)

            p_all.append([p / p_sum for p in p_group])

        # M-step: re-estimate mu, sigma and alpha for every component.
        mu_datasets = []
        sigma_datasets = []
        alpha_datasets = []

        for i in range(k):
            # Posteriors of all samples for component i (1x1 matrices).
            resp = [p_all[j][i] for j in range(n_samples)]
            resp_sum = sum(resp)

            mu_dataset = sum(r * point for r, point in zip(resp, xx)) / resp_sum
            mu_datasets.append(mu_dataset)

            scatter_sum = sum(
                r.getA()[0][0] * (point - mu_dataset).T * (point - mu_dataset)
                for r, point in zip(resp, xx)
            )
            sigma_datasets.append(scatter_sum / resp_sum)

            alpha_datasets.append(resp_sum / n_samples)
    return p_all, mu_datasets, sigma_datasets, alpha_datasets, classification_temp


def cluster_visiualization(k, steps):
    """Fit the mixture model and plot the raw samples and the clustering.

    The first figure shows all samples in red; the second shows one marker
    style per cluster and is saved to ``./images/gauss_cluster.png``.

    :param k: number of mixture components
    :param steps: number of fitting iterations
    """
    import os

    _, _, _, _, classification_temp = posterior_probability(k, steps)
    markers = ['.', 's', '^', '<', '>', 'P']

    # Figure 1: the raw samples.
    plt.figure(figsize=(8, 8))
    plt.xlim(0.1, 0.9)
    plt.ylim(0, 0.9)
    plt.grid(True)
    plt.scatter(x, y, color='r')

    # Figure 2: samples grouped by cluster. The axes are configured once;
    # calling plt.grid() without arguments per cluster toggles the grid, which
    # left it switched off for an even number of clusters.
    plt.figure(figsize=(8, 8))
    plt.xlim(0.1, 0.9)
    plt.ylim(0, 0.9)
    plt.grid(True)
    for i in range(k):
        members = classification_temp['cluster_' + str(i)]
        xx = [x[num] for num in members]
        yy = [y[num] for num in members]
        # Cycle through the marker list so more than six clusters still plot.
        plt.scatter(xx, yy, marker=markers[i % len(markers)])

    # savefig does not create missing directories.
    os.makedirs("./images", exist_ok=True)
    plt.savefig("./images/gauss_cluster.png", format="png")

# Script entry point: fit 3 components for 100 iterations and save the plot.
if __name__ == "__main__":
    cluster_visiualization(3, 100)

算法本身不复杂,可能涉及到矩阵求导的部分会麻烦一点。西瓜数据集太小了,收敛非常快。然后,这个算法同样对于初值敏感。

DBSCAN

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
# make_blobs is exported by sklearn.datasets itself; the old
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]  # cluster centres
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,random_state=0) # sample dataset

X = StandardScaler().fit_transform(X) # zero mean / unit variance, applied per feature rather than per sample

# #############################################################################
# Run density-based clustering (DBSCAN)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# db.labels_ holds the cluster index of every sample; -1 means unclustered.
# db.core_sample_indices_ holds the indices of all core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)  # all-False vector, one entry per sample
core_samples_mask[db.core_sample_indices_] = True  # mark the core samples
labels = db.labels_

# Number of clusters found (label -1 marks noise points and is not a cluster).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Model evaluation
print('估计的聚类个数为: %d' % n_clusters_)
print("同质性: %0.3f" % metrics.homogeneity_score(labels_true, labels))  # each cluster contains members of a single class only
print("完整性: %0.3f" % metrics.completeness_score(labels_true, labels))  # all members of a class land in the same cluster
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))  # harmonic mean of homogeneity and completeness
print("调整兰德指数: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("调整互信息: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels))
# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Noise points (label -1) are drawn in black; every cluster gets its own colour.
noise_rgba = (0, 0, 0, 1)
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    face = noise_rgba if k == -1 else tuple(col)

    # Boolean mask selecting every sample that carries this label.
    class_member_mask = (labels == k)

    # Core samples are drawn with large markers, the remaining members of the
    # cluster with small ones.
    for member_mask, size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        xy = X[class_member_mask & member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face,markeredgecolor='k', markersize=size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

DBSCAN的主要优点有:
1) 可以对任意形状的稠密数据集进行聚类,相对的,K-Means之类的聚类算法一般只适用于凸数据集。
2) 可以在聚类的同时发现异常点,对数据集中的异常点不敏感。
3) 聚类结果没有偏倚,相对的,K-Means之类的聚类算法初始值对聚类结果有很大影响。
DBSCAN的主要缺点有:
1)如果样本集的密度不均匀、聚类间距相差很大时,聚类质量较差,这时一般不适合使用DBSCAN聚类。
2)如果样本集较大时,聚类收敛时间较长,此时可以对搜索最近邻时建立的KD树或者球树进行规模限制来改进。
3) 调参相对于传统的K-Means之类的聚类算法稍复杂,主要需要对距离阈值ϵ,邻域样本数阈值MinPts联合调参,不同的参数组合对最后的聚类效果有较大影响。

AGNES

 #-*- coding:utf-8 -*-

import math
import pylab as pl

# Dataset: values come in groups of three — watermelon id, density, sugar content.
data = """
1,0.697,0.46,2,0.774,0.376,3,0.634,0.264,4,0.608,0.318,5,0.556,0.215,
6,0.403,0.237,7,0.481,0.149,8,0.437,0.211,9,0.666,0.091,10,0.243,0.267,
11,0.245,0.057,12,0.343,0.099,13,0.639,0.161,14,0.657,0.198,15,0.36,0.37,
16,0.593,0.042,17,0.719,0.103,18,0.359,0.188,19,0.339,0.241,20,0.282,0.257,
21,0.748,0.232,22,0.714,0.346,23,0.483,0.312,24,0.478,0.437,25,0.525,0.369,
26,0.751,0.489,27,0.532,0.472,28,0.473,0.376,29,0.725,0.445,30,0.446,0.459"""

# Parsing: dataset becomes a list of 30 (density, sugar content) tuples.
# Every third field starting at index 1 is a density and the field right
# after it the matching sugar content; the id fields are skipped.
a = data.split(',')
dataset = [(float(density), float(sugar)) for density, sugar in zip(a[1::3], a[2::3])]

# Euclidean distance between two points given as (x, y) tuples.
def dist(a, b):
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return math.sqrt(math.pow(dx, 2) + math.pow(dy, 2))

# Single linkage: smallest point-to-point distance between two clusters.
def dist_min(Ci, Cj):
    return min(min(dist(p, q) for q in Cj) for p in Ci)
# Complete linkage: largest point-to-point distance between two clusters.
def dist_max(Ci, Cj):
    return max(max(dist(p, q) for q in Cj) for p in Ci)
# Average linkage: mean point-to-point distance over all cross-cluster pairs.
def dist_avg(Ci, Cj):
    pair_count = len(Ci) * len(Cj)
    pair_distances = [dist(p, q) for p in Ci for q in Cj]
    return sum(pair_distances) / pair_count

# Locate the smallest off-diagonal entry of a distance matrix.
def find_Min(M):
    """Return ``(x, y, value)`` for the smallest entry of M with ``x != y``.

    The search starts from infinity instead of a fixed 1000, so matrices
    whose distances are all 1000 or larger are handled correctly. The first
    minimum in row-major order wins on ties. If M has no off-diagonal entry
    the result is ``(0, 0, inf)``.

    :param M: square matrix given as a list of rows
    :return: row index, column index and value of the minimum
    """
    best = float('inf')
    x = 0
    y = 0
    for i, row in enumerate(M):
        for j, value in enumerate(row):
            if i != j and value < best:
                best, x, y = value, i, j
    return (x, y, best)

# The clustering model:
def AGNES(dataset, dist, k):
    """Agglomerative clustering: merge the two closest clusters until k remain.

    :param dataset: iterable of sample points
    :param dist: cluster-to-cluster distance function taking two lists of
        points (for example dist_min, dist_max or dist_avg)
    :param k: number of clusters to stop at
    :return: list of clusters, each a list of points
    :raises ValueError: if k is smaller than 1 while the dataset is non-empty
    """
    def distance_matrix(clusters):
        # Full pairwise cluster distance matrix, diagonal included.
        return [[dist(ci, cj) for cj in clusters] for ci in clusters]

    # Every sample starts out as its own singleton cluster.
    C = [[point] for point in dataset]
    q = len(C)
    if q > 0 and k < 1:
        # Merging below one cluster is impossible; without this check the
        # loop empties C and then fails with an unrelated IndexError.
        raise ValueError("k must be at least 1")

    M = distance_matrix(C)
    # Merge and update.
    while q > k:
        x, y, _ = find_Min(M)
        C[x].extend(C[y])
        # Delete by position: list.remove() deletes the first cluster that
        # compares equal, which need not be the one at index y.
        del C[y]
        M = distance_matrix(C)
        q -= 1
    return C
# Plotting
def draw(C):
    """Scatter-plot every cluster in its own colour with a legend, then show it.

    :param C: list of clusters, each a list of (x, y) points
    """
    palette = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
    for index, cluster in enumerate(C):
        xs = [point[0] for point in cluster]  # x coordinates
        ys = [point[1] for point in cluster]  # y coordinates
        # Colours repeat once there are more clusters than palette entries.
        pl.scatter(xs, ys, marker='x', color=palette[index % len(palette)], label=index)

    pl.legend(loc='upper right')
    pl.show()

# Cluster the parsed samples into 3 groups using dist_avg as the
# cluster distance, then plot the result.
C = AGNES(dataset, dist_avg, 3)
draw(C)




AGNES算法比较简单,但一旦一组对象被合并,下一步的处理将在新生成的簇上进行。已做处理不能撤消,聚类之间也不能交换对象。增加新的样本对结果的影响较大。
假定在开始的时候有 $n$ 个簇,在结束的时候有 $1$ 个簇,因此在主循环中有 $n$ 次迭代;在第 $i$ 次迭代中,我们必须在 $n-i+1$ 个簇中找到最靠近的两个进行合并。另外算法必须计算所有对象两两之间的距离,因此这个算法的复杂度为 $O(n^2)$,该算法对于 $n$ 很大的情况是不适用的。

你可能感兴趣的:(机器学习,机器学习)