Clustering is an unsupervised learning method that works without labels; other common unsupervised tasks include density estimation and anomaly detection. Given a large set of unlabelled data, clustering partitions it into several categories (called clusters) according to the intrinsic similarity of the data, so that samples within a cluster are highly similar while samples in different clusters are dissimilar. Cluster analysis lets us extract useful insights from our data. In this article we will walk through several common clustering algorithms and discuss their strengths and weaknesses.
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt

plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# Load the dataset
data = pd.read_csv('xclara.csv')
# print(data.shape)
# data.head()

# Convert the CSV columns into a 2-D array
f1 = data['V1'].values
f2 = data['V2'].values

# Row-wise Euclidean distance between two sets of points
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

X = np.array(list(zip(f1, f2)))

# Number of clusters
k = 4
# Random x coordinates of the initial centroids
C_x = np.random.randint(0, np.max(X) - 20, size=k)
# Random y coordinates of the initial centroids
C_y = np.random.randint(0, np.max(X) - 20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)

# Stores the centroid coordinates before each update
C_old = np.zeros(C.shape)
print(C)

# Stores the index of the centroid each sample is assigned to
clusters = np.zeros(len(X))

# Convergence flag: distance between the old and new centroids
iteration_flag = dist(C, C_old, 1)

fig, ax = plt.subplots()
plt.scatter(f1, f2, c='black', s=6)
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='blue')

tmp = 1
# Exit the loop once the centroids stop changing or 20 iterations are reached (the cap can be removed)
while iteration_flag.any() != 0 and tmp < 20:
    # Assign each point to its nearest centroid
    for i in range(len(X)):
        # Distances from this point to every centroid
        distances = dist(X[i], C, 1)
        # print(distances)
        # Index (0..k-1) of the closest centroid
        cluster = np.argmin(distances)
        # Record which centroid this sample is closest to
        clusters[i] = cluster
    # Deep-copy the current centroids before updating them
    # print("the distinct of clusters: ", set(clusters))
    C_old = deepcopy(C)
    # Collect the points belonging to each centroid and average them column-wise
    for i in range(k):
        points = [X[j] for j in range(len(X)) if clusters[j] == i]
        # print(points)
        # print(np.mean(points, axis=0))
        if points:  # guard against empty clusters, which would make np.mean return NaN
            C[i] = np.mean(points, axis=0)
    # Distance between the old and new centroids
    print('Iteration %d' % tmp)
    tmp = tmp + 1
    iteration_flag = dist(C, C_old, 1)
    print("Distance between the new and old centroids:", iteration_flag)

# Plot the final result
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
# Draw each cluster in a different colour
for i in range(k):
    points = np.array([X[j] for j in range(len(X)) if clusters[j] == i])
    if len(points):
        ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='black')
plt.show()
Advantages: K-Means is simple to implement, converges quickly, and scales well to large datasets. Its main drawbacks are that the number of clusters k must be fixed in advance, the result depends strongly on the randomly chosen initial centroids, and it only works well on convex, roughly spherical clusters.
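For comparison, scikit-learn's KMeans uses k-means++ seeding by default, which largely mitigates the sensitivity to the initial centroids. Below is a minimal sketch, not part of the hand-written loop above; it assumes the X array built from xclara.csv earlier.

from sklearn.cluster import KMeans

# k-means++ seeding plus several restarts (n_init) makes the result far less
# dependent on any single random initialisation.
km = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=0)
labels = km.fit_predict(X)      # cluster index for every sample
print(km.cluster_centers_)      # final centroids
print(km.inertia_)              # within-cluster sum of squared distances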
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 29 20:22:18 2019
@author: zmddzf
"""
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt


class LVQ:
    """
    Learning Vector Quantization (LVQ) implementation
    attributes:
        train: train the prototype vectors
        predict: predict which cluster a sample belongs to
    """
    def __init__(self, D, T, lr, maxEpoch):
        """
        LVQ constructor
        :param D: training set, formatted as [[array, label], ...]
        :param T: class labels of the prototype vectors
        :param lr: learning rate, between 0 and 1
        :param maxEpoch: maximum number of iterations
        """
        self.D = D
        self.T = T
        self.lr = lr
        self.maxEpoch = maxEpoch
        self.P = []
        # Initialise the prototype vectors by random selection, one per class label
        for t in T:
            while True:
                p = random.choice(self.D)
                if p[1] == t:
                    # copy the sample so that updating the prototype does not modify the training data
                    self.P.append([p[0].copy(), p[1]])
                    break

    def __dist(self, p1, p2):
        """
        Private helper: Euclidean distance
        :param p1: vector 1
        :param p2: vector 2
        :return dist: distance
        """
        dist = np.linalg.norm(p1 - p2)
        return dist

    def train(self):
        """
        Train the LVQ prototypes
        :return self.P: prototype vectors after training
        """
        for epoch in tqdm(range(self.maxEpoch)):
            x = random.choice(self.D)  # pick a random sample from the training set
            dist = []
            for p in self.P:
                dist.append(self.__dist(p[0], x[0]))  # distances to all prototypes
            winner = dist.index(min(dist))  # index of the nearest prototype
            t = self.P[winner][1]           # class label of that prototype
            if t == x[1]:
                # same class: move the prototype towards the sample
                self.P[winner][0] = self.P[winner][0] + self.lr * (x[0] - self.P[winner][0])
            else:
                # different class: move the prototype away from the sample
                self.P[winner][0] = self.P[winner][0] - self.lr * (x[0] - self.P[winner][0])
        return self.P

    def predict(self, x):
        """
        Predict which cluster a sample belongs to
        :param x: sample vector
        :return label: predicted label
        """
        dist = []
        for p in self.P:
            dist.append(self.__dist(p[0], x))
        label = self.P[dist.index(min(dist))][1]
        return label


# Generate a toy dataset: two 2-D point sets drawn from normal distributions
mu1 = 2; sigma1 = 1
mu2 = 4; sigma2 = 1

# First normal distribution
samples1 = np.array([np.random.normal(mu1, sigma1, 50), np.random.normal(mu1, sigma1, 50)])
samples1 = samples1.T.tolist()
label1 = [1 for i in range(50)]

# Second normal distribution
samples2 = np.array([np.random.normal(mu2, sigma2, 50), np.random.normal(mu2, sigma2, 50)])
samples2 = samples2.T.tolist()
label2 = [0 for i in range(50)]

# Merge into a single dataset
samples = samples1 + samples2
labels = label1 + label2

# Convert to the [[array, label], ...] format
data = []
for s, l in zip(samples, labels):
    data.append([np.array(s), l])

# Train
lvq = LVQ(data, [0, 1], 0.1, 5000)
vector = lvq.train()

# Classify with the trained LVQ
prediction = []
for i in data:
    prediction.append(lvq.predict(i[0]))

# Compute the accuracy
accuracy = 0
for pred, label in zip(prediction, labels):
    if pred == label:
        accuracy += 1
accuracy = accuracy / len(data)
print("accuracy of LVQ:", accuracy)

# Plot the prototype vectors and the sample scatter
plt.figure(figsize=(15, 10))
plt.scatter(np.array(samples).T[0], np.array(samples).T[1], c=labels)
plt.scatter(vector[0][0][0], vector[0][0][1], marker='*', s=300)
plt.scatter(vector[1][0][0], vector[1][0][1], marker='*', s=300)
plt.show()
LVQ is itself a form of competitive learning, which makes it quite similar to the unsupervised SOM algorithm. LVQ can be viewed as a network consisting of an input layer, a competitive layer and an output layer. The input layer simply receives the samples; the competitive layer can be seen as competition between neurons, that is, between prototype vectors, where the nearest neuron (prototype vector) wins, winner-take-all; the output layer produces the classification result. However you frame the algorithm, the essence is the same: prototypes are pulled towards samples of the same class and pushed away from samples of a different class.
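As a concrete illustration of one winner-take-all update step, here is a minimal, self-contained sketch; the sample, the prototype vectors and the learning rate are made up purely for the example and are independent of the class above.

import numpy as np

lr = 0.1
x, x_label = np.array([0.70, 0.40]), 1                       # one training sample
prototypes = [np.array([0.2, 0.2]), np.array([0.8, 0.5])]    # prototype vectors
proto_labels = [0, 1]

# Competition: the prototype closest to x wins
winner = int(np.argmin([np.linalg.norm(x - p) for p in prototypes]))

# Update: pull the winner towards x if the labels agree, push it away otherwise
if proto_labels[winner] == x_label:
    prototypes[winner] = prototypes[winner] + lr * (x - prototypes[winner])
else:
    prototypes[winner] = prototypes[winner] - lr * (x - prototypes[winner])
print(winner, prototypes[winner])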
import matplotlib.pyplot as plt
import numpy as np

# Raw data (watermelon dataset: density and sugar content)
x = [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243,
     0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719, 0.359, 0.339, 0.282,
     0.748, 0.714, 0.483, 0.478, 0.525, 0.751, 0.532, 0.473, 0.725, 0.446]
y = [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267,
     0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103, 0.188, 0.241, 0.257,
     0.232, 0.346, 0.312, 0.437, 0.369, 0.489, 0.472, 0.376, 0.445, 0.459]


# Matrix sanity check
def test_matrix():
    sigma = np.mat([[0.2, 0.1], [0.0, 0.1]])
    sigma_Trans = sigma.T
    sigma_inverse = sigma.I
    print("sigma: {}".format(sigma))
    print("sigma Inverse: {}".format(sigma_inverse))
    print("sigma Transform: {}".format(sigma_Trans))


def gauss_density_probability(n, x, mu, sigma):
    # Multivariate Gaussian density for an n-dimensional sample x
    sigma_det = np.linalg.det(sigma)
    divisor = pow(2 * np.pi, n / 2) * np.sqrt(sigma_det)
    exp = np.exp(-0.5 * (x - mu) * sigma.I * (x - mu).T)
    p = exp / divisor
    return p


# Posterior probability test
def test_posterior_probability():
    xx = np.mat([[x[0], y[0]]])
    mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
    sigma = np.mat([[0.1, 0.0], [0.0, 0.1]])
    det = np.linalg.det(sigma)
    print("det: {}".format(det))
    p_all = []
    for mu in mu_datasets:
        p = gauss_density_probability(2, xx, mu, sigma)
        p = p / 3
        p_all.append(p)
    p_mean = []
    for p in p_all:
        p_sum = np.sum(p_all)
        p = p / p_sum
        p_mean.append(p)
    print("probability: {}".format(p_mean[0]))


def posterior_probability(k, steps):
    # Mixture weights, samples, initial means and covariances
    alpha_datasets = [np.mat([1 / k]) for _ in range(k)]
    xx = [np.mat([[x[i], y[i]]]) for i in range(len(x))]
    mu_rand = np.random.randint(0, 30, (1, k))
    print("random: {}".format(mu_rand[0]))
    # mu_datasets = [np.mat([[x[i], y[i]]]) for i in mu_rand[0]]
    mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
    sigma_datasets = [np.mat([[0.1, 0.0], [0.0, 0.1]]) for _ in range(k)]
    # det = np.linalg.det(sigma_datasets[0])
    for step in range(steps):
        p_all = []
        # One cluster container per component (a plain dict instead of writing into locals())
        classification_temp = {}
        for i in range(k):
            classification_temp['cluster_' + str(i)] = []
        # E-step: posterior probability of each sample under each component
        for j in range(len(x)):
            p_group = []
            for i in range(k):
                mu = mu_datasets[i]
                p = gauss_density_probability(2, xx[j], mu, sigma_datasets[i])
                p = p * alpha_datasets[i].getA()[0][0]
                p_group.append(p)
            p_sum = np.sum(p_group)
            # Assign the sample to the component with the largest posterior
            max_p = max(p_group)
            max_index = p_group.index(max_p)
            classification_temp['cluster_' + str(max_index)].append(j)
            p_group = [p_group[i] / p_sum for i in range(len(p_group))]
            p_all.append(p_group)
        # M-step: update mu, sigma and alpha
        mu_datasets = []
        sigma_datasets = []
        alpha_datasets = []
        for i in range(k):
            mu_temp_numerator = 0
            mu_temp_denominator = 0
            sigma_temp = 0
            alpha_temp = 0
            mu_numerator = [p_all[j][i] * xx[j] for j in range(len(x))]
            for mm in mu_numerator:
                mu_temp_numerator += mm
            mu_denominator = [p_all[j][i] for j in range(len(x))]
            for nn in mu_denominator:
                mu_temp_denominator += nn
            mu_dataset = mu_temp_numerator / mu_temp_denominator
            mu_datasets.append(mu_dataset)
            sigma = [p_all[j][i].getA()[0][0] * (xx[j] - mu_dataset).T * (xx[j] - mu_dataset) for j in range(len(x))]
            for ss in sigma:
                sigma_temp += ss
            sigma_dataset = sigma_temp / mu_temp_denominator
            sigma_datasets.append(sigma_dataset)
            alpha_new = [p_all[j][i] for j in range(len(x))]
            for alpha_nn in alpha_new:
                alpha_temp += alpha_nn
            alpha_dataset = alpha_temp / len(x)
            alpha_datasets.append(alpha_dataset)
    return p_all, mu_datasets, sigma_datasets, alpha_datasets, classification_temp


def cluster_visiualization(k, steps):
    post_probability, mu_datasets, sigma_datasets, alpha_datasets, classification_temp = posterior_probability(k, steps)
    # Raw data
    plt.figure(figsize=(8, 8))
    markers = ['.', 's', '^', '<', '>', 'P']
    plt.xlim(0.1, 0.9)
    plt.ylim(0, 0.9)
    plt.grid()
    plt.scatter(x, y, color='r')
    # Clustering result
    plt.figure(figsize=(8, 8))
    for i in range(k):
        # Fetch and plot the samples assigned to cluster i
        xx = [x[num] for num in classification_temp['cluster_' + str(i)]]
        yy = [y[num] for num in classification_temp['cluster_' + str(i)]]
        plt.xlim(0.1, 0.9)
        plt.ylim(0, 0.9)
        plt.grid()
        plt.scatter(xx, yy, marker=markers[i])
    plt.savefig("./images/gauss_cluster.png", format="png")  # assumes an ./images directory exists


if __name__ == "__main__":
    cluster_visiualization(3, 100)
The algorithm itself is not complicated; the only somewhat awkward part is the matrix calculus needed to derive the update equations. The watermelon dataset is tiny, so convergence is very fast. Note that, like K-Means, this algorithm is sensitive to the initial values.
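To sanity-check the hand-written EM loop, the same data can be fed to scikit-learn's GaussianMixture. This is only a minimal sketch for cross-checking, not the author's implementation; it assumes the x and y lists defined above and uses one full covariance matrix per component.

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.column_stack([x, y])                       # 30 samples, 2 features
gm = GaussianMixture(n_components=3, covariance_type='full',
                     max_iter=100, random_state=0).fit(X)
print(gm.means_)                                  # component means (mu)
print(gm.weights_)                                # mixture weights (alpha)
labels = gm.predict(X)                            # hard cluster assignment
print(labels)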
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs  # in older scikit-learn versions: sklearn.datasets.samples_generator
from sklearn.preprocessing import StandardScaler

# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]  # cluster centers
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)  # sample dataset
X = StandardScaler().fit_transform(X)  # StandardScaler removes the mean and scales to unit variance, per feature rather than per sample

# #############################################################################
# Run density-based clustering with DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# print(db.labels_)               # cluster index of every sample; -1 means noise
# print(db.core_sample_indices_)  # indices of all core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)  # boolean vector, one entry per sample, initially all False
core_samples_mask[db.core_sample_indices_] = True          # mark the core samples as True
labels = db.labels_

# Number of clusters (label -1 marks noise points, not a cluster)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Model evaluation
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))    # each cluster contains only members of a single class
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))  # all members of a given class are assigned to the same cluster
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))        # harmonic mean of homogeneity and completeness
print("Adjusted Rand index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted mutual information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Noise points are drawn in black
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:  # samples labelled -1 are noise
        # draw noise points in black
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)  # True for every sample belonging to this cluster
    xy = X[class_member_mask & core_samples_mask]   # core samples of this cluster, drawn with large markers
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14)
    xy = X[class_member_mask & ~core_samples_mask]  # non-core samples of this cluster, drawn with small markers
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
The main advantages of DBSCAN are:
1) It can cluster dense datasets of arbitrary shape; by contrast, algorithms such as K-Means generally only work well on convex datasets.
2) It can detect outliers while clustering and is not sensitive to outliers in the dataset.
3) The clustering result carries no initialisation bias; by contrast, the initial values of algorithms such as K-Means strongly influence the result.
The main disadvantages of DBSCAN are:
1) When the density of the dataset is uneven and the within-cluster distances vary widely, the clustering quality is poor, and DBSCAN is generally not a good fit.
2) For large datasets, convergence takes a long time; this can be improved by limiting the size of the KD-tree or ball tree built for the nearest-neighbour search.
3) Parameter tuning is slightly more involved than for traditional algorithms such as K-Means: the distance threshold ϵ and the neighbourhood size threshold MinPts must be tuned jointly, and different combinations can change the final result considerably (see the k-distance sketch after this list).
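A common heuristic for choosing eps is the k-distance plot: sort every sample's distance to its MinPts-th nearest neighbour and read eps off the "elbow" of the curve. Below is a minimal sketch of that heuristic; it assumes the standardised X array and the min_samples=10 setting from the DBSCAN example above.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

min_samples = 10                                   # same MinPts as above
nn = NearestNeighbors(n_neighbors=min_samples).fit(X)
distances, _ = nn.kneighbors(X)                    # distances to the min_samples nearest neighbours
k_dist = np.sort(distances[:, -1])                 # distance to the MinPts-th neighbour, sorted ascending
plt.plot(k_dist)
plt.ylabel('distance to %d-th nearest neighbour' % min_samples)
plt.show()                                         # pick eps near the elbow of this curve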
#-*- coding:utf-8 -*-
import math
import pylab as pl

# Dataset: each group of three values is a watermelon's id, density and sugar content
data = """
1,0.697,0.46,2,0.774,0.376,3,0.634,0.264,4,0.608,0.318,5,0.556,0.215,
6,0.403,0.237,7,0.481,0.149,8,0.437,0.211,9,0.666,0.091,10,0.243,0.267,
11,0.245,0.057,12,0.343,0.099,13,0.639,0.161,14,0.657,0.198,15,0.36,0.37,
16,0.593,0.042,17,0.719,0.103,18,0.359,0.188,19,0.339,0.241,20,0.282,0.257,
21,0.748,0.232,22,0.714,0.346,23,0.483,0.312,24,0.478,0.437,25,0.525,0.369,
26,0.751,0.489,27,0.532,0.472,28,0.473,0.376,29,0.725,0.445,30,0.446,0.459"""

# Parse the data: dataset is a list of 30 (density, sugar content) tuples
a = data.split(',')
dataset = [(float(a[i]), float(a[i + 1])) for i in range(1, len(a) - 1, 3)]

# Euclidean distance between two tuples a and b
def dist(a, b):
    return math.sqrt(math.pow(a[0] - b[0], 2) + math.pow(a[1] - b[1], 2))

# Single linkage: minimum distance between two clusters
def dist_min(Ci, Cj):
    return min(dist(i, j) for i in Ci for j in Cj)

# Complete linkage: maximum distance between two clusters
def dist_max(Ci, Cj):
    return max(dist(i, j) for i in Ci for j in Cj)

# Average linkage: mean distance between two clusters
def dist_avg(Ci, Cj):
    return sum(dist(i, j) for i in Ci for j in Cj) / (len(Ci) * len(Cj))

# Find the indices of the closest pair of clusters in the distance matrix
def find_Min(M):
    min_dist = 1000
    x = 0; y = 0
    for i in range(len(M)):
        for j in range(len(M[i])):
            if i != j and M[i][j] < min_dist:
                min_dist = M[i][j]; x = i; y = j
    return (x, y, min_dist)

# The algorithm itself
def AGNES(dataset, dist, k):
    # Initialise the clusters C and the distance matrix M
    C = []; M = []
    for i in dataset:
        Ci = []
        Ci.append(i)
        C.append(Ci)
    for i in C:
        Mi = []
        for j in C:
            Mi.append(dist(i, j))
        M.append(Mi)
    q = len(dataset)
    # Merge and update until only k clusters remain
    while q > k:
        x, y, min_dist = find_Min(M)
        C[x].extend(C[y])
        C.remove(C[y])
        M = []
        for i in C:
            Mi = []
            for j in C:
                Mi.append(dist(i, j))
            M.append(Mi)
        q -= 1
    return C

# Plot the clusters
def draw(C):
    colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
    for i in range(len(C)):
        coo_X = []  # x coordinates
        coo_Y = []  # y coordinates
        for j in range(len(C[i])):
            coo_X.append(C[i][j][0])
            coo_Y.append(C[i][j][1])
        pl.scatter(coo_X, coo_Y, marker='x', color=colValue[i % len(colValue)], label=i)
    pl.legend(loc='upper right')
    pl.show()

C = AGNES(dataset, dist_avg, 3)
draw(C)
The AGNES algorithm is fairly simple, but once a group of objects has been merged, all subsequent processing works on the newly formed cluster: merges cannot be undone and objects cannot be swapped between clusters, so adding new samples can change the result considerably.
Assume there are n clusters at the start and 1 cluster at the end, so the main loop runs n iterations; in the i-th iteration we must find the closest pair among the n−i+1 remaining clusters and merge them. In addition, the algorithm has to compute the pairwise distances between all objects, so its complexity is O(n²), which makes it unsuitable when n is very large.
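For anything beyond toy data, a library implementation is usually preferable to the O(n²) hand-written loops above. Below is a minimal sketch using scikit-learn's AgglomerativeClustering on the same watermelon data; it assumes the dataset list built above, and 'average' linkage corresponds to the dist_avg rule used in the AGNES call.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array(dataset)                              # 30 samples, 2 features
agg = AgglomerativeClustering(n_clusters=3, linkage='average')
labels = agg.fit_predict(X)                        # cluster index of every sample
print(labels)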