Clustering by fast search and find of density peaks(通过快速搜索和密度峰值来聚类)一文介绍了一种新的无监督聚类算法。
参考:https://blog.csdn.net/itplus/article/details/38926837
该算法的核心思想:
- 类簇中心的局部密度很高,并且由一些局部密度比较低的点围绕
- 类簇中心距离其他有高局部密度的点的距离都比较大
代码参考:https://blog.csdn.net/kryolith/article/details/39832573
首先大家可以看出我的代码时间复杂度十分的高,还不如用C来实现的快,完全没有体现出Python的高雅;
然后我的代码与其说是聚类,倒不如说是分类,因为我指定了类的个数;
- 不过论文中也提到了该聚类方法的聚类中心选择需要人工干预
以及我的代码里面有一个致命的bug,会导致聚类失败,如下图所示:
虽然我已经修复了这个bug,但是为了防止各位看都不看就cv,我还是把有bug的代码放了上来。
最后,我并没有排除噪声。
综上,我的代码重在理解。
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.datasets import make_blobs
class ScanClass:
def __init__(self, position, density, distance):
self.position = position
self.density = density
self.distance = distance
self.belong = -1
self.judge = density * distance
def __eq__(self, other):
return self.density == other.density
def __le__(self, other):
return self.density > other.density
def __gt__(self, other):
return self.density < other.density
def distanceNorm(Norm,D_value): # 求两点间距离
# initialization
# Norm for distance
if Norm == '1': # 一维
counter = np.absolute(D_value); # 求绝对值
counter = np.sum(counter);
elif Norm == '2': # 二维距离
counter = np.power(D_value,2); # 数组元素每个求二次方
counter = np.sum(counter);
counter = np.sqrt(counter);
elif Norm == 'Infinity':
counter = np.absolute(D_value);
counter = np.max(counter);
else:
raise Exception('None.');
return counter;
def chi(x):
if x < 0:
return 1;
else:
return 0;
def fit(features,labels,t,distanceMethod = '2'):
# 坐标 点个数 t 二维
# 两点之间的距离矩阵
distance = np.zeros((len(labels),len(labels)));
# 所有距离列表
distance_sort = list();
# 每个点的局部密度
density = np.zeros(len(labels));
#
distance_higherDensity = np.zeros(len(labels));
# 计算每个点到其它点的距离
for index_i in range(len(labels)):
for index_j in range(index_i+1,len(labels)):
D_value = features[index_i] - features[index_j];
distance[index_i,index_j] = distanceNorm(distanceMethod,D_value);
distance_sort.append(distance[index_i,index_j]);
distance += distance.T; # 距离矩阵加上自身转置
# compute optimal cutoff 计算最优截断距离
# dc可以选择为平均每个点的邻居局部密度为数据总数的1-2% t应该就是这个百分比
distance_sort = np.array(distance_sort);
cutoff = int(np.round(distance_sort[int(len(distance_sort) * t)]));
dict1 = {}
# computer density 计算高斯局部密度
for i in range(len(labels) - 1):
for j in range(i + 1, len(labels)):
density[i] = density[i] + np.exp(-(distance[i, j] / cutoff) * (distance[i, j] / cutoff))
density[j] = density[j] + np.exp(-(distance[i, j] / cutoff) * (distance[i, j] / cutoff))
density = [item / max(density) for item in density]
# 寻找密度最大值
Max = np.max(density);
MaxIndexList = list();
for index_i in range(len(labels)):
if density[index_i] == Max:
MaxIndexList.extend([index_i]); # 列表末追加
print(len(MaxIndexList))
# computer distance_higherDensity
"""
对于非局部密度最大点,计算距离δi实际上分两步
- 找到所有局部密度比i点高的点;
- 在这些点中找到距离i点最近的那个点j,i和j的距离就是δi的值。
对于局部密度最大点,δi实际上是该点和其他所有点距离值的最大值。
"""
list0 = []
Min = 0;
for index_i in range(len(labels)):
if index_i in MaxIndexList:
distance_higherDensity[index_i] = np.max(distance[index_i]); # 保存密度中心到最远点的距离
list0.append(ScanClass((features[index_i][0], features[index_i][1]), density[index_i], distance_higherDensity[i]))
continue;
else:
Min = np.max(distance[index_i]);
for index_j in range(1,len(labels)):
if density[index_i] < density[index_j] and distance[index_i,index_j] < Min:
Min = distance[index_i,index_j];
else:
continue;
distance_higherDensity[index_i] = Min;
list0.append(ScanClass((features[index_i][0], features[index_i][1]), density[index_i], distance_higherDensity[i]))
sorted_core = sorted(list0, key=lambda a: -a.judge, reverse = True)
list0 = sorted(list0)
"""
for index_i in range(len(labels)):
print((list0[index_i].density, list0[index_i].distance_higher))
"""
sorted_core[0].belong = 0
sorted_core[1].belong = 1
list1 = [sorted_core[0]]
list2 = [sorted_core[1]]
for index_i in range(2, len(labels)):
Min1 = math.hypot(list0[index_i].position[0] - list1[0].position[0], list0[index_i].position[1] - list1[0].position[1])
Min2 = math.hypot(list0[index_i].position[0] - list2[0].position[0], list0[index_i].position[1] - list2[0].position[1])
for i in list1:
dis = math.hypot(list0[index_i].position[0] - i.position[0], list0[index_i].position[1] - i.position[1])
if dis < Min1:
Min1 = dis
for i in list2:
dis = math.hypot(list0[index_i].position[0] - i.position[0], list0[index_i].position[1] - i.position[1])
if dis < Min2:
Min2 = dis
if Min1 < Min2:
list0[index_i].belong = 1
list1.append(list0[index_i])
if Min1 > Min2:
list0[index_i].belong = 2
list2.append(list0[index_i])
return list1, list2, cutoff;
lablesNum = 100
"""
n_samples:表示数据样本点个数,默认值100
n_features:表示数据的维度,默认值是2
centers:产生数据的中心点,默认值3
cluster_std:数据集的标准差,浮点数或者浮点数序列,默认值1.0
center_box:中心确定之后的数据边界,默认值(-10.0, 10.0)
shuffle :洗乱,默认值是True
random_state:官网解释是随机生成器的种子
"""
data, lable = make_blobs(n_features=2, n_samples=100, centers=2, random_state=3,cluster_std=[0.5, 0.7])
list1, list2, cutoff = fit(data, lable, 0.02)
x_values = [x[0] for x in data]
y_values = [x[1] for x in data]
plt.scatter(x_values, y_values)
plt.show()
x_values1 = [x.position[0] for x in list1]
y_values1 = [x.position[1] for x in list1]
plt.scatter(x_values1, y_values1)
x_values2 = [x.position[0] for x in list2]
y_values2 = [x.position[1] for x in list2]
plt.scatter(x_values2, y_values2)
plt.show()
正常运行效果:
我见过凌晨四点的武汉(乐
睡觉去了。明天第一节课翘了吧。