Clustering by fast search and find of density peaks: notes on the density-peaks clustering algorithm, with a Python implementation

Clustering by fast search and find of density peaks: study notes and a Python implementation (data-mining course project at "Mafangshan Lab-Report University")

  • Skip my rambling and jump straight here
  • Algorithm analysis
  • Code
    • Notes

The paper Clustering by fast search and find of density peaks introduces a new unsupervised clustering algorithm.

Skip my rambling and jump straight here

Algorithm analysis

Reference: https://blog.csdn.net/itplus/article/details/38926837

The core ideas of the algorithm:

  • A cluster center has a high local density and is surrounded by points of lower local density
  • A cluster center is comparatively far from every point of higher local density (a sketch of both quantities follows this list)
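
These two quantities are usually written ρ (local density) and δ (distance to the nearest denser point). Below is a minimal NumPy sketch of both, assuming a precomputed symmetric distance matrix dist and a cutoff distance dc (the names are mine; it uses the paper's simple cutoff kernel rather than the Gaussian kernel my code below uses):

import numpy as np

def rho_delta(dist, dc):
    """rho and delta for every point, following the paper's definitions."""
    n = dist.shape[0]
    # rho_i: number of points within the cutoff distance dc (cutoff kernel)
    rho = (dist < dc).sum(axis=1) - 1              # minus 1 to exclude the point itself
    delta = np.zeros(n)
    for i in range(n):
        higher = np.where(rho > rho[i])[0]         # all points denser than i
        if len(higher) == 0:                       # i is the global density peak
            delta[i] = dist[i].max()
        else:
            delta[i] = dist[i, higher].min()       # nearest denser point
    return rho, delta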

Code

Notes

Code reference: https://blog.csdn.net/kryolith/article/details/39832573

First of all, as you can see, the time complexity of my code is awful; a C implementation would be faster, and it does nothing to show off Python's elegance. The pairwise-distance step alone is an O(n²) double loop (a vectorization sketch follows).
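
As an aside, that pairwise-distance loop at least could be vectorized; a minimal sketch, assuming SciPy is available:

from scipy.spatial.distance import pdist, squareform

# condensed vector of all n*(n-1)/2 pairwise Euclidean distances,
# expanded into the full symmetric n-by-n matrix
condensed = pdist(features, metric='euclidean')
distance = squareform(condensed)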

Secondly, my code is really classification rather than clustering, because I fixed the number of clusters in advance;

  • That said, the paper itself notes that picking the cluster centers requires manual intervention, via its "decision graph" (a sketch follows this item)
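
Concretely, the decision graph is just δ plotted against ρ: the cluster centers are the few points that are outliers in both. A minimal sketch, reusing the rho and delta arrays from the sketch in the algorithm-analysis section:

import matplotlib.pyplot as plt

# decision graph: cluster centers are the points with anomalously
# large rho AND large delta, i.e. the upper-right outliers
plt.scatter(rho, delta)
plt.xlabel('rho (local density)')
plt.ylabel('delta (distance to nearest denser point)')
plt.title('decision graph')
plt.show()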

My code also originally contained a fatal bug that made the clustering fail, as shown in the figure below:

That bug is fixed in the listing here (along with a few smaller ones), so please read the code instead of blindly copy-pasting it.

Finally, I did not filter out noise. The paper handles noise through a "cluster halo" step; a sketch of that follows.
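
Roughly, the paper's noise handling works like this: for each cluster, define a border region of members lying within dc of a point of another cluster, take the highest density ρ_b inside that border, and demote members with ρ < ρ_b to the cluster "halo". A hedged sketch under my own naming (dist, rho, labels as NumPy arrays):

import numpy as np

def halo_mask(dist, rho, labels, dc):
    """True for points falling in a cluster's halo (treated as noise)."""
    is_halo = np.zeros(len(rho), dtype=bool)
    for c in np.unique(labels):
        members = np.where(labels == c)[0]
        others = np.where(labels != c)[0]
        if len(others) == 0:
            continue
        # border region: members with a foreign point closer than dc
        border = [i for i in members if dist[i, others].min() < dc]
        if not border:
            continue
        rho_b = rho[border].max()              # border density of this cluster
        is_halo[members] = rho[members] < rho_b
    return is_halo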

All in all, my code is meant as an aid to understanding, nothing more.

import numpy as np
import matplotlib.pyplot as plt
import math 
from sklearn.datasets import make_blobs

class ScanClass:
    def __init__(self, position, density, distance):
        self.position = position          # (x, y) coordinates of the point
        self.density = density            # local density rho
        self.distance = distance          # delta: distance to the nearest denser point
        self.belong = -1                  # cluster label, -1 means unassigned
        self.judge = density * distance   # gamma = rho * delta; large for cluster centers

    def __eq__(self, other):
        return self.density == other.density

    # comparisons are deliberately inverted so that sorted() orders
    # ScanClass instances by *descending* density
    def __lt__(self, other):
        return self.density > other.density

    def __gt__(self, other):
        return self.density < other.density
 
 
def distanceNorm(Norm, D_value):
    # distance between two points under the chosen norm
    if Norm == '1':            # L1 (Manhattan) norm: sum of absolute differences
        counter = np.absolute(D_value)
        counter = np.sum(counter)
    elif Norm == '2':          # L2 (Euclidean) norm: root of summed squares
        counter = np.power(D_value, 2)
        counter = np.sum(counter)
        counter = np.sqrt(counter)
    elif Norm == 'Infinity':   # L-infinity norm: largest absolute difference
        counter = np.absolute(D_value)
        counter = np.max(counter)
    else:
        raise Exception('Unknown norm.')

    return counter
 
 
def chi(x):
    # indicator function chi(x) from the paper's cutoff-kernel density;
    # unused here because the Gaussian kernel is used instead
    if x < 0:
        return 1
    else:
        return 0
 
 
def fit(features, labels, t, distanceMethod='2'):
    # features: point coordinates; labels: used only for the point count;
    # t: quantile of the pairwise distances to use as the cutoff distance dc
    # pairwise distance matrix
    distance = np.zeros((len(labels), len(labels)))
    # flat list of all pairwise distances
    distance_sort = list()
    # local density rho of every point
    density = np.zeros(len(labels))
    # delta: distance to the nearest point of higher density
    distance_higherDensity = np.zeros(len(labels))

    # compute the distance from every point to every other point
    for index_i in range(len(labels)):
        for index_j in range(index_i + 1, len(labels)):
            D_value = features[index_i] - features[index_j]
            distance[index_i, index_j] = distanceNorm(distanceMethod, D_value)
            distance_sort.append(distance[index_i, index_j])
    distance += distance.T        # mirror the upper triangle to make the matrix symmetric

    # compute the cutoff distance dc: the paper suggests choosing dc so that the
    # average number of neighbours is about 1-2% of the data set, i.e. the
    # t-quantile of the sorted pairwise distances
    distance_sort = np.sort(np.array(distance_sort))
    cutoff = distance_sort[int(len(distance_sort) * t)]

    # compute the Gaussian local density:
    # rho_i = sum_j exp(-(d_ij / dc)^2), then normalised to [0, 1]
    for i in range(len(labels) - 1):
        for j in range(i + 1, len(labels)):
            density[i] = density[i] + np.exp(-(distance[i, j] / cutoff) * (distance[i, j] / cutoff))
            density[j] = density[j] + np.exp(-(distance[i, j] / cutoff) * (distance[i, j] / cutoff))
    density = [item / max(density) for item in density]

    # find the point(s) of maximum density
    Max = np.max(density)
    MaxIndexList = list()
    for index_i in range(len(labels)):
        if density[index_i] == Max:
            MaxIndexList.append(index_i)
    print(len(MaxIndexList))      # how many points attain the maximum density
	
    # compute delta (distance_higherDensity)
    """
    For a point that is NOT the global density maximum, delta_i is computed
    in two steps:
    - find all points whose local density is higher than that of point i;
    - among them, find the point j closest to i; the distance between i
      and j is the value of delta_i.

    For the global density maximum, delta_i is the maximum of its
    distances to all other points.
    """
    list0 = []
    Min = 0
    for index_i in range(len(labels)):
        if index_i in MaxIndexList:
            # for a density peak, store the distance to the farthest point
            distance_higherDensity[index_i] = np.max(distance[index_i])
            list0.append(ScanClass((features[index_i][0], features[index_i][1]), density[index_i], distance_higherDensity[index_i]))
            continue
        else:
            Min = np.max(distance[index_i])
        for index_j in range(len(labels)):
            if density[index_i] < density[index_j] and distance[index_i, index_j] < Min:
                Min = distance[index_i, index_j]
        distance_higherDensity[index_i] = Min
        list0.append(ScanClass((features[index_i][0], features[index_i][1]), density[index_i], distance_higherDensity[index_i]))
		
    # sort by gamma = rho * delta, descending: the cluster centers come first
    sorted_core = sorted(list0, key=lambda a: a.judge, reverse=True)

    # sort all points by descending density (ScanClass comparisons are inverted)
    list0 = sorted(list0)

    # the two points with the largest gamma are taken as the cluster centers
    sorted_core[0].belong = 1
    sorted_core[1].belong = 2
    list1 = [sorted_core[0]]
    list2 = [sorted_core[1]]

    # assign the remaining points in order of decreasing density: each point
    # joins the cluster that currently contains its nearest member
    for index_i in range(len(labels)):
        if list0[index_i] is sorted_core[0] or list0[index_i] is sorted_core[1]:
            continue        # skip the two cluster centers themselves
        Min1 = math.hypot(list0[index_i].position[0] - list1[0].position[0], list0[index_i].position[1] - list1[0].position[1])
        Min2 = math.hypot(list0[index_i].position[0] - list2[0].position[0], list0[index_i].position[1] - list2[0].position[1])
        for i in list1:
            dis = math.hypot(list0[index_i].position[0] - i.position[0], list0[index_i].position[1] - i.position[1])
            if dis < Min1:
                Min1 = dis

        for i in list2:
            dis = math.hypot(list0[index_i].position[0] - i.position[0], list0[index_i].position[1] - i.position[1])
            if dis < Min2:
                Min2 = dis

        if Min1 < Min2:
            list0[index_i].belong = 1
            list1.append(list0[index_i])
        else:
            list0[index_i].belong = 2
            list2.append(list0[index_i])

    return list1, list2, cutoff
	

"""
make_blobs parameters:
n_samples: number of sample points, default 100
n_features: dimensionality of the data, default 2
centers: number of centres to generate data around, default 3
cluster_std: standard deviation of each cluster, float or sequence of floats, default 1.0
center_box: bounding box for the data once the centres are fixed, default (-10.0, 10.0)
shuffle: whether to shuffle the samples, default True
random_state: seed for the random number generator
"""
data, label = make_blobs(n_features=2, n_samples=100, centers=2, random_state=3, cluster_std=[0.5, 0.7])

list1, list2, cutoff = fit(data, label, 0.02)    # t = 0.02: dc is the 2% quantile of pairwise distances

# plot the raw data
x_values = [x[0] for x in data]
y_values = [x[1] for x in data]
plt.scatter(x_values, y_values)
plt.show()

# plot the two clusters in different colours
x_values1 = [x.position[0] for x in list1]
y_values1 = [x.position[1] for x in list1]
plt.scatter(x_values1, y_values1)

x_values2 = [x.position[0] for x in list2]
y_values2 = [x.position[1] for x in list2]
plt.scatter(x_values2, y_values2)
plt.show()

Result of a normal run:
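
For comparison, the paper's own assignment step is simpler than my nearest-member growth above: once the centers are chosen, each remaining point inherits, in order of decreasing density, the label of its nearest neighbour among the denser points. A sketch under my own naming (it assumes the densest point is one of the chosen centers):

import numpy as np

def assign(dist, rho, centers):
    """One-pass assignment in order of decreasing density."""
    order = np.argsort(-rho)                   # indices from densest to sparsest
    labels = np.full(len(rho), -1)
    for k, c in enumerate(centers):            # seed the chosen centers
        labels[c] = k
    for pos, i in enumerate(order):
        if labels[i] != -1:
            continue                           # already labelled (a center)
        denser = order[:pos]                   # processed earlier, hence denser and labelled
        nearest = denser[np.argmin(dist[i, denser])]
        labels[i] = labels[nearest]
    return labels

Because each point only ever looks at already-labelled denser points, a single pass suffices and no re-assignment is needed.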

I have now seen Wuhan at four in the morning (lol)

Off to bed. I'll probably skip the first class tomorrow.
