基于密度的聚类算法OPTICS的关键在于得到样本点的聚类顺序图(即可达距离图)。相比于另一种密度聚类算法DBSCAN,OPTICS对输入参数eps和MinPts不敏感。参数eps和MinPts分别表示邻域半径和最小样本数:若某个样本点在半径eps范围内的邻居点数量大于等于MinPts,则称该样本点为核心点。本文默认eps为正无穷,且MinPts不超过总样本数,因此任何一个样本点都可以充当核心点。下面基于这一情形,用Python实现OPTICS并绘制其密度聚类顺序图。
# 导入所需包
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
class Object:
    """A 2-D sample point together with its OPTICS bookkeeping state."""

    def __init__(self, x, y):
        """Create an unprocessed point at coordinates ``(x, y)``."""
        self.x = x
        self.y = y
        # Flag flipped by the clustering driver once the point is ordered.
        self.Processed = False
        # Both distances hold the string sentinel 'UNDEFINED' until assigned.
        self.reachability_distance = 'UNDEFINED'
        self.core_distance = 'UNDEFINED'

    def dist(self, obj):
        """Return the Euclidean distance between this point and ``obj``."""
        return sqrt((self.x - obj.x) ** 2 + (self.y - obj.y) ** 2)

    def setCoreDistance(self, MinPts, SetOfObjects):
        """Return the MinPts-th smallest distance from this point.

        Distances are measured to every element of ``SetOfObjects``; if this
        point is itself in the collection, its zero self-distance is counted.
        Despite the name, the value is only returned: the caller is expected
        to store it in ``core_distance``.

        Args:
            MinPts: 1-based rank of the neighbour distance to return.
            SetOfObjects: iterable of objects accepted by ``dist``.

        Raises:
            ValueError: if ``MinPts`` is smaller than 1.
            IndexError: if ``MinPts`` exceeds the number of objects.
        """
        # Without this guard a non-positive MinPts would turn into a negative
        # index and silently return a distance from the far end of the list.
        if MinPts < 1:
            raise ValueError("MinPts must be at least 1, got %r" % (MinPts,))
        distances = sorted(self.dist(other) for other in SetOfObjects)
        if MinPts > len(distances):
            raise IndexError(
                "MinPts (%r) exceeds the number of objects (%d)"
                % (MinPts, len(distances))
            )
        return distances[MinPts - 1]
class OPTICS:
    """Build the OPTICS cluster ordering of 2-D samples with eps = infinity.

    The whole ordering is computed inside the constructor. Afterwards
    ``OrderedFile`` holds the sample objects in visiting order, each with its
    ``core_distance`` and ``reachability_distance`` filled in. The object that
    starts an expansion keeps the 'UNDEFINED' reachability sentinel.
    """

    def __init__(self, X, MinPts):
        """Run the ordering over ``X``.

        Args:
            X: array-like of samples, one row per sample. Only the first two
                columns of each row are used as the x and y coordinates.
            MinPts: 1-based rank of the neighbour distance used as the core
                distance; must not exceed the number of samples.

        Raises:
            ValueError: if ``MinPts`` is smaller than 1.
            IndexError: propagated from the core-distance lookup when
                ``MinPts`` exceeds the number of samples.
        """
        # A non-positive MinPts would otherwise index the sorted distances
        # from the end and yield a meaningless ordering.
        if MinPts < 1:
            raise ValueError("MinPts must be at least 1, got %r" % (MinPts,))
        # asarray leaves an ndarray untouched and lets plain nested lists in.
        self.X = np.asarray(X)
        self.MinPts = MinPts
        self.SetOfObjects = []  # objects wrapping the original samples
        self.OrderedFile = []   # objects in the order they were processed
        self.OrderSeeds = []    # pending objects, sorted by reachability

        # Wrap every sample row in a bookkeeping object.
        for row in self.X:
            self.SetOfObjects.append(Object(row[0], row[1]))

        # With an infinite radius the first expansion reaches every object;
        # the loop still guards against anything left unprocessed.
        for obj in self.SetOfObjects:
            if not obj.Processed:
                self.ExpandClusterOrder(obj, self.MinPts)

    def OrderSeedsUpdate(self, CenterObject):
        """Update reachability of unprocessed objects as seen from CenterObject.

        Objects seen for the first time are appended to ``OrderSeeds``; objects
        already pending only have their reachability lowered when the new value
        is smaller. ``OrderSeeds`` is re-sorted by reachability afterwards.
        """
        c_dist = CenterObject.core_distance
        for obj in self.SetOfObjects:
            if obj.Processed:
                continue
            new_r_dist = max(c_dist, CenterObject.dist(obj))
            if obj.reachability_distance == 'UNDEFINED':
                obj.reachability_distance = new_r_dist
                self.OrderSeeds.append(obj)
            elif new_r_dist < obj.reachability_distance:
                obj.reachability_distance = new_r_dist
        # list.sort is stable, so equal reachabilities keep their queue order.
        self.OrderSeeds.sort(key=lambda seed: seed.reachability_distance)

    def _process(self, obj, MinPts):
        """Mark ``obj`` processed, record it, and refresh the seed list."""
        obj.Processed = True
        obj.core_distance = obj.setCoreDistance(MinPts, self.SetOfObjects)
        self.OrderedFile.append(obj)
        self.OrderSeedsUpdate(obj)

    def ExpandClusterOrder(self, obj, MinPts):
        """Process ``obj`` and then every seed, nearest reachability first."""
        self._process(obj, MinPts)
        while self.OrderSeeds:
            # The list is kept sorted, so the head has the smallest
            # reachability distance.
            self._process(self.OrderSeeds.pop(0), MinPts)
# Test samples: six Gaussian blobs, each given as (centre, spread).
np.random.seed(0)
n_points_per_cluster = 250
cluster_specs = [
    ([-5, -2], .8),
    ([4, -1], .1),
    ([1, -2], .2),
    ([-2, 3], .3),
    ([3, -2], 1.6),
    ([5, 6], 2),
]
# Blobs are drawn in the listed order so the seeded random stream is consumed
# exactly as C1, C2, ..., C6.
C1, C2, C3, C4, C5, C6 = [
    center + spread * np.random.randn(n_points_per_cluster, 2)
    for center, spread in cluster_specs
]
X = np.vstack((C1, C2, C3, C4, C5, C6))

# OPTICS density clustering
OrderedFile = OPTICS(X, 20).OrderedFile

# Reachability plot
r_dis_list = [point.reachability_distance for point in OrderedFile]
# The first ordered object is skipped: it starts the expansion and never
# receives a numeric reachability distance.
plotted_distances = r_dis_list[1:]
plot_positions = list(range(len(plotted_distances)))
plt.figure(figsize=(15, 5))
plt.scatter(plot_positions, plotted_distances);