代码
- 算法理论参考:【机器学习】密度聚类算法之OPTICS
import numpy as np
import matplotlib.pyplot as plt
import operator
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
def plotReachability(data,eps):
    """Draw a reachability plot with a horizontal threshold line.

    Args:
        data: Sequence of reachability distances, already arranged in the
            order in which they should appear along the x axis.
        eps: Height of the horizontal reference line drawn across the plot.
    """
    point_count = len(data)
    plt.figure()
    # Reachability curve: one value per position in the supplied order.
    plt.plot(range(point_count), data)
    # Threshold line spanning the full width of the curve.
    plt.plot([0, point_count], [eps, eps])
    plt.show()
def plotFeature(data,labels):
    """Scatter-plot the first two feature columns, one color per label.

    Every distinct value found in ``labels`` is drawn, in ascending order.
    The color is ``scatterColors[label % len(scatterColors)]``; because
    Python's modulo is non-negative, the label -1 maps to the last color and
    labels beyond the palette size wrap around and reuse colors.

    Args:
        data: 2-D array-like; columns 0 and 1 are used as x and y.
        labels: 1-D array-like of integer labels, one per row of ``data``.
    """
    # Coerce to arrays so that plain lists work with the boolean mask below
    # (``list == int`` would evaluate to a single False and plot nothing).
    data = np.asarray(data)
    labels = np.asarray(labels)
    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Iterate over the labels that actually occur instead of assuming they
    # form the contiguous range -1 .. len(set(labels)) - 1.
    for label in np.unique(labels):
        colorSytle = scatterColors[int(label) % len(scatterColors)]
        subCluster = data[labels == label]
        ax.scatter(subCluster[:, 0], subCluster[:, 1], c=colorSytle, s=12)
    plt.show()
class OPTICS1(object):
    """OPTICS-style ordering and cluster extraction on a dense distance matrix.

    ``train`` computes a processing order and a reachability distance for
    every sample; ``predict`` then cuts that ordering at ``eps`` to assign
    cluster labels (-1 marks samples left unassigned).

    Attributes set by ``__init__``:
        data: The input samples as an array.
        disMat: Full pairwise Euclidean distance matrix.
        number_sample: Number of samples (rows of ``data``).
        eps, minPts: The parameters passed in.
        core_distances: Per sample, the ``minPts``-th smallest entry of its
            distance row (the row includes the zero distance to itself).
        core_points_index: Indices of samples having at least ``minPts``
            samples (itself included) within ``eps``.
    """

    def __init__(self,data,eps=np.inf,minPts=15):
        """Precompute distances, core distances and the core-point index.

        Args:
            data: 2-D array-like, one sample per row.
            eps: Distance threshold used to decide core points and, in
                ``predict``, to cut the reachability ordering.
            minPts: Neighbourhood size (counting the sample itself).
        """
        # np.asarray returns an ndarray input unchanged and lets nested
        # lists be used as well.
        self.data = np.asarray(data)
        self.disMat = self.compute_squared_EDM(self.data)
        self.number_sample = self.data.shape[0]
        self.eps = eps
        self.minPts = minPts
        # Each sorted row starts with the sample's zero distance to itself,
        # so column minPts - 1 holds the minPts-th smallest distance.
        self.core_distances = np.sort(self.disMat, axis=1)[:, minPts - 1]
        within_eps = np.count_nonzero(self.disMat <= self.eps, axis=1)
        self.core_points_index = np.flatnonzero(within_eps >= self.minPts)

    def compute_squared_EDM(self,X):
        """Return the square pairwise Euclidean distance matrix of ``X``.

        Despite the method name, the distances are NOT squared: ``pdist`` is
        called with ``metric='euclidean'``.
        """
        return squareform(pdist(X, metric='euclidean'))

    def train(self):
        """Compute ``self.orders`` and ``self.reach_dists``.

        Starts from the first core point and repeatedly processes the
        unprocessed sample with the smallest reachability distance. Because
        ``updateSeeds`` considers every unprocessed sample (it does not
        filter by ``eps``), the resulting order contains every sample.

        If no sample qualifies as a core point, the ordering starts from
        sample 0 instead of failing; ``predict`` then labels everything -1.
        """
        self.reach_dists = np.full((self.number_sample,), np.nan)
        self.orders = []
        if len(self.core_points_index) > 0:
            start_core_point = int(self.core_points_index[0])
        else:
            # No core point exists for this eps/minPts; fall back to the
            # first sample so that an ordering is still produced.
            start_core_point = 0
        # -1 = not processed yet, 1 = processed.
        isProcess = np.full((self.number_sample,), -1)
        isProcess[start_core_point] = 1
        # The starting sample has no predecessor; its own core distance is
        # used as its reachability value.
        self.reach_dists[start_core_point] = self.core_distances[start_core_point]
        self.orders.append(start_core_point)
        seeds = self.updateSeeds({}, start_core_point, isProcess)
        while seeds:
            # Smallest reachability first; on ties min() returns the first
            # key in dict order, the same choice a stable sort would make.
            nextId = min(seeds, key=seeds.get)
            del seeds[nextId]
            isProcess[nextId] = 1
            self.orders.append(nextId)
            seeds = self.updateSeeds(seeds, nextId, isProcess)

    def updateSeeds(self,seeds, core_PointId, isProcess):
        """Relax reachability distances from ``core_PointId``.

        For every unprocessed sample, the candidate reachability is
        ``max(core distance of core_PointId, distance to core_PointId)``.
        It is recorded in ``self.reach_dists`` and in ``seeds`` when the
        sample has no reachability yet (NaN) or the candidate is smaller.

        Args:
            seeds: Dict mapping sample index to its current reachability;
                updated in place.
            core_PointId: Index of the sample being expanded.
            isProcess: Per-sample flags, -1 for unprocessed samples.

        Returns:
            The same ``seeds`` dict.
        """
        core_dist = self.core_distances[core_PointId]
        candidate = np.maximum(core_dist, self.disMat[core_PointId])
        unprocessed = np.asarray(isProcess) == -1
        current = self.reach_dists
        # Comparisons against NaN are False; the isnan term covers samples
        # that have not been reached before.
        with np.errstate(invalid="ignore"):
            improved = unprocessed & (np.isnan(current) | (candidate < current))
        self.reach_dists[improved] = candidate[improved]
        # Ascending index order keeps the dict insertion order deterministic.
        for idx in np.flatnonzero(improved).tolist():
            seeds[idx] = candidate[idx]
        return seeds

    def predict(self):
        """Assign ``self.labels`` by cutting the ordering at ``eps``.

        Must be called after ``train``. Walking ``self.orders``: a sample
        whose reachability is within ``eps`` joins the current cluster;
        otherwise, if its core distance is within ``eps`` it opens the next
        cluster, and if not it keeps the label -1.
        """
        clusterId = 0
        self.labels = np.full((self.number_sample,), -1)
        for i in self.orders:
            if self.reach_dists[i] <= self.eps:
                self.labels[i] = clusterId
            elif self.core_distances[i] <= self.eps:
                clusterId += 1
                self.labels[i] = clusterId
if __name__ == '__main__':
    # One radius drives both the clustering and the threshold line drawn on
    # the reachability plot.
    radius = 3
    samples = np.loadtxt("data/cluster.csv", delimiter=",")
    model = OPTICS1(samples, radius, 30)
    model.train()
    model.predict()
    # Reachability distances rearranged into processing order.
    ordered_reachability = model.reach_dists[model.orders]
    plotReachability(ordered_reachability, radius)
    plotFeature(samples, model.labels)
结果图
参考链接
- https://blog.csdn.net/LoveCarpenter/article/details/85049135#32_70
- 注意:该链接中的原始代码似乎存在问题,上文给出的代码是我在其基础上改写后的版本。