DBSCAN(Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法,python中的sklearn.cluster库可以实现DBSCAN聚类。
DBSCAN(eps=0.5,
min_samples=5,
metric='euclidean',
mtric_params=None,
algorithm='auto',
leaf_size=30,
p=None,
n_jobs=1)
参数介绍
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.cluster import DBSCAN
iris = datasets.load_iris()
X = iris.data[:, :4] # #表示我们只取特征空间中的4个维度
estimator = DBSCAN(eps=0.4,min_samples=9) # 构造聚类器
estimator.fit(X) # 聚类
label_pred2 = estimator.labels_ # 获取聚类标签
# 绘制结果
x0 = X[label_pred2 == 0]
x1 = X[label_pred2 == 1]
x2 = X[label_pred2 == 2]
plt.scatter(x0[:, 0], x0[:, 1], c="red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], c="green", marker='*', label='label1')
plt.scatter(x2[:, 0], x2[:, 1], c="blue", marker='+', label='label2')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()
DBSCAN()中有两个参数非常重要,即,邻域半径eps和核心点邻域内的最少样本数min_samples。
对于eps的选取我们可以利用k-distance碎石图来实现:
min_samples的取值为上述k值加1,即:min_samples=k + 1 。
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
np.random.seed(2021)
data = np.ones([1005,2])
data[:1000] = make_moons(n_samples=1000,noise=0.05,random_state=2022)[0]
data[1000:] = [[-1,-0.5],
[-0.5,-1],
[-1,1.5],
[2.5,-0.5],
[2,1.5]]
print(data.shape)
plt.scatter(data[:,0],data[:,1],color="c")
plt.show()
def select_MinPts(data,k):
k_dist = []
for i in range(data.shape[0]):
dist = (((data[i] - data)**2).sum(axis=1)**0.5)
dist.sort()
k_dist.append(dist[k])
return np.array(k_dist)
k = 3 # 此处k取 2*2 -1
k_dist = select_MinPts(data,k)
k_dist.sort()
plt.plot(np.arange(k_dist.shape[0]),k_dist[::-1])
dbscan_model = DBSCAN(eps=0.1,min_samples=k+1)
label = dbscan_model.fit_predict(data)
class_1 = []
class_2 = []
noise = []
for index,value in enumerate(label):
if value == 0:
class_1.append(index)
elif value == 1:
class_2.append(index)
elif value == -1:
noise.append(index)
plt.scatter(data[class_1,0],data[class_1,1],color="g",label="class 1")
plt.scatter(data[class_2,0],data[class_2,1],color="b",label = "class 2")
plt.scatter(data[noise,0],data[noise,1],color="r",label = "noise")
plt.legend()
plt.show()