# -*- coding: UTF-8 -*-
"""
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@File : dbscan.py
@Contact : [email protected]
@License : (C)Copyright 2017-2019
@Author : ffzzyy
@Version : 0.1
@Modify Time : 2019/3/28 22:10
@Description
dbscan 算法实现
"""
import math
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.cluster as skc
def dist(a, b):
    """Return the Euclidean distance between two vectors.

    Args:
        a: First vector; any numeric array-like (ndarray, list, tuple).
        b: Second vector, with a shape compatible with ``a``.

    Returns:
        float: Square root of the sum of squared element-wise differences.
    """
    # Convert to float arrays first: plain lists/tuples do not support ``-``,
    # and unsigned or narrow integer dtypes would silently wrap around in the
    # subtraction/squaring and yield a wrong distance.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return math.sqrt(np.power(diff, 2).sum())
class dbscan():
    """Minimal DBSCAN (density-based clustering) implementation.

    Parameters
    ----------
    eps : float
        Neighbourhood radius. Two samples are neighbours when their
        Euclidean distance is at most ``eps``.
    min_samples : int
        Minimum neighbourhood size (the sample itself included) for a
        sample to count as a core object.

    Attributes
    ----------
    core_object_index : list of int
        Indices (into the training set) of the core objects found by the
        last call to ``fit``.
    _train_set : ndarray or None
        Training set given to the last ``fit`` call.
    labels : ndarray of int
        Cluster id per training sample; ``-1`` marks samples that were not
        assigned to any cluster. An empty list before ``fit`` is called.
    _k : int
        Id of the most recently created cluster, starting at 0; ``-1`` when
        no cluster exists. The number of clusters is therefore ``_k + 1``.
    components : list
        Initialised empty; not populated by this class.
    """

    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples
        self.core_object_index = []
        self._train_set = None
        self.labels = []
        self._k = -1
        self.components = []

    def fit(self, train_set):
        """Cluster ``train_set`` and store the result on the instance.

        Seeds are drawn with ``np.random.choice``, so the numbering of the
        clusters (and the cluster a border sample shared by several clusters
        ends up in) can differ between runs.

        Parameters
        ----------
        train_set : array-like
            Samples to cluster, one sample per row.

        Returns
        -------
        dbscan
            ``self``, so calls can be chained.
        """
        # Accept any array-like; an ndarray input is stored as-is.
        self._train_set = np.asarray(train_set)
        n_samples = self._train_set.shape[0]
        self.init_core_objects()
        # Explicit integer dtype so an empty training set still yields
        # integer labels.
        self.labels = np.full(n_samples, -1, dtype=int)
        self._k = -1

        unvisited = set(range(n_samples))
        unvisited_core = set(self.core_object_index)
        while unvisited_core:
            # Start a new cluster from a randomly chosen unvisited core object.
            seed = int(np.random.choice(list(unvisited_core)))
            unvisited_core.discard(seed)
            self._k += 1

            # Breadth-first expansion over density-reachable core objects.
            queue = deque([seed])
            while queue:
                current = queue.popleft()
                # Only samples that are still unlabelled can join the cluster.
                # No min_samples check is needed here: everything in the
                # queue is already known to be a core object.
                reachable = self.region_query(current, unvisited)
                if not reachable:
                    continue
                self.labels[reachable] = self._k
                unvisited.difference_update(reachable)
                # Newly reached core objects keep the expansion going.
                new_cores = unvisited_core.intersection(reachable)
                queue.extend(new_cores)
                unvisited_core -= new_cores
        return self

    def init_core_objects(self):
        """Recompute ``core_object_index`` for the current training set.

        A sample is a core object when its eps-neighbourhood (which contains
        the sample itself) holds at least ``min_samples`` samples. The list
        is rebuilt from scratch so repeated ``fit`` calls do not accumulate
        stale or duplicate indices.
        """
        self.core_object_index = [
            i
            for i in range(self._train_set.shape[0])
            if len(self.region_query(i)) >= self.min_samples
        ]

    def eps_neighbor(self, a, b):
        """Return True when vectors ``a`` and ``b`` are within ``eps``.

        The bound is inclusive (distance <= eps), which is the standard
        DBSCAN definition of the eps-neighbourhood.
        """
        return dist(a, b) <= self.eps

    def region_query(self, point_id, region_index=None):
        """Return the ids of the samples within ``eps`` of sample ``point_id``.

        Parameters
        ----------
        point_id : int
            Index of the query sample in the training set.
        region_index : iterable of int, optional
            Candidate sample indices to test. When omitted, every sample of
            the training set is tested.

        Returns
        -------
        list of int
            Indices of the candidates lying in the eps-neighbourhood.
        """
        if region_index is None:
            region_index = range(self._train_set.shape[0])
        center = self._train_set[point_id]
        return [
            i
            for i in region_index
            if self.eps_neighbor(center, self._train_set[i])
        ]
def load_watermelon_set(file_path, encoding='cp936'):
    """Load the watermelon 4.0 data set from a CSV file.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.
        encoding: Text encoding of the CSV. Defaults to ``'cp936'``, the
            encoding this loader has always used; pass e.g. ``'utf-8'`` for
            files saved in another encoding.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    return pd.read_csv(file_path, encoding=encoding)
def show_watermelon_scatter(X):
    """Display a scatter plot of the first two columns of ``X``.

    The x axis (column 0) is labelled as density and the y axis (column 1)
    as sugar content. The figure is shown and then closed.
    """
    # SimHei is configured so the Chinese title/axis labels can be rendered;
    # unicode_minus is switched off alongside it.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })
    first_column, second_column = X[:, 0], X[:, 1]
    plt.scatter(x=first_column, y=second_column)
    plt.xlabel("密度")
    plt.ylabel("含糖率")
    plt.title("西瓜数据集分布图")
    plt.show()
    plt.close()
def show_subplot(ax, X, labels, core_object_index, title=''):
    """Draw a clustering result on the matplotlib axes ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        X: 2-D array of samples; only the first two columns are plotted.
        labels: Cluster id per sample. Clusters are expected to be numbered
            ``0 .. n_clusters - 1``; ``-1`` marks outliers.
        core_object_index: Indices of the core objects in ``X``.
        title: Title of the subplot.
    """
    # Boolean masks and fancy indexing below need arrays, not lists.
    labels = np.asarray(labels)
    core_object_index = np.asarray(core_object_index, dtype=int)

    # Number of clusters: distinct labels, not counting the outlier label -1.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # One seaborn "deep" palette colour per cluster.
    colors = sns.color_palette("deep", n_colors=n_clusters_).as_hex()
    for cluster_id, color in enumerate(colors):
        members = X[labels == cluster_id]
        ax.scatter(x=members[:, 0], y=members[:, 1], s=50, c=color)

    # Outliers get their own marker.
    outliers = X[labels == -1]
    ax.scatter(x=outliers[:, 0], y=outliers[:, 1], s=50, marker="X", c="brown", label="离群点")

    # Tag every sample as X1, X2, ... next to its marker. The text is passed
    # positionally because the keyword was renamed from ``s`` to ``text`` in
    # newer matplotlib releases.
    for i, point in enumerate(X):
        ax.annotate("X{}".format(i + 1),
                    xy=(point[0], point[1]),
                    xytext=(-3, 3),
                    textcoords="offset points")

    ax.set_title(title)
    ax.set_xlabel("密度")
    ax.set_ylabel("含糖率")

    # Axis ranges: from 0 to 10% beyond the largest value on each axis.
    ax.set_xlim(0, X[:, 0].max() * 1.1)
    ax.set_ylim(0, X[:, 1].max() * 1.1)

    # Ring the core objects with a hollow red circle. facecolors='none' is
    # the supported way to request an unfilled marker (an empty ``c`` string
    # is rejected by current matplotlib).
    cores = X[core_object_index]
    ax.scatter(x=cores[:, 0], y=cores[:, 1], s=200, facecolors='none', edgecolors='r', label="核心对象")
    ax.legend()
def circle(ax, x, y, r, color='k', count=100):
    """Draw a circle outline on ``ax`` as a closed polyline.

    Args:
        ax: Object with a matplotlib-style ``plot(xs, ys, **kwargs)`` method.
        x: X coordinate of the centre.
        y: Y coordinate of the centre.
        r: Radius.
        color: Line colour passed to ``plot`` as ``c``.
        count: Number of straight segments used to approximate the circle.
    """
    segments = max(int(count), 0)
    # Sample count + 1 angles including both 0 and 2*pi, so the last point
    # coincides with the first and the outline has no gap.
    angles = np.linspace(0.0, 2.0 * np.pi, segments + 1)
    ax.plot(x + r * np.cos(angles), y + r * np.sin(angles), c=color, linewidth=0.3)
def main(file_path="西瓜4.0.csv", eps=0.11, min_samples=5):
    """Cluster the watermelon data set and plot the result.

    Runs the hand-written ``dbscan`` class and scikit-learn's ``DBSCAN`` with
    the same parameters and draws both results in one figure, one subplot
    each, for comparison.

    Args:
        file_path: CSV file holding the data set.
        eps: Neighbourhood radius used by both implementations.
        min_samples: Core-object threshold used by both implementations.
    """
    df = load_watermelon_set(file_path)
    print(df)
    train_set = np.array(df)

    # Hand-written implementation.
    db = dbscan(eps=eps, min_samples=min_samples)
    db.fit(train_set)
    print("核心对象为:")
    print(db.core_object_index)
    print("样本个数为{0},分簇为:".format(len(db.labels)))
    print(db.labels)
    print("分簇个数为:", db._k + 1)

    # Two stacked subplots: own result on top, scikit-learn's below.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['figure.figsize'] = (8, 6)
    fig = plt.figure()
    ax1 = fig.add_subplot(2, 1, 1)
    ax2 = fig.add_subplot(2, 1, 2)
    show_subplot(ax1, db._train_set, db.labels, db.core_object_index, "自己编码dbscan结果")

    # Reference result from scikit-learn, using the very same parameters.
    sk_db = skc.DBSCAN(eps=eps, min_samples=min_samples).fit(train_set)
    show_subplot(ax2, train_set, sk_db.labels_, sk_db.core_sample_indices_, "调用mklearn聚类结果")
    plt.show()
    plt.close()


if __name__ == '__main__':
    main()