csv文件链接: 百度网盘
链接:https://pan.baidu.com/s/1NJmdbm5-3wh6EQUiHEOByA
提取码:6666
# 记得第一行要加类别标签
密度,含糖率
0.697,0.46
0.774,0.376
0.634,0.264
0.608,0.318
0.556,0.215
0.403,0.237
0.481,0.149
0.437,0.211
0.666,0.091
0.243,0.267
0.245,0.057
0.343,0.099
0.639,0.161
0.657,0.198
0.36,0.37
0.593,0.042
0.719,0.103
0.359,0.188
0.339,0.241
0.282,0.257
0.748,0.232
0.714,0.346
0.483,0.312
0.478,0.437
0.525,0.369
0.751,0.489
0.532,0.472
0.473,0.376
0.725,0.445
0.446,0.459
代码实现:
# ssy_4_DBSCANimport math
import numpy as np
import pandas as pd
import pylab as pl
xigua = pd.read_csv('xigua4.csv')
dataset=[(i[0],i[1]) for i in xigua.values]
#计算欧几里得距离,a,b分别为两个元组
def dist(a, b):
return math.sqrt(math.pow(a[0]-b[0], 2)+math.pow(a[1]-b[1], 2))
#算法模型
def DBSCAN(D, e, Minpts):
#初始化核心对象集合T,聚类个数k,聚类集合C, 未访问集合P,
T = set()
k = 0
C = []
P = set(D)
for d in D:
if len([ i for i in D if dist(d, i) <= e]) >= Minpts:
T.add(d)
#开始聚类
#所有的核心的 有多个类
while len(T):
P_old = P
#选取一个核心点
o = list(T)[np.random.randint(0, len(T))]
#把核心点从未选取中取出
P = P - set(o)
Q = []
Q.append(o)
while len(Q):
q = Q[0]
#核心点周围的点
Nq = [i for i in D if dist(q, i) <= e]
#核心点和周围的点都记录下来
if len(Nq) >= Minpts:
S = P & set(Nq)
Q += (list(S))
P = P - S
Q.remove(q)
k += 1
Ck = list(P_old - P)
T = T - set(Ck)
C.append(Ck)
return C
#画图 shanyou_4_DBSCAN
def draw(C):
colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
for i in range(len(C)):
coo_X = [] #x坐标列表
coo_Y = [] #y坐标列表
for j in range(len(C[i])):
coo_X.append(C[i][j][0])
coo_Y.append(C[i][j][1])
pl.scatter(coo_X, coo_Y, marker='o', color=colValue[i%len(colValue)], label=i+1)
pl.legend(loc='upper left')
pl.show()
C = DBSCAN(dataset, 0.11, 5)
draw(C)