I have been busy with my thesis lately; the overall framework is now settled, and what remains is refining the details and adding novelty. Over the coming weeks I will occasionally share my experience with the various methods I tried in the thesis. This post focuses on point cloud clustering: I experimented with three approaches, DBSCAN, region growing, and K-means, and I summarize each in turn below.
1. DBSCAN --- clustering by point density
Two values are set up front: minPts and a radius r.
Core point: if a point's r-neighborhood contains at least minPts points, we call it a core point.
Directly density-reachable: if a point p lies in the r-neighborhood of a core point q, then p is directly density-reachable from q.
Density-reachable: given a sequence of points q0, q1, q2, ..., qk in which every point is directly density-reachable from its predecessor, qk is density-reachable from q0. (Note that this relation is not symmetric: the intermediate points of the chain must all be core points.)
import numpy as np

def vector_distance_v2(v):
    """
    Subtract every pair of elements within a single vector, producing a
    difference matrix whose upper and lower triangles are negatives of each other.
    :param v: a 1-D array or a 1-D list
    :return: n * n difference matrix
    """
    if type(v) is list:
        v = np.array(v)
    result = [v[i] - v for i in range(len(v))]
    return np.vstack(result)
def point_distance(points):
    """
    Compute the pairwise distances between all points.
    :param points: points detected after ground segmentation, n * 4
    :return: n * n distance matrix
    """
    d2 = vector_distance_v2(points[:, 0])**2 + \
         vector_distance_v2(points[:, 1])**2 + \
         vector_distance_v2(points[:, 2])**2
    return np.sqrt(d2)
# @profile
def DBSCAN_points(points, eps=2., Minpts=15):
    """
    Density-based point cloud clustering.
    :param points: n * 4 point array; the distance matrix is computed internally
    :param eps: neighborhood radius threshold
    :param Minpts: minimum number of neighbors required for a core point
    :return: clustering result as a nested list; each sublist holds the point indices of one cluster
    """
    # compute the pairwise distance matrix first
    print('DBSCAN clustering:', points.shape)
    d_bbox = point_distance(points)
    # initialize the core point set T, cluster count k, cluster list C, and unvisited set P
    T = set()
    k = 0
    C = []
    P = set(range(d_bbox.shape[0]))
    for d in range(d_bbox.shape[0]):
        if np.sum(d_bbox[d, :] <= eps) >= Minpts:
            T.add(d)  # initial core points
    print('Len T: ', len(T))
    # start clustering
    while len(T):
        P_old = P
        o = list(T)[np.random.randint(0, len(T))]  # pick a random core point from T
        P = P - set([o])
        Q = [o]
        while len(Q):
            q = Q[0]
            Nq = np.where(d_bbox[q, :] <= eps)[0]  # eps-neighborhood of q
            if len(Nq) >= Minpts:
                S = P & set(Nq)  # unvisited points directly density-reachable from q
                Q += list(S)     # queue them so their own neighborhoods get expanded too
                P = P - S        # mark them as visited
            Q.remove(q)  # q has been expanded, drop it
        k += 1
        Ck = P_old - P  # everything removed from P in this round forms one cluster
        T = T - Ck      # remove the core points absorbed into this cluster
        C.append(Ck)    # record the cluster
    # points never assigned to any cluster (still in P) are noise
    return C
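A minimal usage sketch for DBSCAN_points, assuming a synthetic n * 4 array (x, y, z, intensity) with two well-separated blobs; the function should return two clusters:

import numpy as np

# two synthetic blobs of 50 points each; columns: x, y, z, intensity
rng = np.random.default_rng(0)
blob1 = np.hstack([rng.normal(0.0, 0.3, (50, 3)), np.ones((50, 1))])
blob2 = np.hstack([rng.normal(10.0, 0.3, (50, 3)), np.ones((50, 1))])
points = np.vstack([blob1, blob2])

clusters = DBSCAN_points(points, eps=1.0, Minpts=5)
print('number of clusters:', len(clusters))  # expected: 2
for i, c in enumerate(clusters):
    print('cluster', i, 'size:', len(c))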
As the Python code above shows, several parameters have to be set by hand, and it usually takes multiple runs to tune them well. Here is an adaptive-parameter variant of DBSCAN:
import math
import copy
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
def loadDataSet(fileName, splitChar='\t'):
    """
    Input: file name
    Output: data set
    Description: read the data set from a file
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet
def dist(a, b):
    """
    Compute the distance between two sample points.
    :param a: sample point
    :param b: sample point
    :return: distance between the two sample points
    """
    return math.sqrt(math.pow(a[0]-b[0], 2) + math.pow(a[1]-b[1], 2))
def returnDk(matrix, k):
    """
    Collect each point's k-th nearest distance.
    :param matrix: row-sorted distance matrix
    :param k: k-th nearest
    :return: list of k-th nearest distances
    """
    Dk = []
    for i in range(len(matrix)):
        Dk.append(matrix[i][k])
    return Dk
def returnDkAverage(Dk):
    """
    Average of the k-th nearest distance list.
    :param Dk: list of k-th nearest distances
    :return: mean of Dk
    """
    total = 0
    for i in range(len(Dk)):
        total = total + Dk[i]
    return total / len(Dk)
def CalculateDistMatrix(dataset):
    """
    Compute the distance matrix.
    :param dataset: data set
    :return: distance matrix
    """
    DistMatrix = [[0 for j in range(len(dataset))] for i in range(len(dataset))]
    for i in range(len(dataset)):
        for j in range(len(dataset)):
            DistMatrix[i][j] = dist(dataset[i], dataset[j])
    return DistMatrix
def returnEpsCandidate(dataSet):
    """
    Build the list of Eps candidates.
    :param dataSet: data set
    :return: Eps candidate list (the average k-th nearest distance for each k)
    """
    DistMatrix = CalculateDistMatrix(dataSet)
    tmp_matrix = copy.deepcopy(DistMatrix)
    for i in range(len(tmp_matrix)):
        tmp_matrix[i].sort()
    EpsCandidate = []
    for k in range(1, len(dataSet)):
        Dk = returnDk(tmp_matrix, k)
        DkAverage = returnDkAverage(Dk)
        EpsCandidate.append(DkAverage)
    return EpsCandidate
def returnMinptsCandidate(DistMatrix, EpsCandidate):
    """
    Build the list of Minpts candidates.
    :param DistMatrix: distance matrix
    :param EpsCandidate: Eps candidate list
    :return: Minpts candidate list (average neighbor count under each Eps)
    """
    MinptsCandidate = []
    for k in range(len(EpsCandidate)):
        tmp_eps = EpsCandidate[k]
        tmp_count = 0
        for i in range(len(DistMatrix)):
            for j in range(len(DistMatrix[i])):
                if DistMatrix[i][j] <= tmp_eps:
                    tmp_count = tmp_count + 1
        MinptsCandidate.append(tmp_count / len(DistMatrix))
    return MinptsCandidate
def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    """
    Compute the number of clusters for each (Eps, Minpts) candidate pair.
    :param dataset: data set
    :param EpsCandidate: Eps candidate list
    :param MinptsCandidate: Minpts candidate list
    :return: list of cluster counts
    """
    np_dataset = np.array(dataset)  # convert dataset to a numpy array
    ClusterNumberList = []
    for i in range(len(EpsCandidate)):
        # min_samples must be an integer; DBSCAN labels start at 0 (noise is -1),
        # so the cluster count is max(labels_) + 1
        clustering = DBSCAN(eps=EpsCandidate[i], min_samples=int(MinptsCandidate[i])).fit(np_dataset)
        num_clustering = max(clustering.labels_) + 1
        ClusterNumberList.append(num_clustering)
    return ClusterNumberList
if __name__ == '__main__':
    dataSet = loadDataSet('', splitChar=',')  # data file path goes here
    EpsCandidate = returnEpsCandidate(dataSet)
    DistMatrix = CalculateDistMatrix(dataSet)
    MinptsCandidate = returnMinptsCandidate(DistMatrix, EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet, EpsCandidate, MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)
    file = open('DBSCAN3_cluster_result.txt', 'w')
    for i in range(len(ClusterNumberList)):
        file.write(str(ClusterNumberList[i]) + '\n')
    file.close()
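The matplotlib import above otherwise goes unused; a common companion step when narrowing down the candidates is to plot the average k-distance curve and look for its elbow. A minimal sketch, continuing inside the __main__ block above:

    # sketch: plot the Eps candidates (average k-th nearest distance);
    # the elbow of this curve suggests a reasonable Eps
    plt.plot(range(1, len(EpsCandidate) + 1), EpsCandidate)
    plt.xlabel('k')
    plt.ylabel('average k-th nearest distance')
    plt.title('k-distance curve for choosing Eps')
    plt.show()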
2. PCL region growing --- see the previous article
3. K-means --- unsupervised clustering
import csv
import numpy as np

def kmean(x, k, maxtimes):
    m, n = np.shape(x)
    # build a zero matrix with one extra column to hold the cluster label
    dataset = np.zeros([m, n+1])
    dataset[:, :-1] = x
    # initialize k centroids: pick k random rows, or simply the first k rows;
    # copy them, otherwise they remain views into dataset and get overwritten
    # when the labels are updated
    # middle = np.copy(dataset[np.random.randint(m, size=k), :])
    middle = np.copy(dataset[0:k, :])
    # assign a label to each chosen centroid
    middle[:, -1] = np.arange(1, k+1)
    times = 0
    oldmiddle = None
    # iterate until the centroids stop moving or maxtimes is reached
    while not shouldstop(oldmiddle, middle, times, maxtimes):
        print('times:', times)
        print('middle:', middle)
        oldmiddle = np.copy(middle)
        times = times + 1
        # relabel every point according to the current centroids
        update(dataset, middle)
        # recompute the centroids
        middle = getmiddles(dataset, k)
    return dataset
def shouldstop(oldmiddle,middle,times,maxtimes):
if times > maxtimes:
return True
return np.array_equal(oldmiddle,middle)
def update(dataset, middle):
    m, n = dataset.shape
    for i in range(0, m):
        dataset[i, -1] = getLabelFromClosestCentroid(dataset[i, :-1], middle)

# find the centroid nearest to a point and return that centroid's label
def getLabelFromClosestCentroid(datasetRow, middle):
    label = middle[0, -1]
    minDist = np.linalg.norm(datasetRow - middle[0, :-1])
    # np.linalg.norm(a - b) computes the distance between points a and b;
    # if a or b is a list, convert it first with np.array()
    for i in range(1, middle.shape[0]):
        dist = np.linalg.norm(datasetRow - middle[i, :-1])
        if dist < minDist:
            minDist = dist
            label = middle[i, -1]
    return label
def getmiddles(dataset, k):
    result = np.zeros((k, dataset.shape[1]))
    for i in range(1, k+1):
        oneCluster = dataset[dataset[:, -1] == i, :-1]
        # note: np.mean returns NaN here if a cluster ends up empty
        result[i-1, :-1] = np.mean(oneCluster, axis=0)
        result[i-1, -1] = i
    return result
file = open(r'\boundary.csv', 'r')
reader = csv.reader(file)
reader = list(reader)
m, n = np.shape(reader)
# convert the string fields to float, assuming the CSV columns are x, y, z, intensity
for i in range(0, m):
    for j in range(0, 4):
        reader[i][j] = float(reader[i][j])
# take the z value and the intensity as the two clustering features
list1 = np.zeros([m, 2])
for i in range(0, m):
    for j in range(2, 4):
        list1[i][j-2] = reader[i][j]
result = kmean(list1, 4, 10)  # k = 4 clusters, at most 10 iterations
print('result:', result[0])
print(reader[0])
# append the cluster label as a fifth column behind x, y, z, intensity
reader0 = np.zeros([m, 5])
for i in range(0, m):
    for j in range(0, 4):
        reader0[i][j] = reader[i][j]
for i in range(0, m):
    reader0[i][-1] = int(result[i][-1])
print(reader0)
# write each cluster's points (x, y, z, intensity) to its own file
w1 = open("1.txt", "w")
w2 = open("2.txt", "w")
w3 = open("3.txt", "w")
w4 = open("4.txt", "w")
for i in range(m):
    if reader0[i][-1] == 1:
        w1.write("%s %s %s %s\n" % (reader0[i][0], reader0[i][1], reader0[i][2], reader0[i][3]))
    if reader0[i][-1] == 2:
        w2.write("%s %s %s %s\n" % (reader0[i][0], reader0[i][1], reader0[i][2], reader0[i][3]))
    if reader0[i][-1] == 3:
        w3.write("%s %s %s %s\n" % (reader0[i][0], reader0[i][1], reader0[i][2], reader0[i][3]))
    if reader0[i][-1] == 4:
        w4.write("%s %s %s %s\n" % (reader0[i][0], reader0[i][1], reader0[i][2], reader0[i][3]))
w1.close()
w2.close()
w3.close()
w4.close()
file.close()
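As a sanity check, the hand-rolled kmean can be compared with scikit-learn's KMeans on the same feature matrix; a minimal sketch, assuming scikit-learn is available (its labels run 0..k-1 rather than 1..k, so only the groupings should match, not the label values):

from sklearn.cluster import KMeans

# cluster the same 2-column feature matrix with sklearn for comparison
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(list1)
print('sklearn cluster sizes:', np.bincount(km.labels_))
print('kmean cluster sizes:  ', [int(np.sum(result[:, -1] == i)) for i in range(1, 5)])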