python实现DBSCAN 算法

算法介绍参照此文章

DBSCAN 算法

核心思路:

  • 首先任意选取一个点,然后找到到这个点距离小于等于 eps 的所有点。如果距起始点的距离在 eps 之内的数据点个数小于 min_samples,那么这个点被标记为噪声。如果距离在 eps 之内的数据点个数大于等于 min_samples,则这个点被标记为核心样本,并被分配一个新的簇标签。
  • 然后访问该点的所有邻居(在距离 eps 以内)。如果它们还没有被分配一个簇,那么就将刚刚创建的新的簇标签分配给它们。如果它们是核心样本,那么就依次访问其邻居,以此类推。簇逐渐增大,直到在簇的 eps 距离内没有更多的核心样本为止。
  • 选取另一个尚未被访问过的点,并重复相同的过程。

python实现DBSCAN 算法_第1张图片

算法实现

"""
    AUTHOR: chenyi
    DATE: 2021-11-13

    Density-Based Spatial Clustering of Applications with Noise
    In order to visualize the result in 2D,
    let's assume that the dimension of the points is always 2.

    Core Params:
        - eps: the neighbourhood radius; two points are neighbours when their distance is less than eps.
        - minPoints: the minimum number of neighbours a point needs within eps to be a core point.

    Point {
        visited -> bool: indicate whether a point is visited.
        coords -> (float, float): describe a point's coordinates.
        cluster -> int: indicate to which cluster the point belongs.
    }

"""

import random
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np

class Point:
    """A 2-D sample together with its clustering state.

    Attributes:
        visited: True once the point has been given a label via
            ``set_cluster``.
        coords: ``(x, y)`` position of the point.
        cluster: label assigned by ``set_cluster``; ``None`` until then.
    """

    def __init__(self, coords=None):
        """Create an unlabelled point.

        Args:
            coords: optional ``(x, y)`` pair. When omitted, a random position
                is drawn with each coordinate equal to ``random.random() * 10``.
        """
        # Python has no constructor overloading, so a single __init__ with an
        # optional argument serves both the "random" and the "explicit
        # coordinates" use cases.
        if coords is None:
            coords = (random.random() * 10, random.random() * 10)
        self.visited = False
        self.coords = coords
        self.cluster = None

    def cal_distance(self, other):
        """Return the Euclidean distance between this point and ``other``."""
        dx = other.coords[0] - self.coords[0]
        dy = other.coords[1] - self.coords[1]
        return sqrt(dx ** 2 + dy ** 2)

    def set_cluster(self, id):
        """Store ``id`` as the point's cluster label and mark it visited."""
        self.visited = True
        self.cluster = id

    def to_string(self):
        """Return a tab-separated, human-readable summary of the point."""
        return "cluster:{}\tvisited: {}\tcoords:{}".format(self.cluster, self.visited, self.coords)

class DBSCAN:
    """Density-based clustering over a set of randomly generated 2-D points.

    Two points are neighbours when their distance is strictly less than
    ``eps``. A point is a core point when it has at least ``minPoints``
    neighbours (the point itself is not counted).

    Attributes:
        points: the ``Point`` objects being clustered.
        eps: neighbourhood radius.
        minPoints: number of neighbours required for a core point.
        cluster: label that ``DFS`` writes onto the points it reaches; the
            caller increments it after each cluster has been expanded, and
            ``draw`` plots the labels ``0 .. cluster - 1``.
    """

    NOISE = -1  # label used for noise points (see ``draw``)

    def __init__(self, n, eps, minPoints):
        """Generate ``n`` random points and store the clustering parameters."""
        self.points = [Point() for _ in range(n)]
        self.eps = eps
        self.minPoints = minPoints
        self.cluster = 0

    def is_core(self, point: "Point"):
        """Return True when ``point`` has at least ``minPoints`` neighbours."""
        # Reuse the neighbour query so both methods always agree on what
        # "within eps" means.
        return len(self.find_all_adjacent_points(point)) >= self.minPoints

    def pick_unvisited_point(self):
        """Return a randomly chosen point that has not been visited yet.

        Raises:
            IndexError: if every point has already been visited
                (``random.choice`` on an empty list).
        """
        unvisited = [p for p in self.points if not p.visited]
        return random.choice(unvisited)

    def is_all_visited(self):
        """Return True when every point has been visited."""
        return all(p.visited for p in self.points)

    def find_all_adjacent_points(self, point: "Point"):
        """Return all other points closer than ``eps`` to ``point``.

        ``point`` itself is excluded by identity; the result keeps the order
        of ``self.points``.
        """
        return [
            other
            for other in self.points
            if other is not point and point.cal_distance(other) < self.eps
        ]

    def _is_claimable(self, point: "Point"):
        """Return True when ``point`` may still receive the current label."""
        # A point labelled as noise is not a core point, but it can still be a
        # border point of a cluster found later, so it stays claimable.
        return not point.visited or point.cluster == self.NOISE

    def DFS(self, core: "Point"):
        """Label everything density-reachable from ``core`` with ``self.cluster``.

        The seed is labelled as well; otherwise a core point whose neighbours
        are all non-core would never be marked visited. The expansion uses an
        explicit stack instead of recursion so that large clusters cannot
        exhaust the interpreter's recursion limit.
        """
        if self._is_claimable(core):
            core.set_cluster(self.cluster)

        pending = [core]
        while pending:
            current = pending.pop()
            for neighbour in self.find_all_adjacent_points(current):
                if not self._is_claimable(neighbour):
                    continue
                neighbour.set_cluster(self.cluster)
                # Only core points extend the cluster further; border points
                # are labelled but not expanded.
                if self.is_core(neighbour):
                    pending.append(neighbour)

    def print_result(self):
        """Print one summary line per point."""
        for point in self.points:
            print(point.to_string())

    def draw(self):
        """Plot every cluster, an eps-circle around each member, and the noise.

        The axes are fixed to ``[0, 10] x [0, 10]``, the range in which the
        random points are generated.
        """
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.xlim(0, 10)
        plt.ylim(0, 10)

        for cluster_id in range(self.cluster):
            members = [p.coords for p in self.points if p.cluster == cluster_id]
            xs = [coord[0] for coord in members]
            ys = [coord[1] for coord in members]
            plt.scatter(xs, ys)
            for cx, cy in zip(xs, ys):
                circ = plt.Circle((cx, cy), self.eps, fill=False)  # center, radius
                ax.add_patch(circ)

        # draw noise
        noise = [p.coords for p in self.points if p.cluster == self.NOISE]
        xs = [coord[0] for coord in noise]
        ys = [coord[1] for coord in noise]
        plt.scatter(xs, ys, c='r', marker='x')

        plt.show()

def main(n=10, eps=2, min_points=1):
    """Cluster ``n`` random points with DBSCAN, then print and plot the result.

    Args:
        n: number of random points to generate.
        eps: neighbourhood radius passed to ``DBSCAN``.
        min_points: number of neighbours a point needs to be a core point.
    """
    model = DBSCAN(n, eps, min_points)

    while not model.is_all_visited():
        point = model.pick_unvisited_point()
        if model.is_core(point):
            # Label the seed before expanding from it. This guarantees that
            # every iteration marks at least one point as visited, so the
            # loop terminates even when none of the seed's neighbours is a
            # core point.
            point.set_cluster(model.cluster)
            model.DFS(point)
            model.cluster += 1
        else:
            point.set_cluster(-1)   # -1 means the point belongs to noise.

    model.print_result()
    model.draw()

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

N=10 eps=2 minPoints=1 的运行结果
python实现DBSCAN 算法_第2张图片
N=20 dim =2 num=2的运行结果
python实现DBSCAN 算法_第3张图片

你可能感兴趣的:(python,算法)