基于密度的聚类算法(DBSCAN)的java实现

         k-means和EM算法适合发现凸型的聚类(大概就是圆形,椭圆形比较规则的类),而对于非凸型的聚类,这两种方法就很难找到准确的聚类了。比如如下图:

       基于密度的聚类算法(DBSCAN)的java实现_第1张图片

可能来自不同类的点反而比来自相同类的点靠得更近。

  太多的原理和算法介绍,大家可以找到很多相关资料(推荐《Data Mining and Analysis: Fundamental Concepts and Algorithms》)。下面的代码是对基于密度聚类算法的一种实现,希望能够帮助想要学习和理解这种算法的同学。

import java.util.ArrayList;
import java.util.List;
/**
 * Density-based clustering (DBSCAN) demo on a fixed two-dimensional data set.
 *
 * <p>A point is a <em>core</em> point when at least {@code MIN_PTS} points of the
 * data set (the point itself included) lie within {@code RADIUS} of it. Clusters
 * are grown from core points by repeatedly absorbing every point that lies within
 * {@code RADIUS} of a core already in the cluster. Points that end up in no
 * cluster are reported as noise.
 *
 * <p>Points are identified by array reference, not by coordinate value: every
 * {@code double[]} stored in {@code cores} and {@code clusters} is one of the
 * rows of the data set, so {@code List.contains}/{@code List.remove} (which use
 * {@code Object.equals}) find them.
 *
 * @author aturbo
 */
public class MyDBSCAN {
    /** Sample data; each row is one point given as {x, y}. */
    private static final double[][] POINTS = {
        {3.0, 8.04},
        {4.0, 7.95},
        {4.4, 8.58},
        {3.6, 8.81},
        {5.0, 8.33},
        {6.0, 6.96},
        {17.0, 4.24},
        {18.0, 4.26},
        {16.0, 3.84},
        {17.0, 4.82},
        {15.0, 5.68},
        {17.0, 5.68},
        {11.0, 10.68},
        {13.0, 9.68},
        {11.8, 10.0},
        {12.0, 11.18},
        {8.0, 12.0},
        {9.2, 9.68},
        {8.8, 11.2},
        {10.0, 11.4},
        {7.0, 9.68},
        {6.1, 10.68},
        {5.70, 1.68},
        {5.0, 2.68},
        {12.0, 0.68}
    };

    /**
     * Minimum number of points (the point itself included) inside the
     * neighbourhood for a point to be a core point.
     *
     * <p>The earlier version used 6 but counted every neighbour once per
     * coordinate, i.e. twice for this 2-D data, so the effective threshold was 3.
     * The double counting is gone and the constant now states the real threshold.
     */
    private static final int MIN_PTS = 3;

    /** Neighbourhood radius (epsilon). */
    private static final double RADIUS = 1.3;

    /** Clusters found so far; each cluster is a list of rows of the data set. */
    private static List<List<double[]>> clusters;

    /** Core points that have not been assigned to a cluster yet. */
    private static List<double[]> cores;

    /**
     * Computes the Euclidean distance between two points.
     *
     * @param point1 first point
     * @param point2 second point, same dimension as {@code point1}
     * @return the Euclidean distance between the two points
     * @throws IllegalArgumentException if the points differ in dimension
     */
    private static double countEurDistance(double[] point1, double[] point2) {
        if (point1.length != point2.length) {
            throw new IllegalArgumentException(
                    "dimension mismatch: " + point1.length + " vs " + point2.length);
        }
        double squaredSum = 0.0;
        for (int i = 0; i < point1.length; i++) {
            double diff = point1[i] - point2[i];
            squaredSum += diff * diff;
        }
        return Math.sqrt(squaredSum);
    }

    /**
     * Finds the core points of a data set.
     *
     * @param points the data set, one point per row
     * @param minpts minimum neighbourhood size (point itself included)
     * @param radius neighbourhood radius
     * @return the rows of {@code points} that are core points, in data-set order
     */
    private static List<double[]> findCores(double[][] points, int minpts, double radius) {
        List<double[]> found = new ArrayList<double[]>();
        for (double[] candidate : points) {
            int neighbours = 0;
            // Each neighbour is counted exactly once; the candidate counts itself
            // because its distance to itself is 0.
            for (double[] other : points) {
                if (countEurDistance(candidate, other) <= radius) {
                    neighbours++;
                }
            }
            if (neighbours >= minpts) {
                found.add(candidate);
            }
        }
        return found;
    }

    /**
     * Builds {@code clusters}: every still-unassigned core point seeds a new
     * cluster, which is then expanded to everything density-connected to it.
     */
    private static void putCoreToCluster() {
        clusters = new ArrayList<List<double[]>>();
        // densityConnected removes every core it absorbs from `cores`, so an index
        // that advances while the list shrinks would skip cores. Always take the
        // first remaining core until none are left.
        while (!cores.isEmpty()) {
            double[] seed = cores.get(0);
            List<double[]> cluster = new ArrayList<double[]>();
            cluster.add(seed);
            clusters.add(cluster);
            densityConnected(POINTS, seed, clusters.size() - 1);
        }
    }

    /**
     * Expands cluster {@code clusterNum} with every point density-connected to
     * {@code core}. The core must already be a member of that cluster.
     *
     * @param points     the data set, one point per row
     * @param core       the core point to expand from
     * @param clusterNum index of the target cluster in {@code clusters}
     */
    private static void densityConnected(double[][] points, double[] core, int clusterNum) {
        List<double[]> cluster = clusters.get(clusterNum);
        // Work list of cores whose neighbourhoods still have to be scanned; an
        // explicit list keeps the stack depth constant however large the cluster is.
        List<double[]> frontier = new ArrayList<double[]>();
        frontier.add(core);
        cores.remove(core); // a processed core never seeds another cluster
        for (int head = 0; head < frontier.size(); head++) {
            double[] current = frontier.get(head);
            for (double[] candidate : points) {
                if (isClustered(candidate)) {
                    continue; // already assigned to some cluster
                }
                if (countEurDistance(candidate, current) > RADIUS) {
                    continue; // not a neighbour of the current core
                }
                cluster.add(candidate);
                // Only cores propagate the cluster; border points are absorbed
                // but not expanded.
                if (cores.remove(candidate)) {
                    frontier.add(candidate);
                }
            }
        }
    }

    /**
     * Tells whether a point already belongs to one of the clusters.
     *
     * @param point a row of the data set
     * @return {@code true} if some cluster contains this very array
     */
    private static boolean isClustered(double[] point) {
        for (List<double[]> cluster : clusters) {
            if (cluster.contains(point)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Clusters the sample data and prints every cluster followed by the noise
     * points to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        cores = findCores(POINTS, MIN_PTS, RADIUS);
        putCoreToCluster();

        int i = 0;
        for (List<double[]> cluster : clusters) {
            System.out.println("cluster " + i++ + ":");
            for (double[] point : cluster) {
                System.out.println("[" + point[0] + "," + point[1] + "]");
            }
        }
        // Membership is evaluated afresh for every point, so an earlier clustered
        // point cannot suppress the noise report of a later one.
        for (double[] point : POINTS) {
            if (!isClustered(point)) {
                System.out.println("noise point:" + "[" + point[0] + "," + point[1] + "]");
            }
        }
    }
}

 具体算法流程:

  基于密度的聚类算法(DBSCAN)的java实现_第2张图片


参考文献:

   《Data Mining and Analysis: Fundamental Concepts and Algorithms》

你可能感兴趣的:(数据挖掘)