k-means和EM算法适合发现凸形的聚类(大致是圆形、椭圆形这类比较规则的形状),而对于非凸形的聚类,这两种方法就很难得到准确的聚类结果了。比如下图所示的情况:
来自不同类的点,可能反而比来自同一类的点靠得更近。
关于原理和算法的详细介绍,这里不再赘述,大家可以找到很多相关资料(推荐《Data Mining and Analysis: Fundamental Concepts and Algorithms》)。下面的代码是基于密度的聚类算法(DBSCAN)的一种实现,希望能够帮助想要学习和理解这种算法的同学。
import java.util.ArrayList;
import java.util.List;
/**
 * Small density-based clustering (DBSCAN-style) demo over a fixed set of 2-D points.
 *
 * <p>A point is a core point when at least {@code minpts} points (itself included) lie
 * within {@code radius} of it. Each cluster is grown from a core point by repeatedly
 * absorbing every still-unassigned point inside the radius of a core point that is
 * already in the cluster. Points that end up in no cluster are reported as noise.
 *
 * <p>NOTE(review): the published listing had lost every span between an opening and a
 * closing angle bracket (generic type arguments, several loop conditions, the tail of
 * {@code densityConnected} and the header of {@code main}). Those spans were
 * reconstructed from the surviving statements; compare against the author's original
 * listing if it is available.
 *
 * @author aturbo
 */
public class MyDBSCAN {

    /** Sample data set; each row holds the x and y coordinate of one point. */
    private static final double[][] points = {
        {3.0, 8.04},
        {4.0, 7.95},
        {4.4, 8.58},
        {3.6, 8.81},
        {5.0, 8.33},
        {6.0, 6.96},
        {17.0, 4.24},
        {18.0, 4.26},
        {16.0, 3.84},
        {17.0, 4.82},
        {15.0, 5.68},
        {17.0, 5.68},
        {11.0, 10.68},
        {13.0, 9.68},
        {11.8, 10.0},
        {12.0, 11.18},
        {8.0, 12.0},
        {9.2, 9.68},
        {8.8, 11.2},
        {10.0,11.4},
        {7.0, 9.68},
        {6.1, 10.68},
        {5.70, 1.68},
        {5.0, 2.68},
        {12.0, 0.68}
    };

    /**
     * Minimum number of points (the point itself included) that must lie within
     * {@link #radius} of a point for it to be a core point.
     *
     * <p>NOTE(review): the listing used 6 together with a loop that re-tested every pair
     * once per coordinate, which presumably counted each neighbour twice for this 2-D
     * data. Neighbours are now counted once, so 3 is the equivalent threshold - confirm
     * against the original listing.
     */
    private static final int minpts = 3;

    /** Neighbourhood radius used for both core detection and cluster expansion. */
    private static final double radius = 1.3;

    /** Clusters found so far; element {@code n} holds the points of cluster {@code n}. */
    private static List<List<double[]>> clusters;

    /** Core points that have not been assigned to a cluster yet. */
    private static List<double[]> cores;

    /**
     * Computes the Euclidean distance between two points.
     *
     * @param point1 first point
     * @param point2 second point, with the same number of coordinates as {@code point1}
     * @return the Euclidean distance between the two points
     * @throws IllegalArgumentException if the points have a different number of coordinates
     */
    private static double countEurDistance(double[] point1, double[] point2) {
        if (point1.length != point2.length) {
            throw new IllegalArgumentException(
                    "dimension mismatch: " + point1.length + " vs " + point2.length);
        }
        double squaredSum = 0.0;
        for (int i = 0; i < point1.length; i++) {
            double delta = point1[i] - point2[i];
            squaredSum += delta * delta;
        }
        return Math.sqrt(squaredSum);
    }

    /**
     * Finds the core points of a data set.
     *
     * @param points the data set, one point per row
     * @param minpts minimum neighbourhood size (the point itself included)
     * @param radius neighbourhood radius; a distance equal to the radius counts as inside
     * @return the core points in data-set order; the same array instances as in {@code points}
     */
    private static List<double[]> findCores(double[][] points, int minpts, double radius) {
        List<double[]> found = new ArrayList<>();
        for (double[] candidate : points) {
            // Each neighbour is counted exactly once, independent of the dimensionality.
            int neighbours = 0;
            for (double[] other : points) {
                if (countEurDistance(candidate, other) <= radius) {
                    neighbours++;
                }
            }
            if (neighbours >= minpts) {
                found.add(candidate);
            }
        }
        return found;
    }

    /**
     * Builds {@link #clusters} by seeding one cluster per still-unassigned core point and
     * expanding it through density-connected points.
     *
     * <p>{@link #densityConnected} removes every core it absorbs from {@link #cores}, so
     * the list is drained from the front until it is empty. Walking it with an
     * increasing index would skip entries as the list shrinks.
     */
    private static void putCoreToCluster() {
        clusters = new ArrayList<>();
        while (!cores.isEmpty()) {
            double[] seed = cores.get(0);
            int clusterNum = clusters.size();
            List<double[]> cluster = new ArrayList<>();
            cluster.add(seed);
            clusters.add(cluster);
            // Removes the seed (and every core reachable from it) from the pending list.
            densityConnected(points, seed, clusterNum);
        }
    }

    /**
     * Adds every unassigned point within {@link #radius} of {@code core} to the given
     * cluster and recurses into those of them that are core points themselves.
     *
     * @param points     the data set, one point per row
     * @param core       the core point whose neighbourhood is expanded; must already be in
     *                   the cluster
     * @param clusterNum index of the target cluster in {@link #clusters}
     */
    private static void densityConnected(double[][] points, double[] core, int clusterNum) {
        // A processed core must never seed or expand a cluster again.
        cores.remove(core);
        List<double[]> target = clusters.get(clusterNum);
        for (double[] candidate : points) {
            if (isClustered(candidate) || countEurDistance(candidate, core) > radius) {
                continue;
            }
            target.add(candidate);
            // remove() reports whether the neighbour was a pending core; if so, expand it too.
            if (cores.remove(candidate)) {
                densityConnected(points, candidate, clusterNum);
            }
        }
    }

    /**
     * Tells whether a point already belongs to some cluster. Membership is decided by
     * array identity, because arrays do not override {@code equals}.
     */
    private static boolean isClustered(double[] point) {
        for (List<double[]> cluster : clusters) {
            if (cluster.contains(point)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Runs the demo: clusters the sample points, prints each cluster, then prints every
     * point that belongs to no cluster as a noise point.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        cores = findCores(points, minpts, radius);
        putCoreToCluster();

        int clusterIndex = 0;
        for (List<double[]> cluster : clusters) {
            System.out.println("cluster " + clusterIndex++ + ":");
            for (double[] point : cluster) {
                System.out.println("[" + point[0] + "," + point[1] + "]");
            }
        }

        for (double[] point : points) {
            if (!isClustered(point)) {
                System.out.println("noise point:" + "[" + point[0] + "," + point[1] + "]");
            }
        }
    }
}
参考文献:
《Data Mining and Analysis: Fundamental Concepts and Algorithms》