算法:DBScan,基于密度的聚类算法
输入:
D:一个包含n个数据的数据集
r:半径参数
minPts:邻域密度阈值
输出:基于密度的聚类集合
标记D中所有的点为unvisited
for each p in D
    if p.visit == unvisited
        标记p为visited
        找出与点p距离不大于r的所有点集合N
        If N.size() < minPts
            标记点p为噪声点
        Else
            新建一个簇,并将p聚到该簇
            for each p' in N
                If p'.visit == unvisited
                    标记p'为visited
                    找出与点p'距离不大于r的所有点集合N'
                    If N'.size() >= minPts
                        将集合N'加入集合N中去
                    End if
                End if
                If p'未被聚到某个簇
                    将p'聚到当前簇
                    If p'被标记为噪声点
                        将p'取消标记为噪声点
                    End If
                End If
            End for
        End if
    End if
End for
Point.java 定义点,其中距离计算采用欧氏距离
package Cluster.DBScan;
/**
* Created by Jason on 2016/4/17.
*/
public class Point {
private double x;
private double y;
private boolean isVisit;
private int cluster;
private boolean isNoised;
public Point(double x,double y) {
this.x = x;
this.y = y;
this.isVisit = false;
this.cluster = 0;
this.isNoised = false;
}
public double getDistance(Point point) {
return Math.sqrt((x-point.x)*(x-point.x)+(y-point.y)*(y-point.y));
}
public void setX(double x) {
this.x = x;
}
public double getX() {
return x;
}
public void setY(double y) {
this.y = y;
}
public double getY() {
return y;
}
public void setVisit(boolean isVisit) {
this.isVisit = isVisit;
}
public boolean getVisit() {
return isVisit;
}
public int getCluster() {
return cluster;
}
public void setNoised(boolean isNoised) {
this.isNoised = isNoised;
}
public void setCluster(int cluster) {
this.cluster = cluster;
}
public boolean getNoised() {
return this.isNoised;
}
@Override
public String toString() {
return x+" "+y+" "+cluster+" "+(isNoised?1:0);
}
}
DBScan.java
package Cluster.DBScan;
import java.util.ArrayList;
/**
* Created by Jason on 2016/4/17.
*/
public class DBScan {
private double radius;
private int minPts;
public DBScan(double radius,int minPts) {
this.radius = radius;
this.minPts = minPts;
}
public void process(ArrayList points) {
int size = points.size();
int idx = 0;
int cluster = 1;
while (idx//choose an unvisited point
if (!p.getVisit()) {
p.setVisit(true);//set visited
ArrayList adjacentPoints = getAdjacentPoints(p, points);
//set the point which adjacent points less than minPts noised
if (adjacentPoints != null && adjacentPoints.size() < minPts) {
p.setNoised(true);
} else {
p.setCluster(cluster);
for (int i = 0; i < adjacentPoints.size(); i++) {
Point adjacentPoint = adjacentPoints.get(i);
//only check unvisited point, cause only unvisited have the chance to add new adjacent points
if (!adjacentPoint.getVisit()) {
adjacentPoint.setVisit(true);
ArrayList adjacentAdjacentPoints = getAdjacentPoints(adjacentPoint, points);
//add point which adjacent points not less than minPts noised
if (adjacentAdjacentPoints != null && adjacentAdjacentPoints.size() >= minPts) {
adjacentPoints.addAll(adjacentAdjacentPoints);
}
}
//add point which doest not belong to any cluster
if (adjacentPoint.getCluster() == 0) {
adjacentPoint.setCluster(cluster);
//set point which marked noised before non-noised
if (adjacentPoint.getNoised()) {
adjacentPoint.setNoised(false);
}
}
}
cluster++;
}
}
}
}
private ArrayList getAdjacentPoints(Point centerPoint,ArrayList points) {
ArrayList adjacentPoints = new ArrayList();
for (Point p:points) {
//include centerPoint itself
double distance = centerPoint.getDistance(p);
if (distance<=radius) {
adjacentPoints.add(p);
}
}
return adjacentPoints;
}
}
Data.java 随机模拟产生数据
package Cluster.DBScan;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Random;
/**
 * Builds sample data sets for the clustering demo and writes results to disk.
 *
 * <p>Created by Jason on 2016/4/17.
 */
public class Data {
    /**
     * Rounds coordinates to at most three fraction digits. The symbols are
     * pinned to {@link Locale#ROOT} and grouping is not part of the pattern,
     * so the formatted text can always be parsed back as a {@code double}
     * regardless of the JVM's default locale.
     */
    private static final DecimalFormat COORDINATE_FORMAT =
            new DecimalFormat("0.###", DecimalFormatSymbols.getInstance(Locale.ROOT));

    /**
     * Generates two curves of {@code size / 2} points each (integer
     * division, so an odd {@code size} yields {@code size - 1} points):
     * first {@code y = sin(x)}, then {@code y = cos(x)} with {@code x}
     * shifted by 1.5. All coordinates are rounded by {@link #format(double)}.
     *
     * @param size requested total number of points
     * @return the generated points, sine curve first
     */
    public static ArrayList<Point> generateSinData(int size) {
        ArrayList<Point> points = new ArrayList<>(size);
        int half = size / 2;
        for (int i = 0; i < half; i++) {
            double x = format(Math.PI / half * (i + 1));
            double y = format(Math.sin(x));
            points.add(new Point(x, y));
        }
        for (int i = 0; i < half; i++) {
            double x = format(1.5 + Math.PI / half * (i + 1));
            double y = format(Math.cos(x));
            points.add(new Point(x, y));
        }
        return points;
    }

    /**
     * Returns a fixed, hand-picked data set of 19 points.
     *
     * @return a new list containing the points
     */
    public static ArrayList<Point> generateSpecialData() {
        ArrayList<Point> points = new ArrayList<>();
        points.add(new Point(2, 2));
        points.add(new Point(3, 1));
        points.add(new Point(3, 4));
        points.add(new Point(3, 14));
        points.add(new Point(5, 3));
        points.add(new Point(8, 3));
        points.add(new Point(8, 6));
        points.add(new Point(9, 8));
        points.add(new Point(10, 4));
        points.add(new Point(10, 7));
        points.add(new Point(10, 10));
        points.add(new Point(10, 14));
        points.add(new Point(11, 13));
        points.add(new Point(12, 7));
        points.add(new Point(12, 15));
        points.add(new Point(14, 7));
        points.add(new Point(14, 9));
        points.add(new Point(14, 15));
        points.add(new Point(15, 8));
        return points;
    }

    /**
     * Writes one line per point, using {@code Point.toString()}, to the
     * given file. An {@link IOException} is reported on standard error and
     * otherwise ignored, so a failed write does not abort the caller.
     *
     * @param points the points to write
     * @param path   destination file; overwritten if it exists
     */
    public static void writeData(ArrayList<Point> points, String path) {
        // try-with-resources closes the writer even when a write fails
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(path))) {
            for (Point point : points) {
                bw.write(point.toString() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Rounds {@code x} to at most three fraction digits. */
    private static double format(double x) {
        return Double.parseDouble(COORDINATE_FORMAT.format(x));
    }
}
Client.java 运行聚类算法
package Cluster.DBScan;
import java.util.ArrayList;
/**
 * Demo entry point: builds a data set, clusters it, prints every point and
 * writes the result to {@code data.txt}.
 *
 * <p>Created by Jason on 2016/4/17.
 */
public class Client {
    public static void main(String[] args) {
        // Alternative data set (sine/cosine curves) and its parameters:
        //ArrayList<Point> points = Data.generateSinData(200);
        //DBScan dbScan = new DBScan(0.6,4);
        ArrayList<Point> points = Data.generateSpecialData();
        DBScan dbScan = new DBScan(3, 3);
        dbScan.process(points);
        // each line is "x y cluster noise", as produced by Point.toString()
        for (Point p : points) {
            System.out.println(p);
        }
        Data.writeData(points, "data.txt");
    }
}
数据展示采用matlab绘制,
% Plot the clustering result stored in data.txt.
% Each row is "x y cluster noise"; cluster 0 means the point was not
% assigned to any cluster and is drawn as a black star.
a = importdata('data.txt');
palette = 'rbgmcy';   % cluster 1 -> red, cluster 2 -> blue, then g, m, c, y
m = size(a,1);
for i = 1:1:m
    c = a(i,3);
    if c >= 1
        % one colour per cluster id; ids beyond the palette wrap around
        % instead of being drawn as noise
        style = [palette(mod(c-1, numel(palette)) + 1) '.'];
    else
        style = 'k*';
    end
    plot(a(i,1), a(i,2), style);
    hold on;
end
grid on;
数据1,Data.generateSinData(200),聚类效果
数据2,Data.generateSpecialData(),聚类效果
不同颜色代表不同类,*代表噪声点
所有代码下载:https://github.com/lincolnmi/algorithms/tree/master/src/Cluster/DBScan
参考链接:
http://www.cnblogs.com/aijianiula/p/4339960.html
http://www.dataguru.cn/thread-18180-1-1.html