1. 将每个对象初始化为一个簇
2. 将距离最近的两个簇合并,使簇的总数量减一
3. 重复第二步,直至达到理想的簇数量
由于AGNES算法原理较为简单,我将使用Java泛型和函数式接口进行编写,以提高代码的重用性。
(将代码稍加修改后,还可以用于构建哈夫曼树。)
import java.util.ArrayList;
import java.util.function.BiFunction;
/**
 * AGNES (agglomerative nesting) hierarchical clustering over items of type {@code E}.
 *
 * <p>Every input item starts in its own cluster. {@link #clustering(int)} then repeatedly
 * merges the two clusters with the smallest average pairwise distance until the requested
 * number of clusters remains.
 *
 * @param <E> type of the items being clustered
 */
public class Agnes<E> {
    /** Current clusters; merged in place by {@link #clustering(int)}. */
    ArrayList<Cluster<E>> clusterList;
    /** Distance measure between two items; set through {@link #setDistance(Distance)}. */
    Distance<E> distance;
    /** Not read or written by any method of this class; kept so existing references still compile. */
    int num;

    /**
     * Wraps every element of {@code dataList} in its own single-item cluster.
     *
     * @param dataList items to cluster
     */
    public Agnes(ArrayList<E> dataList) {
        clusterList = createClusterList(dataList);
    }

    /**
     * Sets the distance measure used to compare items.
     *
     * @param distance distance measure handed to {@code Cluster.averageDistance}
     */
    public void setDistance(Distance<E> distance) {
        this.distance = distance;
    }

    /**
     * Runs AGNES clustering: merges the two nearest clusters until only {@code num} remain.
     *
     * <p>Nothing is merged when {@code num <= 0} or when there are already {@code num} or
     * fewer clusters.
     *
     * @param num desired number of clusters
     * @return the internal cluster list (not a copy) after merging
     */
    public ArrayList<Cluster<E>> clustering(int num) {
        // '>' instead of '!=': asking for more clusters than exist must not keep merging
        // down to a single cluster and then dereference the null from nearestCluster.
        while (num > 0 && clusterList.size() > num) {
            ArrayList<Cluster<E>> nearest = nearestCluster(clusterList, distance);
            Cluster<E> kept = nearest.get(0);
            Cluster<E> absorbed = nearest.get(1);
            kept.addAll(absorbed);
            clusterList.remove(absorbed);
        }
        return clusterList;
    }

    /**
     * Wraps each item in its own single-element cluster.
     *
     * @param list items to wrap
     * @return a new list with one cluster per item, in the same order
     */
    public ArrayList<Cluster<E>> createClusterList(ArrayList<E> list) {
        ArrayList<Cluster<E>> wrapped = new ArrayList<Cluster<E>>();
        for (E item : list) {
            // Parameterized instead of the raw 'new Cluster(item)'.
            wrapped.add(new Cluster<E>(item));
        }
        return wrapped;
    }

    /**
     * Finds the pair of clusters with the smallest average distance.
     *
     * @param clusterList clusters to search
     * @param distance    distance measure between items
     * @return a two-element list holding the nearest pair (the cluster with the lower index
     *         first), or {@code null} when fewer than two clusters are given
     */
    public ArrayList<Cluster<E>> nearestCluster(ArrayList<Cluster<E>> clusterList, Distance<E> distance) {
        if (clusterList.size() < 2) {
            return null;
        }
        Cluster<E> first = null;
        Cluster<E> second = null;
        double best = Double.MAX_VALUE;
        for (int index = 0; index < clusterList.size() - 1; index++) {
            for (int pos = index + 1; pos < clusterList.size(); pos++) {
                // Compare cluster 'index' with cluster 'pos' (not with itself).
                double value = Cluster.averageDistance(clusterList.get(index),
                        clusterList.get(pos), distance).doubleValue();
                // 'first == null' makes the first pair a candidate even if its distance is
                // not below Double.MAX_VALUE, so a pair is always returned.
                if (first == null || value < best) {
                    best = value;
                    first = clusterList.get(index);
                    second = clusterList.get(pos);
                }
            }
        }
        ArrayList<Cluster<E>> pair = new ArrayList<Cluster<E>>();
        pair.add(first);
        pair.add(second);
        return pair;
    }
}
/**
 * A cluster: a mutable group of items of type {@code E}.
 *
 * @param <E> type of the items held by the cluster
 */
class Cluster<E> {
    /** Members of this cluster, in insertion order. */
    private final ArrayList<E> items = new ArrayList<E>();

    /**
     * Creates a cluster holding the given items (possibly none).
     *
     * @param e initial members
     */
    @SafeVarargs // the varargs array is only iterated, never stored or exposed
    public Cluster(E... e) {
        for (E item : e) {
            items.add(item);
        }
    }

    /**
     * Computes the average distance between two clusters: the sum of the distances of every
     * cross pair divided by the number of such pairs.
     *
     * @param c1       first cluster
     * @param c2       second cluster
     * @param distance distance measure between items
     * @return the average distance; NaN when either cluster is empty (0.0 / 0.0)
     */
    public static <T> Number averageDistance(Cluster<T> c1, Cluster<T> c2, Distance<T> distance) {
        double total = allDistance(c1, c2, distance).doubleValue();
        // Multiply as double so the pair count cannot overflow int for large clusters.
        double pairCount = (double) c1.items.size() * c2.items.size();
        return total / pairCount;
    }

    /**
     * Sums the distances between every item of {@code c1} and every item of {@code c2}.
     *
     * @param c1       first cluster
     * @param c2       second cluster
     * @param distance distance measure between items
     * @return the sum of all cross-pair distances (0 when either cluster is empty)
     */
    public static <T> Number allDistance(Cluster<T> c1, Cluster<T> c2, Distance<T> distance) {
        double number = 0;
        for (T t : c1.items) {
            for (T u : c2.items) {
                number += distance.apply(t, u).doubleValue();
            }
        }
        return number;
    }

    /**
     * Appends every member of {@code cluster} to this cluster; {@code cluster} is not modified.
     *
     * @param cluster cluster whose members are copied in
     */
    public void addAll(Cluster<E> cluster) {
        this.items.addAll(cluster.items);
    }

    /**
     * Appends a single item to this cluster.
     *
     * @param e item to add
     */
    public void add(E e) {
        this.items.add(e);
    }

    /** Returns {@code "Cluster: "} followed by the list representation of the members. */
    @Override
    public String toString() {
        return "Cluster: " + items.toString();
    }
}
/**
 * Distance measure between two items of type {@code T}.
 *
 * <p>Declared as an abstract class implementing {@code BiFunction<T, T, Number>}, so
 * implementations are written as (anonymous) subclasses that override {@link #apply}.
 *
 * @param <T> type of the items being compared
 */
abstract class Distance<T> implements BiFunction<T, T, Number> {

    /**
     * Computes the distance between two items.
     *
     * @param t first item
     * @param u second item
     * @return the distance between {@code t} and {@code u}
     */
    @Override
    public abstract Number apply(T t, T u);
}
/**
 * Test data: a 2-D integer point tagged with an id.
 */
class Data {
    /** Identifier shown in {@link #toString()}. */
    int id;
    /** X coordinate. */
    int x;
    /** Y coordinate. */
    int y;

    /**
     * Creates a point.
     *
     * @param id identifier of the point
     * @param x  x coordinate
     * @param y  y coordinate
     */
    public Data(int id, int x, int y) {
        this.x = x;
        this.y = y;
        this.id = id;
    }

    /** Formats the point as {@code Data <id>:(<x>,<y>)}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Data ");
        text.append(id);
        text.append(":(").append(x);
        text.append(",").append(y);
        text.append(")");
        return text.toString();
    }
}
数据选用《数据挖掘原理与算法》(第三版)中的示例数据
距离公式选用欧氏距离(欧几里得距离)公式
生成两个聚类
/**
 * Demo: wraps eight sample 2-D points in an {@code Agnes} instance, installs a Euclidean
 * distance and asks for two clusters.
 */
public static void main(String[] args) {
    // One {id, x, y} triple per sample point, in insertion order.
    int[][] samples = {
        {1, 1, 1},
        {2, 1, 2},
        {3, 2, 1},
        {4, 2, 2},
        {5, 3, 4},
        {6, 3, 5},
        {7, 4, 5},
        {8, 4, 5},
    };
    ArrayList<Data> dataList = new ArrayList<Data>();
    for (int[] sample : samples) {
        dataList.add(new Data(sample[0], sample[1], sample[2]));
    }

    // Euclidean distance: square root of the summed squared coordinate differences.
    Distance<Data> euclidean = new Distance<Data>() {
        public Number apply(Data t, Data u) {
            int dx = t.x - u.x;
            int dy = t.y - u.y;
            return Math.sqrt(dx * dx + dy * dy);
        }
    };

    Agnes<Data> agnes = new Agnes<Data>(dataList);
    agnes.setDistance(euclidean);
    ArrayList<Cluster<Data>> list = agnes.clustering(2);
}
初始数据8个
size8
Cluster: [Data 1:(1,1)]
Cluster: [Data 2:(1,2)]
Cluster: [Data 3:(2,1)]
Cluster: [Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size7
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1)]
Cluster: [Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size6
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size5
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size4
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5), Data 8:(4,5)]
size3
Cluster: [Data 1:(1,1), Data 2:(1,2), Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5), Data 8:(4,5)]
size2
Cluster: [Data 1:(1,1), Data 2:(1,2), Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5), Data 7:(4,5), Data 8:(4,5)]
最终结果符合预期