AGNES Algorithm

AGNES Algorithm Code

Algorithm Principle

1. Initialize each object as its own cluster.
2. Merge the two closest clusters, reducing the total number of clusters by one.
3. Repeat step 2 until the desired number of clusters remains (a minimal sketch of these steps follows).
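Before the generic version below, here is a minimal, non-generic sketch of the three steps. This is my own simplified illustration (the AgnesSketch class exists only for this sketch), assuming 2D points and average-linkage Euclidean distance; it is not the implementation used in the rest of this post.

import java.util.ArrayList;
import java.util.List;

class AgnesSketch {
	static List<List<double[]>> agnes(List<double[]> points, int k) {
		List<List<double[]>> clusters = new ArrayList<>();
		for (double[] p : points) {                          // step 1: one cluster per point
			List<double[]> c = new ArrayList<>();
			c.add(p);
			clusters.add(c);
		}
		while (clusters.size() > k && clusters.size() > 1) { // step 3: repeat until k clusters remain
			int bestI = 0, bestJ = 1;
			double best = Double.MAX_VALUE;
			for (int i = 0; i < clusters.size() - 1; i++) {
				for (int j = i + 1; j < clusters.size(); j++) {
					double d = averageDistance(clusters.get(i), clusters.get(j));
					if (d < best) { best = d; bestI = i; bestJ = j; }
				}
			}
			clusters.get(bestI).addAll(clusters.remove(bestJ)); // step 2: merge the closest pair
		}
		return clusters;
	}
	// Average Euclidean distance over all point pairs of two clusters
	static double averageDistance(List<double[]> a, List<double[]> b) {
		double sum = 0;
		for (double[] p : a)
			for (double[] q : b)
				sum += Math.hypot(p[0] - q[0], p[1] - q[1]);
		return sum / (a.size() * b.size());
	}
}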

Since the AGNES algorithm is conceptually simple, I implement it with Java generics and a functional interface to improve code reuse.
(With minor modifications, the code could also be used to build a Huffman tree.)

import java.util.ArrayList;
import java.util.function.BiFunction;
public class Agnes<E> {
	ArrayList<Cluster<E>> clusterList;
	Distance<E> distance;
	//Set the distance metric used when comparing clusters
	public void setDistance(Distance<E> distance) {
		this.distance = distance;
	}
	public Agnes(ArrayList<E> dataList) {
		clusterList=createClusterList(dataList);
	}
	//Perform AGNES clustering until only num clusters remain
	public ArrayList<Cluster<E>> clustering(int num) {
		//Debug output: uncomment to print the cluster list after every merge
		//System.out.println("size"+clusterList.size());
		//clusterList.forEach(System.out::println);
		
		while(num>0&&clusterList.size()>num) {
			ArrayList<Cluster<E>> list = nearestCluster(clusterList,distance);
			Cluster<E> tmp1=list.get(0);
			Cluster<E> tmp2=list.get(1);
			tmp1.addAll(tmp2);
			clusterList.remove(tmp2);
			//System.out.println("size"+clusterList.size());
			//clusterList.forEach(System.out::println);
		}
		return clusterList;
	}
	//Wrap each data item in its own cluster
	public ArrayList<Cluster<E>> createClusterList(ArrayList<E> list) {
		ArrayList<Cluster<E>> clusterList=new ArrayList<Cluster<E>>();
		for(E item:list) {
			clusterList.add(new Cluster<E>(item));
		}
		return clusterList;
	}
	//Find the two nearest clusters
	public ArrayList<Cluster<E>> nearestCluster(ArrayList<Cluster<E>> clusterList,Distance<E> distance){
		if(clusterList.size()<2)
			return null;
		Cluster<E> tmp1=null;
		Cluster<E> tmp2=null;
		double whole=Double.MAX_VALUE;
		for(int index=0;index<clusterList.size()-1;index++) {
			for(int pos=index+1;pos<clusterList.size();pos++) {
				 double value = Cluster.averageDistance(clusterList.get(index),
						 clusterList.get(pos),distance).doubleValue();
				 if(value<whole) {
					 whole=value;
					 tmp1=clusterList.get(index);
					 tmp2=clusterList.get(pos);
				 }
			}
		}
		ArrayList<Cluster<E>> list=new ArrayList<Cluster<E>>();
		list.add(tmp1);
		list.add(tmp2);
		return list;
	}
}
//A cluster: a collection of data items
class  Cluster<E> {
	private ArrayList<E> items =new ArrayList<E>();
	public Cluster(E... e){
		for(E item:e) {
			items.add(item);
		}
	}
	//Average distance between two clusters (average linkage)
	public static <T> Number averageDistance(Cluster<T> c1,Cluster<T> c2,Distance<T> distance) {
		double number=allDistance(c1,c2,distance).doubleValue();
		return 1.0*number/(c1.items.size()*c2.items.size());
	}
	//Sum of the distances between every pair of items drawn from the two clusters
	public static <T> Number allDistance(Cluster<T> c1,Cluster<T> c2,Distance<T> distance) {
		double number=0;
		for(T t:c1.items) {
			for(T u:c2.items) {
				number+=distance.apply(t, u).doubleValue();
			}
		}
		return number;
	}
	public void  addAll(Cluster<E> cluster) {
		this.items.addAll(cluster.items);
	}
	public void  add(E e) {
		this.items.add(e);
	}
	public String toString() {
		return "Cluster:  "+items.toString();
	}
}
//Distance metric, declared as a functional interface so it can be supplied as a lambda or an anonymous class
@FunctionalInterface
interface Distance<T> extends BiFunction<T,T,Number>{
}

Testing with Data

//Test data class: a 2D point with an id
class Data{
	int id,x, y;
	public Data(int id,int x,int y) {
		this.id=id;
		this.x=x;
		this.y=y;
	}
	public String toString() {
		return "Data "+id+":("+x+","+y+")";
	}
}

The test data is taken from 《数据挖掘原理与算法》 (Principles and Algorithms of Data Mining), 3rd edition.
The distance metric is the Euclidean distance, d = sqrt((x1-x2)^2 + (y1-y2)^2).
The goal is to produce two clusters.

	public static void main(String[] args) {
		ArrayList<Data> dataList =new ArrayList<Data>();
		dataList.add(new Data(1,1,1));
		dataList.add(new Data(2,1,2));
		dataList.add(new Data(3,2,1));
		dataList.add(new Data(4,2,2));
		dataList.add(new Data(5,3,4));
		dataList.add(new Data(6,3,5));
		dataList.add(new Data(7,4,5));
		dataList.add(new Data(8,4,5));
		
		Agnes<Data> agnes=new Agnes<Data>(dataList);
		agnes.setDistance(new Distance<Data>() {
			public Number apply(Data t, Data u) {
				return Math.sqrt((t.x-u.x)*(t.x-u.x)+(t.y-u.y)*(t.y-u.y));
			}
		});
		ArrayList<Cluster<Data>> list = agnes.clustering(2);
	}
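Because the clustering code depends only on the Distance interface, a different metric can be plugged in without touching Agnes or Cluster. As a hypothetical variation (not part of the original test), the following lines could be added to main to re-cluster the same data with the Manhattan distance, supplied as a lambda (possible because Distance is declared above as a functional interface):

		//Hypothetical variation: re-cluster the same data with Manhattan (city-block) distance
		Agnes<Data> manhattanAgnes = new Agnes<Data>(dataList);
		manhattanAgnes.setDistance((t, u) -> Math.abs(t.x - u.x) + Math.abs(t.y - u.y));
		ArrayList<Cluster<Data>> manhattanResult = manhattanAgnes.clustering(2);
		manhattanResult.forEach(System.out::println);

This is the reuse benefit mentioned at the beginning: only the metric changes, while the clustering and cluster classes stay the same.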

Output Analysis

The trace below comes from the Euclidean-distance test above, with the debug print statements inside clustering() uncommented. It starts from the 8 initial data points:
size8
Cluster: [Data 1:(1,1)]
Cluster: [Data 2:(1,2)]
Cluster: [Data 3:(2,1)]
Cluster: [Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]

size7
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1)]
Cluster: [Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]

size6
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4)]
Cluster: [Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size5
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5)]
Cluster: [Data 8:(4,5)]
size4
Cluster: [Data 1:(1,1), Data 2:(1,2)]
Cluster: [Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5), Data 8:(4,5)]
size3
Cluster: [Data 1:(1,1), Data 2:(1,2), Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5)]
Cluster: [Data 7:(4,5), Data 8:(4,5)]
size2
Cluster: [Data 1:(1,1), Data 2:(1,2), Data 3:(2,1), Data 4:(2,2)]
Cluster: [Data 5:(3,4), Data 6:(3,5), Data 7:(4,5), Data 8:(4,5)]

The final result matches the expected grouping: points 1-4 form one cluster and points 5-8 form the other.

Algorithm Analysis

The time complexity is at least O(n²); this naive implementation is closer to O(n³), because every merge step recomputes the average distance between all remaining cluster pairs. It is therefore unsuitable for large datasets.

When several cluster pairs lie at exactly the same distance, the choice of which pair to merge is ambiguous; this implementation simply keeps the first closest pair it encounters.

Merges cannot be undone (there is no backtracking), and the algorithm does not scale well.
