birch聚类算法的原理与实现

参考百度百科http://baike.baidu.com/link?url=LDYen7bEqt8o2l5mUrnZjQk1topFi36-MwLuhjuGf-1z4sQFtFq1xCEe0TCJwYVjGbu0C6cpuVMFIxNglvSnoa

外加http://www.cnblogs.com/zhangchaoyang/articles/2200800.html

学习birch聚类最好有B-树的知识

结合了B-树的特性,birch算法适合于处理大数据。

原因是:

(1)CF 结构概括了簇的基本信息,并且是高度压缩的:它存储的聚类信息远少于实际的数据点。每个新添加的数据点不再作为个体保存,而是把自身的信息融入所属的簇中

(2)增量式的学习方法,不用一次将数据全部加载到内存,可以一边添加数据一边进行学习


下面是我的实现


// birch-cluster.cpp : 定义控制台应用程序的入口点。
//

///************birch-cluster*************///
///*******   author Marshall     ********///
///*******   2015.9.18           ********///
///*******   version 1.0         ********///


#include "stdafx.h"
#include <time.h>

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#define BirchType int

using namespace std;



// Element-wise sum of two equally sized vectors.
// aa is taken by value and doubles as the accumulator that is returned.
// bb is only read, so it is a const reference; this also allows the right
// operand to be a temporary (e.g. the result of another operator).
vector<BirchType> operator+(vector<BirchType>aa, const vector<BirchType>&bb){
	_ASSERTE(aa.size() == bb.size());
	for (size_t i = 0; i < aa.size(); i++)
		aa[i] += bb[i];
	return aa;
}

// Element-wise (Hadamard) product of two equally sized vectors.
// aa is taken by value and doubles as the accumulator that is returned.
// bb is only read, so it is a const reference; this also allows the right
// operand to be a temporary (e.g. the result of another operator).
vector<BirchType> operator*(vector<BirchType>aa, const vector<BirchType>&bb){
	_ASSERTE(aa.size() == bb.size());
	for (size_t i = 0; i < aa.size(); i++)
		aa[i] *= bb[i];
	return aa;
}

// Element-wise difference (aa - bb) of two equally sized vectors.
// aa is taken by value and doubles as the accumulator that is returned.
// bb is only read, so it is a const reference; this also allows the right
// operand to be a temporary (e.g. the result of another operator).
vector<BirchType> operator-(vector<BirchType>aa, const vector<BirchType>&bb){
	_ASSERTE(aa.size() == bb.size());
	for (size_t i = 0; i < aa.size(); i++)
		aa[i] -= bb[i];
	return aa;
}

// Scales every element of aa by k and returns the scaled copy.
// Each product is formed in double and converted back to BirchType when it
// is stored into the vector.
vector<BirchType> operator*(vector<BirchType>aa, double k){
	for (vector<BirchType>::iterator it = aa.begin(); it != aa.end(); ++it)
		*it = double(*it) * k;
	return aa;
}
// Multiplies every element of aa by the integer factor k and returns the
// scaled copy (scalar on the left-hand side).
vector<BirchType> operator*(int k, vector<BirchType>aa){
	vector<BirchType>::iterator cur = aa.begin();
	while (cur != aa.end())
	{
		*cur *= k;
		++cur;
	}
	return aa;
}
class birch
{
public:
	struct Attribute
	{
		unsigned int dim;
		vector<BirchType>data;
		Attribute(unsigned int d) :dim(d)
		{
			data.resize(dim);
		}
	};
	struct CF
	{
		unsigned int N;
		vector<BirchType> LS;
		vector<BirchType> SS;
		CF(unsigned int N,
			vector<BirchType> LS,
			vector<BirchType>SS) :N(N), LS(LS), SS(SS){}
		/*CF(CF& cc){//shallow copy is enough
			this->N = cc.N;
			this->LS = cc.LS;
			this->SS = cc.SS;
			}*/
		CF(unsigned int dim){
			N = 0;
			LS.resize(dim);
			SS.resize(dim);
		};
		CF(){};
	};

	struct Leaf;
	struct MinCluster
	{
		CF cf;
		Leaf*parent;
		MinCluster()
		{
			parent = NULL;
		}
		MinCluster(CF cf)
		{
			parent = NULL;
			this->cf = cf;
		}
	};

	struct Leaf
	{
		Leaf*pre, *next;//to make up a leaf-list.for Nonleaf,NULL
		Leaf*parent;
		vector<Leaf*>*child;//对Leaf而言为NULL
		vector<MinCluster>*cluster;//对NonLeaf而言为NULL
		CF cf;
		Leaf()
		{
			parent = pre = next = NULL;
			child = NULL;
			cluster = NULL;
		}
	};
	void generate_data(int num, int dim, vector<int>&span)
	{
		this->dim = dim;
		_ASSERTE(span.size() == dim);
		for (int i = 0; i < num; i++)
		{
			Attribute att(dim);
			for (int j = 0; j < dim; j++)
				att.data[j] = span[j] * double(rand()) / double(RAND_MAX + 1.0);
			dataset.push_back(att);
		}
	}
	vector<Attribute>dataset;

	int absorbnum;

public:
	birch(unsigned int b, unsigned int l, unsigned int t)
		:B(b), L(l), T(t){
		_ASSERTE(B > 2);
		_ASSERTE(L > 3);
		root = NULL;
		time_t tt;
		srand(time(&tt));
		absorbnum = 0;
	}
	~birch();
	void insert(Attribute att);


private:

	unsigned int B; //maximal num of child a Nonleaf will have
	unsigned int L;//maximal num of MinCluster a leaf will haveLeaf
	unsigned int T;// MinCluster的直径不能超过T
	Leaf*root;
	Leaf*head;//the head of the leaf-list at the bottom of the tree
	int dim;


private:
	inline double lengthofvec(vector<BirchType>&aa){
		double len = 0;
		for (int i = 0; i < aa.size(); i++)
			len += pow(aa[i], 2.0);
		return sqrt(len);
	}
	double sumofvec(vector<BirchType>&aa){
		double sum = 0;
		for (int i = 0; i < aa.size(); i++)
			sum += aa[i];
		return sum;
	}

	double cal_inter_cluster_dis(CF &cf1, CF &cf2);
	double cal_intra_cluster_dis();
	double merge_cluster_diameter(CF &cf1, CF &cf2);
	vector<BirchType>updateSS(vector<BirchType>&LS, vector<BirchType>&SS)
	{
		for (int i = 0; i < LS.size(); i++)
			SS[i] += pow(LS[i], 2.0);
		return SS;
	}
	CF updateCF(CF &c1, CF &c2)
	{
		return CF(c1.N + c2.N, c1.LS + c2.LS, c1.SS + c2.SS);
	}
	void updateCF(Leaf*leaf)
	{
		CF cf(dim);
		if (leaf->cluster != NULL)
		{

			for (int i = 0; i < leaf->cluster->size(); i++)
			{
				cf.N = cf.N + (*leaf->cluster)[i].cf.N;
				cf.LS = cf.LS + (*leaf->cluster)[i].cf.LS;
				cf.SS = cf.SS + (*leaf->cluster)[i].cf.SS;
			}
		}
		else if (leaf->child != NULL)
		{
			for (int i = 0; i < leaf->child->size(); i++)
			{
				cf.N = cf.N + (*leaf->child)[i]->cf.N;
				cf.LS = cf.LS + (*leaf->child)[i]->cf.LS;
				cf.SS = cf.SS + (*leaf->child)[i]->cf.SS;
			}
		}
		leaf->cf = cf;
	}

	MinCluster create_mincluster(Attribute att)
	{
		vector<BirchType>aa;
		aa.resize(att.dim);
		return MinCluster(CF(1, att.data, updateSS(att.data, aa)));
	}

	void insert(Leaf*close, bool &split, MinCluster &clu);



};

// Releases every node of the CF tree together with the cluster / child
// vectors the nodes own.
// The leaf list (`head`) is not used: every leaf is reachable from `root`,
// so a single tree walk frees everything, and an empty tree (no point was
// ever inserted) is handled by the NULL check.
birch::~birch()
{
	if (root == NULL)
		return;

	vector<Leaf*>pending;
	pending.push_back(root);
	while (!pending.empty())
	{
		Leaf*node = pending.back();
		pending.pop_back();
		// Queue the children before the vector holding them is destroyed.
		if (node->child != NULL)
			pending.insert(pending.end(), node->child->begin(), node->child->end());
		delete node->cluster;  // NULL for non-leaf nodes; delete NULL is a no-op
		delete node->child;    // NULL for leaf nodes
		delete node;
	}
	root = NULL;
	head = NULL;
}
/*double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
{
return sqrt(sumofvec(cf1.SS *(1.0 / double(cf1.N))
+ cf2.SS *(1.0 / double(cf1.N)) -
2 * cf1.LS*cf2.LS*(1.0 / double(cf1.N + cf2.N))));
}*/

// Root-mean-square distance between the points summarised by cf1 and the
// points summarised by cf2, computed from the clustering features alone:
//
//   sqrt( sum_d ( SS1[d]/N1 + SS2[d]/N2 - 2*LS1[d]*LS2[d]/(N1*N2) ) )
//
// All arithmetic is done in double so nothing is truncated to BirchType.
// Returns 0 when either CF is empty (N == 0), since there is no pair of
// points to measure.
// NOTE(review): the name says "diameter", but the expression this function
// has always had is the average inter-cluster distance; that shape is kept,
// with each SS term divided by its own N and the cross term by N1*N2.
// Confirm which measure callers expect before using it as the threshold test.
double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
{
	_ASSERTE(cf1.LS.size() == cf1.SS.size());
	_ASSERTE(cf2.LS.size() == cf1.LS.size() && cf2.SS.size() == cf1.SS.size());
	if (cf1.N == 0 || cf2.N == 0)
		return 0;

	const double n1 = double(cf1.N);
	const double n2 = double(cf2.N);
	double acc = 0;
	for (size_t i = 0; i < cf1.SS.size(); i++)
	{
		acc += double(cf1.SS[i]) / n1 + double(cf2.SS[i]) / n2
			- 2.0 * double(cf1.LS[i]) * double(cf2.LS[i]) / (n1 * n2);
	}
	// Guard against a tiny negative value caused by floating-point rounding.
	if (acc < 0)
		acc = 0;
	return sqrt(acc);
}


void birch::insert(Attribute att)
{
	if (root == NULL)
	{
		root = new Leaf;
		root->cluster = new vector < MinCluster > ;
		(*root->cluster).push_back(create_mincluster(att));
		root->cf = CF((*root->cluster)[0].cf);
		head = root;
		head->pre = NULL;
		head->next = NULL;
		return;
	}
	MinCluster clu = create_mincluster(att);
	Leaf*leaf = root;

	vector<int>path;

	while (leaf->cluster == NULL)
	{
		int k = -1;
		double mindis = 10000000000000;
		double dd;
		for (int i = 0; i < (*leaf->child).size(); i++)
		{
			double dis = cal_inter_cluster_dis(clu.cf, (*leaf->child)[i]->cf);
			if (dis < mindis)
			{
				mindis = dis;
				k = i;
			}
			dd = dis;
		}

		_ASSERTE(k >= 0);
		path.push_back(k);
		leaf = (*leaf->child)[k];
	}



	int k = -1;
	//mindis = 100000;
	double mindis = 100000;
	for (int i = 0; i < (*leaf->cluster).size(); i++)
	{
		double dis = cal_inter_cluster_dis(clu.cf, (*leaf->cluster)[i].cf);
		if (dis < mindis)
		{
			mindis = dis;
			k = i;
		}
		_ASSERTE(k >= 0);
	}
	//double ttt = merge_cluster_diameter(clu.cf, (*leaf->cluster)[k].cf);

	double ttt = cal_inter_cluster_dis(clu.cf, (*leaf->cluster)[k].cf);
	if (ttt < T)
	{
		//absorb
		(*leaf->cluster)[k].cf = updateCF((*leaf->cluster)[k].cf, clu.cf);
		absorbnum++;
	}
	else
	{
		(*leaf->cluster).push_back(clu);
	}
	//update CF value along the path
	Leaf*lea = root;
	(*lea).cf = updateCF((*lea).cf, clu.cf);
	for (int i = 0; i < path.size(); i++)
	{
		(*lea->child)[path[i]]->cf = updateCF((*lea->child)[path[i]]->cf, clu.cf);
		lea = (*lea->child)[path[i]];
	}

	if ((*leaf->cluster).size() > L)
	{
		double maxdis = 0;
		int th1 = -1;
		int th2 = -1;
		double**dismatrix = new double*[(*leaf->cluster).size()];
		for (int i = 0; i < (*leaf->cluster).size(); i++)
			dismatrix[i] = new double[(*leaf->cluster).size()];
		//找到距离最远的两个簇
		for (int i = 0; i < (*leaf->cluster).size() - 1; i++)
			for (int h = i + 1; h < (*leaf->cluster).size(); h++)
			{
				double dis = cal_inter_cluster_dis((*leaf->cluster)[i].cf, (*leaf->cluster)[h].cf);
				dismatrix[i][h] = dis;
				dismatrix[h][i] = dis;
				if (dis > maxdis)
				{
					maxdis = dis;
					th1 = i; th2 = h;
				}
			}
		Leaf*new_leaf = new Leaf;
		new_leaf->cluster = new vector < MinCluster > ;
		new_leaf->cluster->push_back((*leaf->cluster)[th2]);
		int len = (*leaf->cluster).size();
		(*leaf->cluster)[th2].parent = new_leaf;

		//根据各簇与两个新簇的距离分配到两个新簇中
		for (int i = 0; i < len; i++)
		{
			if (i == th1 || i == th2)
				continue;
			if (dismatrix[i][th2] < dismatrix[i][th1])
			{
				(*leaf->cluster)[i].parent = new_leaf;
				new_leaf->cluster->push_back((*leaf->cluster)[i]);

			}
		}
		for (int i = 0; i < (*leaf->cluster).size(); i++)
			delete[] dismatrix[i];
		delete[]dismatrix;

		vector < MinCluster >::iterator it, it1;
		it = (*leaf->cluster).begin();
		while (it != (*leaf->cluster).end())
		{
			if (it->parent == new_leaf)
				it = (*leaf->cluster).erase(it);
			else
			{
				it++;
			}
		}
		//不要忘了更新leaf和new_leaf的cf值
		updateCF(leaf);
		updateCF(new_leaf);
		//不要忘了将new_leaf加入到链表中
		Leaf*next = leaf->next;
		leaf->next = new_leaf;
		new_leaf->pre = leaf;
		new_leaf->next = next;
		if (next)
			next->pre = new_leaf;
		if (leaf->parent != NULL)
		{
			leaf->parent->child->push_back(new_leaf);
			new_leaf->parent = leaf->parent;
		}
		else//leaf is root,then a new root should be created
		{
			Leaf*new_root = new Leaf;
			new_root->child = new vector < Leaf* > ;
			new_root->child->push_back(leaf);
			new_root->child->push_back(new_leaf);
			leaf->parent = new_root;
			new_leaf->parent = new_root;
			updateCF(new_root);
			root = new_root;
			return;
		}
	}
	Leaf*cur = leaf->parent;
	while (cur != NULL&&cur->child->size() > B)
	{
		double maxdis = 0;
		int th1 = -1;
		int th2 = -1;
		double**dismatrix = new double*[cur->child->size()];
		for (int i = 0; i < cur->child->size(); i++)
			dismatrix[i] = new double[cur->child->size()];
		//找到距离最远的两个leaf
		for (int i = 0; i < cur->child->size() - 1; i++)
			for (int h = i + 1; h < cur->child->size(); h++)
			{
				double dis = cal_inter_cluster_dis((*cur->child)[i]->cf, (*cur->child)[h]->cf);
				dismatrix[i][h] = dis;
				dismatrix[h][i] = dis;
				if (dis > maxdis)
				{
					maxdis = dis;
					th1 = i; th2 = h;
				}
			}

		Leaf*new_leaf1 = new Leaf;
		new_leaf1->child = new vector < Leaf* > ;
		(*cur->child)[th2]->parent = new_leaf1;
		(*new_leaf1->child).push_back((*cur->child)[th2]);
		int len = (*cur->child).size();

		//rearrange other leaves to th1 th2 as their child
		for (int i = 0; i < len; i++)
		{
			if (i == th1 || i == th2)
				continue;
			if (dismatrix[i][th2] < dismatrix[i][th1])
			{
				(*cur->child)[i]->parent = new_leaf1;
				new_leaf1->child->push_back((*cur->child)[i]);

			}
		}
		for (int i = 0; i < (*cur->child).size(); i++)
			delete[] dismatrix[i];
		delete[]dismatrix;

		vector < Leaf* >::iterator it;
		it = (*cur->child).begin();
		while (it != (*cur->child).end())
		{
			if ((*it)->parent == new_leaf1)
				it = (*cur->child).erase(it);
			else
				it++;
		}
		//不要忘了更新cur和new_leaf1的cf值
		updateCF(cur);
		updateCF(new_leaf1);

		//if cur is root,then a new root should be created
		if (cur->parent == NULL)
		{
			Leaf*new_root = new Leaf;
			new_root->child = new vector < Leaf* > ;
			new_root->child->push_back(cur);
			new_root->child->push_back(new_leaf1);
			cur->parent = new_root;
			new_leaf1->parent = new_root;
			updateCF(new_root);
			root = new_root;
			return;
		}

		//cur is not root
		//不要忘了将new_leaf1加入cur的父亲节点的child
		cur->parent->child->push_back(new_leaf1);
		new_leaf1->parent = cur->parent;
		cur = cur->parent;
	}

}



//根据CF值计算簇间距离
/*double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
{
return sqrt(sumofvec((2 * (cf1.N + cf2.N)*(cf1.SS + cf2.SS)
- 2 * (cf1.LS + cf2.LS)*(cf1.LS + cf2.LS))*
(1.0 / double(cf1.N + cf2.N)*(cf1.N + cf2.N - 1))));
}*/

// Euclidean distance between the centroids of two clustering features.
// The centroid coordinate of a CF along an axis is LS[axis] / N; the first
// `dim` axes are taken into account.
double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
{
	double squared = 0;
	int axis = 0;
	while (axis < dim)
	{
		const double center1 = double(cf1.LS[axis]) / double(cf1.N);
		const double center2 = double(cf2.LS[axis]) / double(cf2.N);
		const double gap = center1 - center2;
		squared += gap*gap;
		++axis;
	}
	return sqrt(squared);
}




int _tmain(int argc, _TCHAR* argv[])
{
	//vector<int*>aa, bb;
	//int *p1 = new int;
	//int *p2 = new int;
	//int *p3 = new int;
	//*p1 = 8;
	//*p2 = 9;
	//*p3 = 88;
	//aa.push_back(p1);
	//aa.push_back(p2);
	//aa.push_back(p3);
	//*aa[2] = 999;
	//bb.push_back(p3);
	//vector<int*>::iterator it = aa.begin() + 1;
	////delete aa[0];
	//it = aa.erase(it);

	//cout << *bb[0] << endl;
	//cout << **it << endl;
	//for (it = aa.begin(); it != aa.end(); it++)
	//	cout << **it << endl;

	birch bir(5, 6, 20);
	int dim = 2;
	int num = 1000;
	vector<int>span;
	for (int i = 0; i < dim; i++)
		span.push_back(1000);
	bir.generate_data(num, dim, span);
	for (int i = 0; i < num; i++)
		bir.insert(bir.dataset[i]);
	cout << bir.absorbnum << endl;
	system("pause");
	return 0;
}


你可能感兴趣的:(birch聚类)