参考百度百科http://baike.baidu.com/link?url=LDYen7bEqt8o2l5mUrnZjQk1topFi36-MwLuhjuGf-1z4sQFtFq1xCEe0TCJwYVjGbu0C6cpuVMFIxNglvSnoa
外加http://www.cnblogs.com/zhangchaoyang/articles/2200800.html
学习birch聚类最好有B-树的知识
结合了B-树的特性,birch算法适合于处理大数据。
原因是:
(1)CF 结构概括了簇的基本信息,并且是高度压缩的:它存储的聚类信息远少于实际的数据点。每个新添加的数据点不再作为个体保存,其信息被融入所属的簇中
(2)增量式的学习方法,不用一次将数据全部加载到内存,可以一边添加数据一边进行学习
下面是我的实现
// birch-cluster.cpp : entry point of the console application, plus a small
// in-memory implementation of the BIRCH clustering feature (CF) tree.
//
///************birch-cluster*************///
///******* author Marshall ********///
///******* 2015.9.18 ********///
///******* version 1.0 ********///
#ifdef _MSC_VER
#include "stdafx.h"
#include <tchar.h>
#else
// Stand-in so the Visual C++ style entry point at the bottom still compiles.
// On these toolchains _tmain is an ordinary function: a stand-alone build has
// to supply a main() that forwards to it.
typedef char _TCHAR;
#endif

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <vector>

// Scalar type of every coordinate, linear sum and square sum.
// NOTE: square sums are accumulated in this type as well, so with `int` the
// per-dimension sum of squared coordinates must stay below INT_MAX.
typedef int BirchType;

// ---------------------------------------------------------------------------
// Element-wise helpers on coordinate vectors.
// The right-hand vector is taken by const reference so that temporaries
// (e.g. `a + (b - c)`) are accepted by every conforming compiler.
// ---------------------------------------------------------------------------

/// Element-wise sum; both operands must have the same length.
std::vector<BirchType> operator+(std::vector<BirchType> aa, const std::vector<BirchType>& bb)
{
    assert(aa.size() == bb.size());
    for (std::size_t i = 0; i < aa.size(); ++i)
        aa[i] += bb[i];
    return aa;
}

/// Element-wise product; both operands must have the same length.
std::vector<BirchType> operator*(std::vector<BirchType> aa, const std::vector<BirchType>& bb)
{
    assert(aa.size() == bb.size());
    for (std::size_t i = 0; i < aa.size(); ++i)
        aa[i] *= bb[i];
    return aa;
}

/// Element-wise difference; both operands must have the same length.
std::vector<BirchType> operator-(std::vector<BirchType> aa, const std::vector<BirchType>& bb)
{
    assert(aa.size() == bb.size());
    for (std::size_t i = 0; i < aa.size(); ++i)
        aa[i] -= bb[i];
    return aa;
}

/// Scales every element by k; the result is converted back to BirchType
/// (i.e. truncated toward zero while BirchType is an integer type).
std::vector<BirchType> operator*(std::vector<BirchType> aa, double k)
{
    for (std::size_t i = 0; i < aa.size(); ++i)
        aa[i] = static_cast<BirchType>(static_cast<double>(aa[i]) * k);
    return aa;
}

/// Scales every element by the integer k.
std::vector<BirchType> operator*(int k, std::vector<BirchType> aa)
{
    for (std::size_t i = 0; i < aa.size(); ++i)
        aa[i] *= k;
    return aa;
}

/// Incremental BIRCH CF-tree.
///
/// Points are inserted one at a time. Each point is routed to the leaf whose
/// centroid is nearest, then either absorbed by the nearest sub-cluster of
/// that leaf or stored as a new sub-cluster. Leaves holding more than L
/// sub-clusters and inner nodes holding more than B children are split.
class birch
{
public:
    /// One data point.
    struct Attribute
    {
        unsigned int dim;             // number of coordinates
        std::vector<BirchType> data;  // the coordinates, zero-initialised
        Attribute(unsigned int d) : dim(d), data(d) {}
    };

    /// Clustering feature: summary of a set of points.
    struct CF
    {
        unsigned int N;             // number of summarised points
        std::vector<BirchType> LS;  // per-dimension sum of coordinates
        std::vector<BirchType> SS;  // per-dimension sum of squared coordinates
        CF(unsigned int N, std::vector<BirchType> LS, std::vector<BirchType> SS) : N(N), LS(LS), SS(SS) {}
        CF(unsigned int dim) : N(0), LS(dim), SS(dim) {}
        CF() : N(0) {}
    };

    struct Leaf;

    /// Sub-cluster stored inside a leaf.
    struct MinCluster
    {
        CF cf;
        Leaf* parent;  // leaf that currently stores this sub-cluster
        MinCluster() : parent(NULL) {}
        MinCluster(CF cf) : cf(cf), parent(NULL) {}
    };

    /// Tree node. The same struct is used for leaves and for inner nodes:
    /// exactly one of `child` / `cluster` is non-NULL.
    struct Leaf
    {
        Leaf* pre;   // previous leaf in the leaf list (NULL for inner nodes)
        Leaf* next;  // next leaf in the leaf list (NULL for inner nodes)
        Leaf* parent;
        std::vector<Leaf*>* child;         // NULL for a leaf
        std::vector<MinCluster>* cluster;  // NULL for an inner node
        CF cf;                             // summary of everything below this node
        Leaf() : pre(NULL), next(NULL), parent(NULL), child(NULL), cluster(NULL) {}
    };

    /// Appends `num` random points of dimension `dim` to `dataset`.
    /// Coordinate j is drawn from [0, span[j]) and converted to BirchType.
    /// `span` must contain exactly `dim` entries.
    void generate_data(int num, int dim, std::vector<int>& span)
    {
        assert(dim >= 0);
        assert(span.size() == static_cast<std::size_t>(dim));
        this->dim = dim;
        for (int i = 0; i < num; ++i)
        {
            Attribute att(dim);
            for (int j = 0; j < dim; ++j)
                att.data[j] = static_cast<BirchType>(span[j] * double(std::rand()) / double(RAND_MAX + 1.0));
            dataset.push_back(att);
        }
    }

    std::vector<Attribute> dataset;  // points produced by generate_data
    int absorbnum;                   // number of inserted points merged into an existing sub-cluster

public:
    /// @param b maximum number of children of an inner node (must be > 2)
    /// @param l maximum number of sub-clusters of a leaf (must be > 3)
    /// @param t absorb threshold: a point is merged into the nearest
    ///          sub-cluster when its distance to that centroid is below t
    /// Also seeds rand() from the current time.
    birch(unsigned int b, unsigned int l, unsigned int t)
        : absorbnum(0), B(b), L(l), T(t), root(NULL), head(NULL), dim(0)
    {
        assert(B > 2);
        assert(L > 3);
        std::srand(static_cast<unsigned int>(time(NULL)));
    }

    ~birch();

    /// Inserts one point into the tree. All inserted points must have the
    /// same number of coordinates as the first one.
    void insert(Attribute att);

    /// Number of points inserted so far (0 for an empty tree).
    unsigned int point_count() const
    {
        return root != NULL ? root->cf.N : 0u;
    }

    /// Number of sub-clusters currently stored in the leaves.
    std::size_t cluster_count() const
    {
        std::size_t total = 0;
        for (const Leaf* leaf = head; leaf != NULL; leaf = leaf->next)
            total += leaf->cluster->size();
        return total;
    }

private:
    // The tree owns raw node pointers, so copying would free them twice.
    birch(const birch&);
    birch& operator=(const birch&);

    unsigned int B;  // maximal number of children an inner node may have
    unsigned int L;  // maximal number of MinClusters a leaf may have
    unsigned int T;  // absorb threshold (see constructor)
    Leaf* root;
    Leaf* head;      // first leaf of the leaf list at the bottom of the tree
    int dim;         // dimension passed to the last generate_data call

private:
    /// Euclidean norm of a vector.
    inline double lengthofvec(const std::vector<BirchType>& aa) const
    {
        double len = 0;
        for (std::size_t i = 0; i < aa.size(); ++i)
            len += static_cast<double>(aa[i]) * static_cast<double>(aa[i]);
        return std::sqrt(len);
    }

    /// Sum of all elements of a vector.
    double sumofvec(const std::vector<BirchType>& aa) const
    {
        double sum = 0;
        for (std::size_t i = 0; i < aa.size(); ++i)
            sum += aa[i];
        return sum;
    }

    double cal_inter_cluster_dis(const CF& cf1, const CF& cf2) const;
    double cal_intra_cluster_dis();  // declared by the original author; no definition in this file
    double merge_cluster_diameter(CF& cf1, CF& cf2);

    /// Adds the squared coordinates of LS to SS in place and returns SS.
    std::vector<BirchType> updateSS(const std::vector<BirchType>& LS, std::vector<BirchType>& SS)
    {
        assert(LS.size() == SS.size());
        for (std::size_t i = 0; i < LS.size(); ++i)
            SS[i] += LS[i] * LS[i];
        return SS;
    }

    /// CF of the union of the two summarised point sets.
    CF updateCF(const CF& c1, const CF& c2)
    {
        return CF(c1.N + c2.N, c1.LS + c2.LS, c1.SS + c2.SS);
    }

    /// Recomputes node->cf from the node's sub-clusters (leaf) or children
    /// (inner node).
    void updateCF(Leaf* leaf)
    {
        CF total;
        bool seeded = false;  // the first entry fixes the vector length
        if (leaf->cluster != NULL)
        {
            for (std::size_t i = 0; i < leaf->cluster->size(); ++i)
            {
                const CF& part = (*leaf->cluster)[i].cf;
                total = seeded ? updateCF(total, part) : part;
                seeded = true;
            }
        }
        else if (leaf->child != NULL)
        {
            for (std::size_t i = 0; i < leaf->child->size(); ++i)
            {
                const CF& part = (*leaf->child)[i]->cf;
                total = seeded ? updateCF(total, part) : part;
                seeded = true;
            }
        }
        leaf->cf = total;
    }

    /// Wraps a single point into a one-element sub-cluster.
    MinCluster create_mincluster(const Attribute& att)
    {
        std::vector<BirchType> squares(att.data.size(), 0);
        return MinCluster(CF(1, att.data, updateSS(att.data, squares)));
    }

    /// Picks the two entries whose centroids are farthest apart and decides
    /// which entries follow the second one into a new node.
    /// @param cfs   CF of every entry of the overfull node (at least two)
    /// @param moves out: moves[i] != 0 when entry i goes to the new node
    /// @return index of the entry that seeds the new node
    std::size_t choose_split(const std::vector<const CF*>& cfs, std::vector<char>& moves) const
    {
        const std::size_t count = cfs.size();
        assert(count >= 2);

        // Start below any real distance so a pair is chosen even when all
        // centroids coincide.
        std::size_t keep_seed = 0;
        std::size_t move_seed = 1;
        double widest = -1.0;
        for (std::size_t i = 0; i + 1 < count; ++i)
            for (std::size_t h = i + 1; h < count; ++h)
            {
                const double dis = cal_inter_cluster_dis(*cfs[i], *cfs[h]);
                if (dis > widest)
                {
                    widest = dis;
                    keep_seed = i;
                    move_seed = h;
                }
            }

        moves.assign(count, 0);
        moves[move_seed] = 1;
        for (std::size_t i = 0; i < count; ++i)
        {
            if (i == keep_seed || i == move_seed)
                continue;
            // Ties stay with the original node.
            if (cal_inter_cluster_dis(*cfs[i], *cfs[move_seed]) < cal_inter_cluster_dis(*cfs[i], *cfs[keep_seed]))
                moves[i] = 1;
        }
        return move_seed;
    }

    /// Splits an overfull leaf; returns the new leaf, already linked into the
    /// leaf list right after `leaf` but not yet attached to a parent.
    Leaf* split_leaf(Leaf* leaf)
    {
        std::vector<MinCluster>& entries = *leaf->cluster;
        std::vector<const CF*> cfs;
        cfs.reserve(entries.size());
        for (std::size_t i = 0; i < entries.size(); ++i)
            cfs.push_back(&entries[i].cf);

        std::vector<char> moves;
        const std::size_t seed = choose_split(cfs, moves);

        Leaf* sibling = new Leaf;
        sibling->cluster = new std::vector<MinCluster>;
        std::vector<MinCluster> kept;
        sibling->cluster->push_back(entries[seed]);  // the seed always comes first
        for (std::size_t i = 0; i < entries.size(); ++i)
        {
            if (i == seed)
                continue;
            if (moves[i])
                sibling->cluster->push_back(entries[i]);
            else
                kept.push_back(entries[i]);
        }
        entries.swap(kept);

        for (std::size_t i = 0; i < entries.size(); ++i)
            entries[i].parent = leaf;
        for (std::size_t i = 0; i < sibling->cluster->size(); ++i)
            (*sibling->cluster)[i].parent = sibling;

        // Do not forget to refresh the CF of both halves.
        updateCF(leaf);
        updateCF(sibling);

        // Do not forget to add the new leaf to the leaf list.
        sibling->pre = leaf;
        sibling->next = leaf->next;
        if (leaf->next != NULL)
            leaf->next->pre = sibling;
        leaf->next = sibling;
        return sibling;
    }

    /// Splits an overfull inner node; returns the new node, not yet attached
    /// to a parent.
    Leaf* split_nonleaf(Leaf* node)
    {
        std::vector<Leaf*>& kids = *node->child;
        std::vector<const CF*> cfs;
        cfs.reserve(kids.size());
        for (std::size_t i = 0; i < kids.size(); ++i)
            cfs.push_back(&kids[i]->cf);

        std::vector<char> moves;
        const std::size_t seed = choose_split(cfs, moves);

        Leaf* sibling = new Leaf;
        sibling->child = new std::vector<Leaf*>;
        std::vector<Leaf*> kept;
        kids[seed]->parent = sibling;
        sibling->child->push_back(kids[seed]);  // the seed always comes first
        for (std::size_t i = 0; i < kids.size(); ++i)
        {
            if (i == seed)
                continue;
            if (moves[i])
            {
                kids[i]->parent = sibling;
                sibling->child->push_back(kids[i]);
            }
            else
                kept.push_back(kids[i]);
        }
        kids.swap(kept);

        // Do not forget to refresh the CF of both halves.
        updateCF(node);
        updateCF(sibling);
        return sibling;
    }

    /// Attaches `sibling` (freshly split off `node`) to node's parent.
    /// When `node` was the root, a new root holding both is created instead.
    /// @return true when a new root was created
    bool adopt_sibling(Leaf* node, Leaf* sibling)
    {
        if (node->parent != NULL)
        {
            node->parent->child->push_back(sibling);
            sibling->parent = node->parent;
            return false;
        }
        Leaf* new_root = new Leaf;
        new_root->child = new std::vector<Leaf*>;
        new_root->child->push_back(node);
        new_root->child->push_back(sibling);
        node->parent = new_root;
        sibling->parent = new_root;
        updateCF(new_root);
        root = new_root;
        return true;
    }

    void insert(Leaf* close, bool& split, MinCluster& clu);  // declared by the original author; no definition in this file
};

/// Releases every node of the tree together with its child / cluster list.
/// Safe on a tree that never received a point.
birch::~birch()
{
    if (root == NULL)
        return;
    std::vector<Leaf*> pending(1, root);
    while (!pending.empty())
    {
        Leaf* node = pending.back();
        pending.pop_back();
        if (node->child != NULL)
            pending.insert(pending.end(), node->child->begin(), node->child->end());
        delete node->child;
        delete node->cluster;
        delete node;
    }
}

/// Diameter (root of the average squared pairwise distance) of the cluster
/// obtained by merging cf1 and cf2:
///     sqrt((2 * N * sum(SS) - 2 * |LS|^2) / (N * (N - 1)))
/// with N, LS and SS taken from the merged CF. Returns 0 when the merged
/// cluster holds fewer than two points. Computed in double so that an integer
/// BirchType does not truncate intermediate results.
/// Currently not used by insert(), which compares centroid distances.
double birch::merge_cluster_diameter(CF& cf1, CF& cf2)
{
    assert(cf1.LS.size() == cf2.LS.size());
    const double total = static_cast<double>(cf1.N) + static_cast<double>(cf2.N);
    if (total < 2.0)
        return 0.0;

    double square_sum = 0.0;
    double linear_norm_sq = 0.0;
    for (std::size_t i = 0; i < cf1.LS.size(); ++i)
    {
        const double linear = static_cast<double>(cf1.LS[i]) + static_cast<double>(cf2.LS[i]);
        linear_norm_sq += linear * linear;
        square_sum += static_cast<double>(cf1.SS[i]) + static_cast<double>(cf2.SS[i]);
    }
    const double squared = (2.0 * total * square_sum - 2.0 * linear_norm_sq) / (total * (total - 1.0));
    return squared > 0.0 ? std::sqrt(squared) : 0.0;  // guards against rounding below zero
}

void birch::insert(Attribute att)
{
    MinCluster clu = create_mincluster(att);

    // First point: the root is a leaf holding a single sub-cluster.
    if (root == NULL)
    {
        root = new Leaf;
        root->cluster = new std::vector<MinCluster>;
        clu.parent = root;
        root->cluster->push_back(clu);
        root->cf = clu.cf;
        head = root;
        return;
    }
    assert(att.data.size() == root->cf.LS.size());

    // Walk down to a leaf, always following the child with the nearest
    // centroid (the first one wins on ties), remembering the nodes visited.
    std::vector<Leaf*> visited(1, root);
    Leaf* leaf = root;
    while (leaf->cluster == NULL)
    {
        const std::vector<Leaf*>& kids = *leaf->child;
        std::size_t best = 0;
        double best_dis = cal_inter_cluster_dis(clu.cf, kids[0]->cf);
        for (std::size_t i = 1; i < kids.size(); ++i)
        {
            const double dis = cal_inter_cluster_dis(clu.cf, kids[i]->cf);
            if (dis < best_dis)
            {
                best_dis = dis;
                best = i;
            }
        }
        leaf = kids[best];
        visited.push_back(leaf);
    }

    // Nearest sub-cluster inside the leaf. The search is seeded with the
    // first entry, so a winner exists no matter how large the distances are.
    std::vector<MinCluster>& entries = *leaf->cluster;
    std::size_t nearest = 0;
    double nearest_dis = cal_inter_cluster_dis(clu.cf, entries[0].cf);
    for (std::size_t i = 1; i < entries.size(); ++i)
    {
        const double dis = cal_inter_cluster_dis(clu.cf, entries[i].cf);
        if (dis < nearest_dis)
        {
            nearest_dis = dis;
            nearest = i;
        }
    }

    if (nearest_dis < T)
    {
        // absorb
        entries[nearest].cf = updateCF(entries[nearest].cf, clu.cf);
        absorbnum++;
    }
    else
    {
        clu.parent = leaf;
        entries.push_back(clu);
    }

    // update CF value along the path
    for (std::size_t i = 0; i < visited.size(); ++i)
        visited[i]->cf = updateCF(visited[i]->cf, clu.cf);

    // Split the leaf if it overflowed; a freshly created root ends the work.
    if (entries.size() > L && entries.size() >= 2)
    {
        Leaf* sibling = split_leaf(leaf);
        if (adopt_sibling(leaf, sibling))
            return;
    }

    // Propagate splits upwards while inner nodes hold too many children.
    for (Leaf* cur = leaf->parent;
         cur != NULL && cur->child->size() > B && cur->child->size() >= 2;
         cur = cur->parent)
    {
        Leaf* sibling = split_nonleaf(cur);
        if (adopt_sibling(cur, sibling))
            return;
    }
}

/// Distance between two clusters computed from their CF values: the
/// Euclidean distance between the centroids LS / N. Both CFs must summarise
/// at least one point and have the same dimension.
double birch::cal_inter_cluster_dis(const CF& cf1, const CF& cf2) const
{
    assert(cf1.LS.size() == cf2.LS.size());
    double dis = 0;
    for (std::size_t i = 0; i < cf1.LS.size(); ++i)
    {
        const double c1 = double(cf1.LS[i]) / double(cf1.N);
        const double c2 = double(cf2.LS[i]) / double(cf2.N);
        const double diff = c1 - c2;
        dis += diff * diff;
    }
    return std::sqrt(dis);
}

/// Demo driver: clusters 1000 random 2-D points drawn from [0, 1000)^2 and
/// prints how many of them were absorbed into existing sub-clusters.
int _tmain(int argc, _TCHAR* argv[])
{
    (void)argc;
    (void)argv;

    birch bir(5, 6, 20);
    const int dim = 2;
    const int num = 1000;
    std::vector<int> span(dim, 1000);
    bir.generate_data(num, dim, span);
    for (int i = 0; i < num; i++)
        bir.insert(bir.dataset[i]);
    std::cout << bir.absorbnum << std::endl;
    std::system("pause");
    return 0;
}