突然想记录几个聚类算法,由于实力有限就先介绍一下层次聚类算法(Hierarchical cluster algorithm),这个聚类算法思想简单,但实现起来感觉复杂度挺大;以前看过《集体智慧编程》里介绍过,里面是用python实现的,由于python里面的列表和字典用起来方便,故实现该算法还行;这里我用c++重新写了一下,感觉代码蛮臃肿,可能是自己的c++没有学习好吧!!!对于容器的使用还不够熟练,这里贴出来的目的是希望哪位大牛看到了指导一二,这里感激不尽。废话不多说了,进入正题吧!
************************************************************************************************************
Hierarchical cluster Algorithm的大致介绍
层次聚类算法有两种实现思想,一种是初始时将每个待聚类的数据样本视为一个cluster,采用合并的方式,每次合并两个"距离"最近的cluster,直到合并成一个cluster为止(当然可以在达到自己设定想得到的cluster个数时终止迭代);另一种刚好与第一种相反,初始时将所有的数据样本视为一个cluster,采用分解的方式(这里没有实现就不说太多)。
************************************************************************************************************
算法的步骤及相关问题
算法步骤: (1)初始时,将每个数据样本视为一个cluster(选取一个度量两个cluster距离的方式),
(2)计算任意两个cluster之间的距离;每次选取距离最小的两个cluster,
(3)合并(2)中选择的两个cluster,将合并产生的新cluster加入cluster set中,并删除被合并的两个cluster,
(4)重复(2)(3),直到cluster set中元素只剩下一个为止。
相关问题: (1)度量两个cluster之间的距离,应该选择哪种距离???《集体智慧编程》中选择的是Pearson,当然也可以直接选用欧氏距离
(2)如何合并两个cluster,即新的cluster对应的属性值如何表示???这里是用被合并的两个cluster的平均值表示新的cluster
******************************************************************************************************************
/**
 ** Hierarchical cluster Algorithm (agglomerative variant)
 ** step:   (1) Firstly, regard each sample as a cluster, and
 **         (2) each time merge the two clusters whose distance is lowest,
 **         (3) then add the new cluster into the cluster set and delete the
 **             two merged clusters from the cluster set.
 ** method: (1) as to merging, the new cluster is the element-wise average
 **             of the two old clusters;
 **         (2) the distance is measured with the Pearson similarity.
 ** Time:2013/7/10
 **/
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
using namespace std;

// A cluster: its attribute vector plus an id.
typedef struct bicluster{
    vector<double> attri; // attribute values
    int cid;              // cluster id (HierarchicalCluster gives input samples
                          // ids 0,1,2,... and merged clusters ids -1,-2,...)
}Bicluster;

// An ordered pair of cluster ids together with the distance between them.
typedef struct lowpair{
    int leftid;
    int rightid;
    double dist;
}Lpair;

/*****************************************************************
** Convert a C string to double using a stringstream.
** Returns 0 when str is NULL or does not start with a number.
******************************************************************/
double str2double(const char* str){
    double value = 0.0; // defined result even if extraction fails
    if( !str )
        return value;
    stringstream ss;
    ss << str;
    ss >> value;
    return value;
}

/*****************************************************************
** Split str on any of the delimiter characters in tok.
** The first token is returned as a string; every following token
** is converted with str2double and appended to dvec.
** Runs of delimiters count as a single separator (empty fields are
** skipped), exactly as strtok would treat them.
** str is not modified. If str holds no token at all, an empty
** string is returned and dvec is left untouched.
******************************************************************/
string split(const string &str, vector<double>& dvec, const char* tok){
    const string delims(tok ? tok : "");
    string first;
    bool haveFirst = false;

    string::size_type pos = str.find_first_not_of(delims);
    while( pos != string::npos ){
        const string::size_type stop = str.find_first_of(delims, pos);
        const string token =
            str.substr(pos, stop == string::npos ? string::npos : stop - pos);
        if( !haveFirst ){
            first = token;
            haveFirst = true;
        }else{
            dvec.push_back(str2double(token.c_str()));
        }
        pos = (stop == string::npos) ? string::npos
                                     : str.find_first_not_of(delims, stop);
    }
    return first;
}

/******************************************************************
** Read tab-separated data from an already opened stream.
** @is ------- a reference to ifstream object (input)
** @mydata --- a map from the first column of each line to the
**             numeric values of the remaining columns (output)
** The first line of the file is skipped. Lines without any token
** are ignored. Returns false if the stream is in a failed state.
******************************************************************/
bool readfile(ifstream &is, map<string, vector<double> >& mydata){
    if( is.fail() ){
        cerr << "can't open the file !!!" << endl;
        return false;
    }
    // ignore the first line of file
    string line;
    getline(is, line);

    // Loop on the result of getline itself: testing eof() before reading
    // would process one extra, empty line after the last record.
    while( getline(is, line) ){
        // tolerate CRLF line endings
        if( !line.empty() && line[line.size() - 1] == '\r' )
            line.erase(line.size() - 1);

        vector<double> values;
        const string name = split(line, values, "\t");
        if( name.empty() )
            continue; // blank line
        mydata.insert(make_pair(name, values));
    }
    return true;
}

/*****************************************************************
** Compute the distance between two clusters as 1 - Pearson r.
** Note that the Pearson value measures similarity, that is, the
** greater the Pearson value, the lower the distance.
** Only the common prefix of the two vectors is used when their
** lengths differ. Returns 0 when there is nothing to compare or
** when either vector has zero variance (denominator is zero).
*****************************************************************/
double distPearson(const vector<double>& left, const vector<double>& right){
    const vector<double>::size_type len = min(left.size(), right.size());
    if( len == 0 )
        return 0;

    double sum1 = 0, sum2 = 0;     // plain sums
    double sum1Sq = 0, sum2Sq = 0; // sums of squares
    double pSum = 0;               // sum of products
    for(vector<double>::size_type i = 0; i < len; ++i){
        const double l = left[i];
        const double r = right[i];
        sum1 += l;
        sum2 += r;
        sum1Sq += l * l;
        sum2Sq += r * r;
        pSum += l * r;
    }

    const double n = static_cast<double>(len);
    const double num = pSum - sum1 * sum2 / n;
    // Each factor must use its own vector's sum of squares.
    const double spread = (sum1Sq - sum1 * sum1 / n) * (sum2Sq - sum2 * sum2 / n);
    // Also rejects a tiny negative product caused by rounding, and NaN.
    if( !(spread > 0) )
        return 0;
    return 1.0 - num / sqrt(spread);
}

/*************************************************************
** Look up a previously computed distance for the ordered pair
** (leftid, rightid) in lp.
** On a hit, d receives the stored distance and true is returned;
** otherwise d is set to 0 and false is returned.
** HierarchicalCluster below keeps its own map-based cache; this
** function is retained for callers that use a vector<Lpair>.
**************************************************************/
bool isExist(const vector<Lpair> &lp, int leftid, int rightid, double &d){
    for(vector<Lpair>::const_iterator it = lp.begin(); it != lp.end(); ++it){
        if( (it->leftid == leftid) && (it->rightid == rightid) ){
            d = it->dist;
            return true;
        }
    }
    d = 0;
    return false;
}

/*************************************************************
** Given a cluster's id, delete the first cluster with that id
** from the cluster set. Does nothing if the id is not present.
**************************************************************/
void Del(vector<Bicluster> &cvec, int clusterid){
    for(vector<Bicluster>::iterator it = cvec.begin(); it != cvec.end(); ++it){
        if( it->cid == clusterid ){
            cvec.erase(it);
            return;
        }
    }
}

// Cache of already computed distances, keyed by (left id, right id).
typedef map<pair<int, int>, double> DistCache;

// Return the distance between a and b, computing and caching it on first use.
static double cachedDistance(DistCache &cache, const Bicluster &a, const Bicluster &b){
    const pair<int, int> key(a.cid, b.cid);
    const DistCache::iterator hit = cache.find(key);
    if( hit != cache.end() )
        return hit->second;
    const double d = distPearson(a.attri, b.attri);
    cache.insert(make_pair(key, d));
    return d;
}

/*************************************************************
** Hierarchical Cluster Algorithm
** Repeatedly merges the two closest clusters until one is left
** and prints the ids of each merged pair to standard output.
** Input samples get ids 0,1,2,... in map order; every merged
** cluster gets the next negative id (-1,-2,...).
** Ties are resolved in favour of the first pair encountered.
**************************************************************/
void HierarchicalCluster(const map<string, vector<double> > &mydata){
    typedef vector<Bicluster>::size_type Index;

    // firstly, regard each sample as a cluster
    vector<Bicluster> cvec;
    cvec.reserve(mydata.size());
    int myid = 0;
    for(map<string, vector<double> >::const_iterator it = mydata.begin();
        it != mydata.end(); ++it){
        Bicluster sample;
        sample.attri = it->second;
        sample.cid = myid++;
        cvec.push_back(sample);
    }

    DistCache cache;
    myid = -1;
    while( cvec.size() > 1 ){
        // search the closest pair, starting from (0, 1)
        Index leftps = 0, rightps = 1;
        double best = cachedDistance(cache, cvec[0], cvec[1]);
        for(Index ix = 0; ix < cvec.size(); ++ix){
            for(Index iy = ix + 1; iy < cvec.size(); ++iy){
                const double d = cachedDistance(cache, cvec[ix], cvec[iy]);
                if( d < best ){
                    best = d;
                    leftps = ix;
                    rightps = iy;
                }
            }
        }

        // create a new cluster as the average of the pair
        Bicluster ncluster;
        const vector<double> &la = cvec[leftps].attri;
        const vector<double> &ra = cvec[rightps].attri;
        const vector<double>::size_type width = min(la.size(), ra.size());
        ncluster.attri.reserve(width);
        for(vector<double>::size_type i = 0; i < width; ++i)
            ncluster.attri.push_back((la[i] + ra[i]) / 2.0);
        ncluster.cid = myid--; // assign negative to the new cluster's id

        // Copy the ids first: Del invalidates references into cvec.
        const int leftid = cvec[leftps].cid;
        const int rightid = cvec[rightps].cid;
        cout << "leftid: " << leftid << ", rightid: " << rightid << '\n';

        // delete the pair
        Del(cvec, leftid);
        Del(cvec, rightid);
        cvec.push_back(ncluster);
    }
}

// Define HCLUSTER_NO_MAIN to compile the functions above without the
// command-line entry point (e.g. when linking them into a test harness).
#ifndef HCLUSTER_NO_MAIN
// Usage: program [datafile]   (datafile defaults to "blogdata.txt")
int main(int argc, char* argv[])
{
    const char* path = (argc > 1) ? argv[1] : "blogdata.txt";
    ifstream is(path);
    if( is.fail() ){
        cerr << "error!!!" << endl;
        return -1;
    }
    map<string, vector<double> > mydata;
    if( readfile(is, mydata) )
        HierarchicalCluster(mydata);
    return 0;
}
#endif
代码写得有点乱且复杂,最后显示的结果不是树状图(用python很容易实现),只是简单地显示了每次被合并的两个cluster的id。代码中用到的数据可以从 http://kiwitobes.com/clusters/blog.txt 下载得到。