Hierarchical cluster算法介绍

  突然想记录几个聚类算法,由于实力有限就先介绍一下层次聚类算法(Hierarchical cluster algorithm),这个聚类算法思想简单,但实现起来感觉复杂度挺大;以前看过《集体智慧编程》里介绍过,里面是用python实现的,由于python里面的列表和字典用起来方便,故实现该算法还行;这里我用c++重新写了一下,感觉代码蛮臃肿,可能是自己的c++没有学习好吧!!!对于容器的使用还不够熟练,这里贴出来的目的是希望哪位大牛看到了指导一二,这里感激不尽。废话不多说了,进入正题吧!

************************************************************************************************************

Hierarchical cluster Algorithm的大致介绍

  层次聚类算法有两种实现思想,一种是初始时将每个待聚类的数据样本视为一个cluster,采用合并的方式,每次合并两个"距离"最近的cluster,直到合并成一个cluster为止(当然可以在达到自己设定想得到的cluster个数时终止迭代);另一种刚好与第一种相反,初始时将所有的数据样本视为一个cluster,采用分解的方式(这里没有实现就不说太多)。

************************************************************************************************************

算法的步骤及相关问题

  算法步骤:  (1)初始时,将每个数据样本视为一个cluster(选取一个度量两个cluster距离的方式),

       (2)计算任意两个cluster之间的距离;每次选取距离最小的两个cluster,

       (3)合并(2)中选择的两个cluster,将合并产生的新cluster加入cluster set中,并删除被合并的两个cluster,

       (4)重复(2)(3),直到cluster set中元素只剩下一个为止。

  相关问题: (1)度量两个cluster之间的距离,应该选择哪种距离???《集体智慧编程》中选择的是Pearson,当然也可以直接选用欧氏距离

        (2)如何合并两个cluster,即新的cluster对应的属性值如何表示???这里是用被合并的两个cluster的平均值表示新的cluster

******************************************************************************************************************

  1 /**

  2 ** Hierarchical cluster Algorithm

  3 ** step:(1)Firstly,regard each sample as a cluster, and

  4          (2)Each time merge two clusters if the distance between them is lowest.

  5          (3)then add the new cluster into cluster set, and delete two clusters merged from cluster set.

  6 ** method: (1)as to merging, here replace the old two clusters with their average;

  7            (2)measure the distance with the Pearson similarity.

  8 ** Time:2013/7/10 

  9 **/

 10 #include <iostream>

 11 #include <map>

 12 #include <vector>

 13 #include <string>

 14 #include <fstream> 

 15 #include <cstring>

 16 #include <sstream> 

 17 #include <cmath>

 18 #include <iterator>

 19 using namespace std;

 20 //cluster

 21 typedef    struct bicluster{

 22     vector<double> attri;//attribute

 23     int  cid;//cluster id 

 24 }Bicluster;

 25 //a pair

 26 typedef struct lowpair{

 27     int leftid;

 28     int rightid;

 29     double dist;

 30 }Lpair;

 31 

 32 /*****************************************************************

 33 ** convert string(char*) to double(or other type)

 34 ** here should be included <sstream> before using the stringstream

 35 ******************************************************************/

 36 double str2double(char* str){

 37     stringstream ss;

 38     ss << str;

 39     double tmp;

 40     ss >> tmp;

 41     return tmp;    

 42 }

 43 /*****************************************************************

 44 ** split the string containing some special tokens

 45 ******************************************************************/

 46 string split(string &str, vector<double>& dvec, const char* tok){

 47     char *pch = NULL;

 48     pch = strtok(const_cast<char*>(str.c_str()), tok);

 49     string stmp(pch);

 50     while( pch != NULL ){

 51         pch = strtok(NULL, tok);

 52         if( !pch )

 53             break;

 54         dvec.push_back(str2double(pch));

 55     }

 56     return stmp;

 57 }

 58 /******************************************************************

 59 ** read data from 'blogdata.txt'

 60 ** @is ------- a reference to ifstream object(input)

 61 ** @data ----- a map used to store the data (output)

 62 ******************************************************************/

 63 bool readfile(ifstream &is, map<string, vector<double> >& mydata){

 64     if( is.fail() ){

 65         cerr << "can't open the file !!!" << endl;

 66         return false;

 67     }

 68     //ignore the first line of file

 69     string str;

 70     getline(is, str);

 71     

 72     //store the data read from file into mydata 

 73     while( !is.eof() ){

 74         vector<double> dtmp;

 75         string tmp;

 76         getline(is, str);

 77         tmp = split(str, dtmp, "\t");

 78         mydata.insert(pair<string,vector<double> >(tmp, dtmp));

 79     }

 80     return true;         

 81 }

 82 /*****************************************************************

 83 ** compute the distance between two clusters

 84 ** Note that Pearson value devotes to the similarity between 

 85     two clusters, that is, the greater the Pearson value, the 

 86     lower the distance between them.

 87 *****************************************************************/ 

 88 double distPearson(vector<double>& left, vector<double>& right){

 89     double sum1 = 0;

 90     double sum2 = 0;

 91     int len = left.size();

 92     for(int i=0; i<len; ++i){

 93         sum1 += left[i];

 94         sum2 += right[i];

 95     }

 96     

 97     /**

 98     ** maybe you will feel it's complex, 

 99     **  and here we could replace Pearson with Euclidean distance

100     **/

101     double sum1Sq = 0;

102     double sum2Sq = 0;

103     for(int j=0; j<len; ++j){

104         sum1Sq += pow(left[j], 2);

105         sum2Sq += pow(right[j], 2);

106     }

107     

108     double pSum = 0, num, den;

109     for(int k=0; k<len; ++k)

110         pSum += left[k]*right[k];

111     num = pSum - sum1*sum2 / len;

112     den = sqrt((sum1Sq - pow(sum1,2)/len) * (sum1Sq - pow(sum2,2)/len));

113     if( den == 0 )

114         return 0;

115     return 1.0 - num/den;

116 }

117 /*************************************************************

118 ** Given two clusters, the distance between them 

119     should be checked whether it exists before compute it.

120 **************************************************************/

121 bool isExist(vector<Lpair> &lp, int leftid, int rightid, double &d){

122     vector<Lpair>::iterator it = lp.begin();

123     for(; it!=lp.end(); ++it){

124         if( (it->leftid==leftid) && (it->rightid==rightid) ){

125             d = it->dist;//if the distance has been computed, assign its value to d

126             return true;

127         }        

128     }

129     d = 0;

130     return false;

131 }

132 /*************************************************************

133 ** Given a cluster's id, delete the cluster from cluster set

134 **************************************************************/

135 void Del(vector<Bicluster> &cvec, int clusterid){

136     vector<Bicluster>::iterator it = cvec.begin();

137     for(; it!=cvec.end(); ++it){

138         if( it->cid == clusterid )

139             break;

140     }

141     cvec.erase(it);

142 } 

143 /*************************************************************

144 ** Hierarchical Cluster Algorithm

145 **************************************************************/

146 void HierarchicalCluster(map<string, vector<double> > &mydata){

147     vector<Lpair> distances;//used to store the distance

148      

149     //firstly,regard each sample as a cluster

150     vector<Bicluster> cvec;

151     map<string, vector<double> >::iterator it = mydata.begin();

152     int myid = 0;

153     for(; it!= mydata.end(); ++it){

154         Bicluster btmp;

155         btmp.attri = it->second;

156         btmp.cid = myid++;

157         cvec.push_back(btmp);

158     } 

159     myid = -1;

160     //search the pair

161     while( cvec.size()>1 ){

162         Lpair lowp;

163         double closedis = distPearson(cvec[0].attri,cvec[1].attri);

164         lowp.leftid = cvec[0].cid, lowp.rightid = cvec[1].cid;

165         lowp.dist = closedis;

166         

167         int leftps = 0, rightps = 1;

168         for(int ix=0; ix<cvec.size(); ++ix){

169             for(int iy=ix+1; iy<cvec.size(); ++iy){

170                 double d;

171                 int lid = cvec[ix].cid, rid = cvec[iy].cid;

172                 if( !isExist(distances,lid,rid,d) ){

173                     Lpair lptmp;

174                     lptmp.dist = distPearson(cvec[ix].attri, cvec[iy].attri);

175                     lptmp.leftid = lid;

176                     lptmp.rightid= rid;

177                     distances.push_back(lptmp);

178                     d = lptmp.dist;

179                   } 

180                  if( d < lowp.dist ){

181                      lowp.leftid = lid;

182                      lowp.rightid = rid;

183                      leftps = ix;

184                      rightps = iy;

185                      lowp.dist = d;

186                  }

187             }

188         }

189         //create a new cluster

190         Bicluster ncluster;

191         for(int i=0; i<cvec[0].attri.size(); ++i){

192             double av;

193             av = (cvec[leftps].attri[i] + cvec[rightps].attri[i]) / 2.0;

194             ncluster.attri.push_back(av);

195         }

196         ncluster.cid = myid--;//assign negative to the new cluster's id

197         cout << "leftid: " << lowp.leftid <<  ", rightid: " << lowp.rightid << endl;

198         //delete the pair

199         Del(cvec, lowp.leftid); 

200         Del(cvec, lowp.rightid);

201         cvec.push_back(ncluster);

202     } 

203 } 

204 int main()

205 {

206     ifstream is("blogdata.txt");

207     if( is.fail() ){

208         cerr << "error!!!" << endl;

209         exit(-1);

210     }

211     map<string, vector<double> > mydata;

212     if(readfile(is, mydata))

213         HierarchicalCluster(mydata);

214     return 0;

215 }

  代码写得有点乱且复杂,最后显示的结果不是树状图(python很容易实现),只是简单地显示了每次被合并的两个cluster的id。代码中用到的数据可以从http://kiwitobes.com/clusters/blog.txt下载得到(下载后需保存为程序读取的文件名blogdata.txt)。

你可能感兴趣的:(cluster)