一、引言
K-Means算法是聚类算法中,应用最为广泛的一种。本文基于欧几里得距离公式:d = sqrt((x1-x2)^+(y1-y2)^)计算二维向量间的距离,作为聚类划分的依据,输入数据为二维数据两列数据,输出结果为聚类中心和元素划分结果。输入数据格式如下:
1 18 2 2 3 2 4 0.0 0.0 5 1.0 0.0 6 0.0 1.0 7 2.0 1.0 8 1.0 2.0 9 2.0 2.0 10 2.0 0.0 11 0.0 2.0 12 7.0 6.0 13 7.0 7.0 14 7.0 8.0 15 8.0 6.0 16 8.0 7.0 17 8.0 8.0 18 8.0 9.0 19 9.0 7.0 20 9.0 8.0 21 9.0 9.0 22
二、欧几里得距离:
1 /**************************************************************************** 2 * * 3 * KMEANS * 4 * * 5 *****************************************************************************/ 6 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <string.h> 10 #include <conio.h> 11 #include <math.h> 12 13 // FUNCTION PROTOTYPES 14 15 16 // DEFINES 17 #define SUCCESS 1 18 #define FAILURE 0 19 #define TRUE 1 20 #define FALSE 0 21 #define MAXVECTDIM 20 22 #define MAXPATTERN 20 23 #define MAXCLUSTER 10 24 25 26 27 28 29 char *f2a(double x, int width){ 30 char cbuf[255]; 31 char *cp; 32 int i,k; 33 int d,s; 34 cp=fcvt(x,width,&d,&s); 35 if (s) { 36 strcpy(cbuf,"-"); 37 } 38 else { 39 strcpy(cbuf," "); 40 } /* endif */ 41 if (d>0) { 42 for (i=0; i<d; i++) { 43 cbuf[i+1]=cp[i]; 44 } /* endfor */ 45 cbuf[d+1]=0; 46 cp+=d; 47 strcat(cbuf,"."); 48 strcat(cbuf,cp); 49 } else { 50 if (d==0) { 51 strcat(cbuf,"."); 52 strcat(cbuf,cp); 53 } 54 else { 55 k=-d; 56 strcat(cbuf,"."); 57 for (i=0; i<k; i++) { 58 strcat(cbuf,"0"); 59 } /* endfor */ 60 strcat(cbuf,cp); 61 } /* endif */ 62 } /* endif */ 63 cp=&cbuf[0]; 64 return cp; 65 } 66 67 68 69 70 // ***** Defined structures & classes ***** 71 struct aCluster { 72 double Center[MAXVECTDIM]; 73 int Member[MAXPATTERN]; //Index of Vectors belonging to this cluster 74 int NumMembers; 75 }; 76 77 struct aVector { 78 double Center[MAXVECTDIM]; 79 int Size; 80 }; 81 82 class System { 83 private: 84 double Pattern[MAXPATTERN][MAXVECTDIM+1]; 85 aCluster Cluster[MAXCLUSTER]; 86 int NumPatterns; // Number of patterns 87 int SizeVector; // Number of dimensions in vector 88 int NumClusters; // Number of clusters 89 void DistributeSamples(); // Step 2 of K-means algorithm 90 int CalcNewClustCenters();// Step 3 of K-means algorithm 91 double EucNorm(int, int); // Calc Euclidean norm vector 92 int FindClosestCluster(int); //ret indx of clust closest to pattern 93 //whose index is arg 94 public: 95 void system(); 96 int LoadPatterns(char *fname); // Get pattern data to be clustered 97 void InitClusters(); // Step 1 of K-means algorithm 98 void RunKMeans(); // Overall control K-means process 99 void ShowClusters(); // Show results on screen 100 void SaveClusters(char *fname); // Save results to file 101 void ShowCenters(); 102 }; 103 //输出聚类中心 104 void System::ShowCenters(){ 105 int i,j; 106 printf("Cluster centers:\n"); 107 for (i=0; i<NumClusters; i++) { 108 Cluster[i].Member[0]=i; 109 printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]); 110 } /* endfor */ 111 printf("\n"); 112 getchar(); 113 } 114 115 //读取文件 116 int System::LoadPatterns(char *fname) 117 { 118 FILE *InFilePtr; 119 int i,j; 120 double x; 121 if((InFilePtr = fopen(fname, "r")) == NULL) 122 return FAILURE; 123 fscanf(InFilePtr, "%d", &NumPatterns); // Read # of patterns 18数据量 124 fscanf(InFilePtr, "%d", &SizeVector); // Read dimension of vector 2维度 125 fscanf(InFilePtr, "%d", &NumClusters); // Read # of clusters for K-Means 2簇 126 for (i=0; i<NumPatterns; i++) { // For each vector 127 for (j=0; j<SizeVector; j++) { // create a pattern 128 fscanf(InFilePtr,"%lg",&x); // consisting of all elements 129 Pattern[i][j]=x; 130 } /* endfor */ 131 } /* endfor */ 132 //输出所有数据元素 133 printf("Input patterns:\n"); 134 for (i=0; i<NumPatterns; i++) { 135 printf("Pattern[%d]=(%2.3f,%2.3f)\n",i,Pattern[i][0],Pattern[i][1]); 136 } /* endfor */ 137 printf("\n--------------------\n"); 138 getchar(); 139 return SUCCESS; 140 } 141 //*************************************************************************** 142 // InitClusters * 143 // Arbitrarily assign a vector to each of the K clusters * 144 // We choose the first K vectors to do this * 145 //*************************************************************************** 146 //初始化聚类中心 147 void System::InitClusters(){ 148 int i,j; 149 printf("Initial cluster centers:\n"); 150 for (i=0; i<NumClusters; i++) { 151 Cluster[i].Member[0]=i; 152 for (j=0; j<SizeVector; j++) { 153 Cluster[i].Center[j]=Pattern[i][j]; 154 } /* endfor */ 155 } /* endfor */ 156 for (i=0; i<NumClusters; i++) { 157 printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]); //untransplant 158 } /* endfor */ 159 printf("\n"); 160 getchar(); 161 } 162 //运行KMeans 163 void System::RunKMeans(){ 164 int converged; 165 int pass; 166 pass=1; 167 converged=FALSE; 168 //第N次聚类 169 while (converged==FALSE) { 170 printf("PASS=%d\n",pass++); 171 DistributeSamples(); 172 converged=CalcNewClustCenters(); 173 ShowCenters(); 174 getchar(); 175 } /* endwhile */ 176 } 177 //在二维和三维空间中的欧式距离的就是两点之间的距离,二维的公式是 178 //d = sqrt((x1-x2)^+(y1-y2)^) 179 //通过这种运算,就可以把所有列的属性都纳入进来 180 double System::EucNorm(int p, int c){ // Calc Euclidean norm of vector difference 181 double dist,x; // between pattern vector, p, and cluster 182 int i; // center, c. 183 char zout[128]; 184 char znum[40]; 185 char *pnum; 186 // 187 pnum=&znum[0]; 188 strcpy(zout,"d=sqrt("); 189 printf("The distance from pattern %d to cluster %d is calculated as:\n",p,c); 190 dist=0; 191 for (i=0; i<SizeVector ;i++){ 192 //拼写字符串 193 x=(Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]); 194 strcat(zout,f2a(x,4)); 195 if (i==0) 196 strcat(zout,"+"); 197 //计算距离 198 dist += (Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]); 199 } /* endfor */ 200 printf("%s)\n",zout); 201 return dist; 202 } 203 //查找最近的群集 204 int System::FindClosestCluster(int pat){ 205 int i, ClustID; 206 double MinDist, d; 207 MinDist =9.9e+99; 208 ClustID=-1; 209 for (i=0; i<NumClusters; i++) { 210 d=EucNorm(pat,i); 211 printf("Distance from pattern %d to cluster %d is %f\n\n",pat,i,sqrt(d)); 212 if (d<MinDist) { 213 MinDist=d; 214 ClustID=i; 215 } /* endif */ 216 } /* endfor */ 217 if (ClustID<0) { 218 printf("Aaargh"); 219 exit(0); 220 } /* endif */ 221 return ClustID; 222 } 223 // 224 void System::DistributeSamples(){ 225 int i,pat,Clustid,MemberIndex; 226 //Clear membership list for all current clusters 227 for (i=0; i<NumClusters;i++){ 228 Cluster[i].NumMembers=0; 229 } 230 for (pat=0; pat<NumPatterns; pat++) { 231 //Find cluster center to which the pattern is closest 232 Clustid= FindClosestCluster(pat);//查找最近的聚类中心 233 printf("patern %d assigned to cluster %d\n\n",pat,Clustid); 234 //post this pattern to the cluster 235 MemberIndex=Cluster[Clustid].NumMembers; 236 Cluster[Clustid].Member[MemberIndex]=pat; 237 Cluster[Clustid].NumMembers++; 238 } /* endfor */ 239 } 240 //计算新的群集中心 241 int System::CalcNewClustCenters(){ 242 int ConvFlag,VectID,i,j,k; 243 double tmp[MAXVECTDIM]; 244 char xs[255]; 245 char ys[255]; 246 char nc1[20]; 247 char nc2[20]; 248 char *pnc1; 249 char *pnc2; 250 char *fpv; 251 252 pnc1=&nc1[0]; 253 pnc2=&nc2[0]; 254 ConvFlag=TRUE; 255 printf("The new cluster centers are now calculated as:\n"); 256 for (i=0; i<NumClusters; i++) { //for each cluster 257 pnc1=itoa(Cluster[i].NumMembers,nc1,10); 258 pnc2=itoa(i,nc2,10); 259 strcpy(xs,"Cluster Center"); 260 strcat(xs,nc2); 261 strcat(xs,"(1/"); 262 strcpy(ys,"(1/"); 263 strcat(xs,nc1); 264 strcat(ys,nc1); 265 strcat(xs,")("); 266 strcat(ys,")("); 267 for (j=0; j<SizeVector; j++) { // clear workspace 268 tmp[j]=0.0; 269 } /* endfor */ 270 for (j=0; j<Cluster[i].NumMembers; j++) { //traverse member vectors 271 VectID=Cluster[i].Member[j]; 272 for (k=0; k<SizeVector; k++) { //traverse elements of vector 273 tmp[k] += Pattern[VectID][k]; // add (member) pattern elmnt into temp 274 if (k==0) { 275 strcat(xs,f2a(Pattern[VectID][k],3)); 276 } else { 277 strcat(ys,f2a(Pattern[VectID][k],3)); 278 } /* endif */ 279 } /* endfor */ 280 if(j<Cluster[i].NumMembers-1){ 281 strcat(xs,"+"); 282 strcat(ys,"+"); 283 } 284 else { 285 strcat(xs,")"); 286 strcat(ys,")"); 287 } 288 } /* endfor */ 289 for (k=0; k<SizeVector; k++) { //traverse elements of vector 290 tmp[k]=tmp[k]/Cluster[i].NumMembers; 291 if (tmp[k] != Cluster[i].Center[k]) 292 ConvFlag=FALSE; 293 Cluster[i].Center[k]=tmp[k]; 294 } /* endfor */ 295 printf("%s,\n",xs); 296 printf("%s\n",ys); 297 } /* endfor */ 298 return ConvFlag; 299 } 300 //输出聚类 301 void System::ShowClusters(){ 302 int cl; 303 for (cl=0; cl<NumClusters; cl++) { 304 printf("\nCLUSTER %d ==>[%f,%f]\n", cl,Cluster[cl].Center[0],Cluster[cl].Center[1]); 305 } /* endfor */ 306 } 307 308 void System::SaveClusters(char *fname){ 309 }
四、主调程序
1 void main(int argc, char *argv[]) 2 { 3 4 System kmeans; 5 /* 6 if (argc<2) { 7 printf("USAGE: KMEANS PATTERN_FILE\n"); 8 exit(0); 9 }*/ 10 if (kmeans.LoadPatterns("KM2.DAT")==FAILURE ){ 11 printf("UNABLE TO READ PATTERN_FILE:%s\n",argv[1]); 12 exit(0); 13 } 14 15 kmeans.InitClusters(); 16 kmeans.RunKMeans(); 17 kmeans.ShowClusters(); 18 }
五、输出结果
1 Input patterns: 2 Pattern[0]=(0.000,0.000) 3 Pattern[1]=(1.000,0.000) 4 Pattern[2]=(0.000,1.000) 5 Pattern[3]=(2.000,1.000) 6 Pattern[4]=(1.000,2.000) 7 Pattern[5]=(2.000,2.000) 8 Pattern[6]=(2.000,0.000) 9 Pattern[7]=(0.000,2.000) 10 Pattern[8]=(7.000,6.000) 11 Pattern[9]=(7.000,7.000) 12 Pattern[10]=(7.000,8.000) 13 Pattern[11]=(8.000,6.000) 14 Pattern[12]=(8.000,7.000) 15 Pattern[13]=(8.000,8.000) 16 Pattern[14]=(8.000,9.000) 17 Pattern[15]=(9.000,7.000) 18 Pattern[16]=(9.000,8.000) 19 Pattern[17]=(9.000,9.000) 20 21 -------------------- 22 23 Initial cluster centers: 24 ClusterCenter[0]=(0.000000,0.000000) 25 ClusterCenter[1]=(1.000000,0.000000) 26 27 28 PASS=1 29 The distance from pattern 0 to cluster 0 is calculated as: 30 d=sqrt( .0000+ .0000) 31 Distance from pattern 0 to cluster 0 is 0.000000 32 33 The distance from pattern 0 to cluster 1 is calculated as: 34 d=sqrt( 1.0000+ .0000) 35 Distance from pattern 0 to cluster 1 is 1.000000 36 37 patern 0 assigned to cluster 0 38 39 The distance from pattern 1 to cluster 0 is calculated as: 40 d=sqrt( 1.0000+ .0000) 41 Distance from pattern 1 to cluster 0 is 1.000000 42 43 The distance from pattern 1 to cluster 1 is calculated as: 44 d=sqrt( .0000+ .0000) 45 Distance from pattern 1 to cluster 1 is 0.000000 46 47 patern 1 assigned to cluster 1 48 49 The distance from pattern 2 to cluster 0 is calculated as: 50 d=sqrt( .0000+ 1.0000) 51 Distance from pattern 2 to cluster 0 is 1.000000 52 53 The distance from pattern 2 to cluster 1 is calculated as: 54 d=sqrt( 1.0000+ 1.0000) 55 Distance from pattern 2 to cluster 1 is 1.414214 56 57 patern 2 assigned to cluster 0 58 59 The distance from pattern 3 to cluster 0 is calculated as: 60 d=sqrt( 4.0000+ 1.0000) 61 Distance from pattern 3 to cluster 0 is 2.236068 62 63 The distance from pattern 3 to cluster 1 is calculated as: 64 d=sqrt( 1.0000+ 1.0000) 65 Distance from pattern 3 to cluster 1 is 1.414214 66 67 patern 3 assigned to cluster 1 68 69 The distance from pattern 4 to cluster 0 is calculated as: 70 d=sqrt( 1.0000+ 4.0000) 71 Distance from pattern 4 to cluster 0 is 2.236068 72 73 The distance from pattern 4 to cluster 1 is calculated as: 74 d=sqrt( .0000+ 4.0000) 75 Distance from pattern 4 to cluster 1 is 2.000000 76 77 patern 4 assigned to cluster 1 78 79 The distance from pattern 5 to cluster 0 is calculated as: 80 d=sqrt( 4.0000+ 4.0000) 81 Distance from pattern 5 to cluster 0 is 2.828427 82 83 The distance from pattern 5 to cluster 1 is calculated as: 84 d=sqrt( 1.0000+ 4.0000) 85 Distance from pattern 5 to cluster 1 is 2.236068 86 87 patern 5 assigned to cluster 1 88 89 The distance from pattern 6 to cluster 0 is calculated as: 90 d=sqrt( 4.0000+ .0000) 91 Distance from pattern 6 to cluster 0 is 2.000000 92 93 The distance from pattern 6 to cluster 1 is calculated as: 94 d=sqrt( 1.0000+ .0000) 95 Distance from pattern 6 to cluster 1 is 1.000000 96 97 patern 6 assigned to cluster 1 98 99 The distance from pattern 7 to cluster 0 is calculated as: 100 d=sqrt( .0000+ 4.0000) 101 Distance from pattern 7 to cluster 0 is 2.000000 102 103 The distance from pattern 7 to cluster 1 is calculated as: 104 d=sqrt( 1.0000+ 4.0000) 105 Distance from pattern 7 to cluster 1 is 2.236068 106 107 patern 7 assigned to cluster 0 108 109 The distance from pattern 8 to cluster 0 is calculated as: 110 d=sqrt( 49.0000+ 36.0000) 111 Distance from pattern 8 to cluster 0 is 9.219544 112 113 The distance from pattern 8 to cluster 1 is calculated as: 114 d=sqrt( 36.0000+ 36.0000) 115 Distance from pattern 8 to cluster 1 is 8.485281 116 117 patern 8 assigned to cluster 1 118 119 The distance from pattern 9 to cluster 0 is calculated as: 120 d=sqrt( 49.0000+ 49.0000) 121 Distance from pattern 9 to cluster 0 is 9.899495 122 123 The distance from pattern 9 to cluster 1 is calculated as: 124 d=sqrt( 36.0000+ 49.0000) 125 Distance from pattern 9 to cluster 1 is 9.219544 126 127 patern 9 assigned to cluster 1 128 129 The distance from pattern 10 to cluster 0 is calculated as: 130 d=sqrt( 49.0000+ 64.0000) 131 Distance from pattern 10 to cluster 0 is 10.630146 132 133 The distance from pattern 10 to cluster 1 is calculated as: 134 d=sqrt( 36.0000+ 64.0000) 135 Distance from pattern 10 to cluster 1 is 10.000000 136 137 patern 10 assigned to cluster 1 138 139 The distance from pattern 11 to cluster 0 is calculated as: 140 d=sqrt( 64.0000+ 36.0000) 141 Distance from pattern 11 to cluster 0 is 10.000000 142 143 The distance from pattern 11 to cluster 1 is calculated as: 144 d=sqrt( 49.0000+ 36.0000) 145 Distance from pattern 11 to cluster 1 is 9.219544 146 147 patern 11 assigned to cluster 1 148 149 The distance from pattern 12 to cluster 0 is calculated as: 150 d=sqrt( 64.0000+ 49.0000) 151 Distance from pattern 12 to cluster 0 is 10.630146 152 153 The distance from pattern 12 to cluster 1 is calculated as: 154 d=sqrt( 49.0000+ 49.0000) 155 Distance from pattern 12 to cluster 1 is 9.899495 156 157 patern 12 assigned to cluster 1 158 159 The distance from pattern 13 to cluster 0 is calculated as: 160 d=sqrt( 64.0000+ 64.0000) 161 Distance from pattern 13 to cluster 0 is 11.313708 162 163 The distance from pattern 13 to cluster 1 is calculated as: 164 d=sqrt( 49.0000+ 64.0000) 165 Distance from pattern 13 to cluster 1 is 10.630146 166 167 patern 13 assigned to cluster 1 168 169 The distance from pattern 14 to cluster 0 is calculated as: 170 d=sqrt( 64.0000+ 81.0000) 171 Distance from pattern 14 to cluster 0 is 12.041595 172 173 The distance from pattern 14 to cluster 1 is calculated as: 174 d=sqrt( 49.0000+ 81.0000) 175 Distance from pattern 14 to cluster 1 is 11.401754 176 177 patern 14 assigned to cluster 1 178 179 The distance from pattern 15 to cluster 0 is calculated as: 180 d=sqrt( 81.0000+ 49.0000) 181 Distance from pattern 15 to cluster 0 is 11.401754 182 183 The distance from pattern 15 to cluster 1 is calculated as: 184 d=sqrt( 64.0000+ 49.0000) 185 Distance from pattern 15 to cluster 1 is 10.630146 186 187 patern 15 assigned to cluster 1 188 189 The distance from pattern 16 to cluster 0 is calculated as: 190 d=sqrt( 81.0000+ 64.0000) 191 Distance from pattern 16 to cluster 0 is 12.041595 192 193 The distance from pattern 16 to cluster 1 is calculated as: 194 d=sqrt( 64.0000+ 64.0000) 195 Distance from pattern 16 to cluster 1 is 11.313708 196 197 patern 16 assigned to cluster 1 198 199 The distance from pattern 17 to cluster 0 is calculated as: 200 d=sqrt( 81.0000+ 81.0000) 201 Distance from pattern 17 to cluster 0 is 12.727922 202 203 The distance from pattern 17 to cluster 1 is calculated as: 204 d=sqrt( 64.0000+ 81.0000) 205 Distance from pattern 17 to cluster 1 is 12.041595 206 207 patern 17 assigned to cluster 1 208 209 The new cluster centers are now calculated as: 210 Cluster Center0(1/3)( .000+ .000+ .000), 211 (1/3)( .000+ 1.000+ 2.000) 212 Cluster Center1(1/15)( 1.000+ 2.000+ 1.000+ 2.000+ 2.000+ 7.000+ 7.000+ 7.000+ 8 213 .000+ 8.000+ 8.000+ 8.000+ 9.000+ 9.000+ 9.000), 214 (1/15)( .000+ 1.000+ 2.000+ 2.000+ .000+ 6.000+ 7.000+ 8.000+ 6.000+ 7.000+ 8.00 215 0+ 9.000+ 7.000+ 8.000+ 9.000) 216 Cluster centers: 217 ClusterCenter[0]=(0.000000,1.000000) 218 ClusterCenter[1]=(5.866667,5.333333)