数据挖掘算法之 kmeans

/****************************************************************************
*                                                                           *
*  KMEANS   Cluster Algorithm                                               *
*                                                                           *
*****************************************************************************/
/*1、用vector实现其存储
2、直接在程序中读取数据集
3、结果可以保存到文件中
4、用户可以输入聚类个数
5、初始聚类中心随机选择(代码自动随机)

作者:郭运凯

email: [email protected]

 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include <math.h>
#include<time.h>
#include <vector>
#include <iostream>
using namespace  std;
// FUNCTION PROTOTYPES

// DEFINES
// Status codes returned by ClusterData::ReadFile.
#define         SUCCESS         1
#define         FAILURE         0
// Boolean values used for the convergence flag of the K-means loop.
#define         TRUE            1
#define         FALSE           0
// Capacity of the fixed-size Center arrays (upper bound on columns per row).
#define         MAXVECTDIM      20
// Capacity of each cluster's Member array (upper bound on rows per cluster).
#define         MAXDB      1000
// Capacity of the ClusterData::Cluster array (upper bound on clusters).
#define         MAXCLUSTER      10
 
 

// ***** Defined structures & classes *****
// One cluster: its current center plus the list of rows assigned to it.
struct aCluster
{
   double       Center[MAXVECTDIM];  // center coordinates, one entry per column
   int          Member[MAXDB];  // IDs (0-based row indices) of the patterns that belong to this cluster
   int          NumMembers;  // number of valid entries in Member
};
// Fixed-capacity array of doubles paired with a size field.
// NOTE(review): nothing in the visible part of this file uses this type.
struct aVector
{
   double       Center[MAXVECTDIM];  // element storage
   int          Size;  // presumably the number of used entries in Center -- TODO confirm
};
// One row of the data set: the values of a single record, one per column.
struct DNode
{
    std::vector<double> data;
};
class ClusterData
{
private:
   vector<DNode>       DB;
   aCluster     Cluster[MAXCLUSTER];
   int          nTotalRowNum;          // 行数
   int          nDimensionNum;           // 列数
   int          nClusterNum;          // 聚类数量
   void         DistributeSamples();  // Step 2 of K-means algorithm
   int          CalcNewClustCenters();// Step 3 of K-means algorithm
   double       EucNorm(int, int);   // Calc Euclidean norm vector
   int          FindClosestCluster(int); //ret indx of clust closest to DB
                                         //whose index is arg
public:
   //ClusterData();
   int ReadFile(char *fname);      // Get DB data to be clustered
   void InitClusters();                // Step 1 of K-means algorithm
   void RunKMeans();                   // Overall control K-means process
   void DisplayCluster();                // Show results on screen
   void DisplayCenter();
   void SaveToFile();  //3、结果可以保存到文件中
};

/*
 * Formats x as fixed-point text with `width` digits after the decimal point.
 *
 * The text starts with '-' for negative values and with a single space
 * otherwise, followed by the integer digits (none when |x| < 1), a '.', and
 * the fractional digits, e.g. f2a(3.14159, 2) -> " 3.14".
 *
 * Returns a pointer to a static buffer: the text stays valid after the call
 * but is overwritten by the next call, so the function is not reentrant.
 * Output longer than the buffer is truncated instead of overflowing it.
 */
char *f2a(double x, int width)
{
    /* static, so the returned pointer does not dangle once we return */
    static char cbuf[512];
    int decpt = 0;   /* position of the decimal point within the digit string */
    int sign = 0;    /* non-zero when x is negative */

    const char *digits = fcvt(x, width, &decpt, &sign);
    const char signChar = sign ? '-' : ' ';

    if (decpt > 0)
    {
        /* Never split beyond the digits fcvt actually produced. */
        const size_t produced = strlen(digits);
        int whole = decpt;
        if (static_cast<size_t>(whole) > produced)
        {
            whole = static_cast<int>(produced);
        }
        snprintf(cbuf, sizeof(cbuf), "%c%.*s.%s",
                 signChar, whole, digits, digits + whole);
    }
    else if (decpt == 0)
    {
        snprintf(cbuf, sizeof(cbuf), "%c.%s", signChar, digits);
    }
    else
    {
        /* "%0*d" with value 0 emits exactly -decpt zeros after the point. */
        snprintf(cbuf, sizeof(cbuf), "%c.%0*d%s", signChar, -decpt, 0, digits);
    }
    return cbuf;
}
void ClusterData::DisplayCenter()
{
int i,j;
printf("Cluster centers:\n");
for (i=0; i<nClusterNum; i++)
{
   //Cluster[i].Member[0]=i;  //此条语句使得最后结果中,各个聚类的第一个结果分别是i,造成错误
 //聚类的初始化工作已经在InitClusters()中完成,这里不能再写此语句,否则造成错误。郭运凯。2010.10.20 20:50修改
   printf("ClusterCenter[%d]=(",i);
   for (j = 0;j<nDimensionNum-1;j++)
   {
     printf("%f,",Cluster[i].Center[j]);
   }
   printf("%f)\n",Cluster[i].Center[j]);
 
 } /* endfor */
}
int ClusterData::ReadFile(char *filename)
{
 //1、用vector实现其存储
 //2、直接在程序中读取数据集
   FILE *fp;
   int    i,j;
   double x;
 if((fp = fopen(filename, "r")) == NULL)
  return FAILURE;
   char str[1000];
 
 fgets(str,1000,fp);
 int len = strlen(str);
 i = 0;
 nDimensionNum = 0;
 while(str[i] != '\n')
 {
  if (' '== str[i] ||'\t'== str[i] )
  {
   nDimensionNum ++;
   i++;
   while(' '== str[i] && '\n' != str[i] )
   {
    i++;
   }
  }
  i++;
 
 }
 nDimensionNum ++;
 printf("%d,%d\n",len,nDimensionNum);
 nTotalRowNum = 1;
 while(fgets(str,1000,fp) != NULL)
 {
  nTotalRowNum ++;
 }
 printf("%d\n",nTotalRowNum);

 fclose(fp);
 fp = fopen(filename,"r");
 
 
for (i=0; i<nTotalRowNum; i++)
{         // For each vector
 DNode t;
   for (j=0; j<nDimensionNum; j++)
   {       // create a DB
      fscanf(fp,"%lg",&x);       // consisting of all elements
       t.data.push_back(x);
   } /* endfor */
   DB.push_back(t);
} /* endfor */
printf("Input DBs:\n");
for (i=0; i<nTotalRowNum; i++)
{
   printf("DB[%d]=(",i);
   for(int j = 0;j< nDimensionNum-1;j++)
    printf("%3.2f,",DB[i].data[j]);
   printf("%3.2f)\n",DB[i].data[j]);
 } /* endfor */
printf("\n--------------------\n");
return SUCCESS;
}
//***************************************************************************
// InitClusters                                                             *
//   Arbitrarily assign a vector to each of the K clusters                  *
//   Each vector is a randomly chosen row of the data set                   *
//***************************************************************************

void ClusterData::InitClusters()
{
int i,j;
printf("input the Cluster Num:");   //  4、用户可以输入聚类个数
scanf("%d",&nClusterNum);
printf("Initial cluster centers:\n");
 srand((int)time(0));
 
   int randid;
 //5、初始聚类中心随机选择(代码自动随机)
for (i=0; i<nClusterNum; i++)

 randid = (int)(nTotalRowNum*rand()/(RAND_MAX+1.0));
 printf("randid = %d\n",randid);
   Cluster[i].Member[0]=randid;
   for (j=0; j<nDimensionNum; j++)
   {
      Cluster[i].Center[j]=DB[randid].data[j];
    } /* endfor */
} /* endfor */ 

for (i=0; i<nClusterNum; i++)
{
 printf("ClusterCenter[%d]=(",i);
 for (int j = 0;j < nDimensionNum-1;j++)
 {
  printf("%f,",Cluster[i].Center[j]);
 }
   printf("%f)\n",Cluster[i].Center[j]);
   } /* endfor */
printf("\n");
}//end of InitClusters()
void ClusterData::RunKMeans()
{
  int converged;
  int pass;
pass=1;
converged=FALSE;
while (converged==FALSE)
 {
   printf("PASS=%d\n",pass++);
   DistributeSamples();
   converged=CalcNewClustCenters();
   DisplayCenter();
   } /* endwhile */
}
double ClusterData::EucNorm(int p, int c)
{   // 计算 输入模式 p 和 聚类中心 c 之间的欧几里得距离
 //Calc Euclidean norm of vector difference
double dist,x;                          // between DB vector, p, and cluster
int i;                                  // center, c.
char znum[40];
char *pnum;
pnum=&znum[0];
 //strcpy(zout,"d=sqrt(");
 //printf("The distance from DB %d to cluster %d is calculated as:\n",p,c);
dist=0;
for (i=0; i<nDimensionNum ;i++)
{
 x=(Cluster[c].Center[i]-DB[p].data[i])*(Cluster[c].Center[i]-DB[p].data[i]);
  // strcat(zout,f2a(x,4));
  // if (i<nDimensionNum-1)
     // strcat(zout," +");
   dist += (Cluster[c].Center[i]-DB[p].data[i])*(Cluster[c].Center[i]-DB[p].data[i]);
   } /* endfor */
//printf("%s)\n",zout);
return dist;
}
int ClusterData::FindClosestCluster(int pat)
{
   int i, ClustID;
   double MinDist, d;
MinDist =9.9e+99;
ClustID=-1;
//printf("in the FindClosestCluster the pat = %d\n",pat);
for (i=0; i<nClusterNum; i++)
{
   d=EucNorm(pat,i);
   //printf("Distance from DB %d to cluster %d is %f\n\n",pat,i,sqrt(d));
   if (d<MinDist)
   {
      MinDist=d;
      ClustID=i;
   } /* endif */
} /* endfor */
if (ClustID<0)
{
   printf("Aaargh");
   exit(0);
} /* endif */
return ClustID;
}
void ClusterData::DistributeSamples()
{
int i,pat,Clustid,MemberIndex;
//Clear membership list for all current clusters
for (i=0; i<nClusterNum;i++)
{
   Cluster[i].NumMembers=0;
}
for (pat=0; pat<nTotalRowNum; pat++)
{
   //Find cluster center to which the DB is closest
   Clustid= FindClosestCluster(pat);
  // printf("DB %d assigned to cluster %d\n\n",pat,Clustid);
   //post this DB to the cluster
  // printf("Cluster[Clustid].NumMembers= %d\n",Cluster[Clustid].NumMembers);
  
   MemberIndex=Cluster[Clustid].NumMembers;
   Cluster[Clustid].Member[MemberIndex]=pat;
   /*printf("cluster [%d] has [%d]\n",Clustid,MemberIndex+1);
   for (int i = 0;i< MemberIndex+1;i++)
   {
    printf("%d,",Cluster[Clustid].Member[i]);
   }
   printf("\n");*/
   Cluster[Clustid].NumMembers++;
} /* endfor */
}
int  ClusterData::CalcNewClustCenters()
{
   int ConvFlag,VectID,i,j,k;
   double tmp[MAXVECTDIM];
 
ConvFlag=TRUE;
//printf("The new cluster centers are now calculated as:\n");
for (i=0; i<nClusterNum; i++)
 {              //for each cluster
  
 for (j=0; j<nDimensionNum; j++)
   {            // clear workspace
      tmp[j]=0.0;
   } /* endfor */
   for (j=0; j<Cluster[i].NumMembers; j++)
   { //traverse member vectors
      VectID=Cluster[i].Member[j];
      for (k=0; k<nDimensionNum; k++)
   {         //traverse elements of vector
         tmp[k] += DB[VectID].data[k];       // add (member) DB elmnt into temp
         } /* endfor */
   } /* endfor */
   for (k=0; k<nDimensionNum; k++)
   {            //traverse elements of vector
      tmp[k]=tmp[k]/Cluster[i].NumMembers;
      if (tmp[k] != Cluster[i].Center[k])
         ConvFlag=FALSE;
      Cluster[i].Center[k]=tmp[k];
    } /* endfor */
  } /* endfor */
return ConvFlag;
}
/*
 * Prints every cluster to stdout: first its center as "[c0,c1,...]", then
 * its member count and the comma-separated row indices of its members.
 * A cluster without members is shown with an empty list "[]".
 */
void ClusterData::DisplayCluster()
{
    for (int cl = 0; cl < nClusterNum; cl++)
    {
        printf("\nCLUSTER %d 's Center [", cl);
        for (int j = 0; j < nDimensionNum; j++)
        {
            if (j > 0)
            {
                printf(",");
            }
            printf("%f", Cluster[cl].Center[j]);
        }
        printf("]\n");

        printf("It's %d Memnbers are :[", Cluster[cl].NumMembers);
        for (int i = 0; i < Cluster[cl].NumMembers; i++)
        {
            if (i > 0)
            {
                printf(",");
            }
            printf("%d", Cluster[cl].Member[i]);
        }
        printf("]\n");
    }
}
void ClusterData::SaveToFile()
{
 // 3、结果可以保存到文件中
 FILE * fp;
 fp = fopen("result.txt","w");
    int cl;
    for (cl=0; cl<nClusterNum; cl++)
    {
    
     fprintf(fp,"\nCLUSTER %d 's Center [",cl);
     for (int j = 0;j<nDimensionNum-1;j++)
     {
     fprintf(fp,"%f,",Cluster[cl].Center[j]);
     }
     fprintf(fp,"%f]\n",Cluster[cl].Center[j]);
    
     fprintf(fp,"It's %d Memnbers are :[",Cluster[cl].NumMembers);
     for (int i = 0;i<Cluster[cl].NumMembers-1;i++)
     {
      fprintf(fp,"%d,",Cluster[cl].Member[i]);
     }
     fprintf(fp,"%d]\n",Cluster[cl].Member[i]);
    }
fclose(fp);
}
 
void main(int argc, char *argv[])
{
   ClusterData kmeans;

 if (kmeans.ReadFile("iris.txt")==FAILURE )
 {
 printf("UNABLE TO READ DB_FILE:%s\n",argv[1]);
 exit(0);
   }
kmeans.InitClusters();
kmeans.RunKMeans();
kmeans.DisplayCluster();
kmeans.SaveToFile();
printf("Press any key to continue");
char ch;
scanf("%c",&ch);
 
}

 

你可能感兴趣的:(数据挖掘算法之 kmeans)