/****************************************************************************
* *
* KMEANS Cluster Algorithm *
* *
*****************************************************************************/
/*1、用vector实现其存储
2、直接在程序中读取数据集
3、结果可以保存到文件中
4、用户可以输入聚类个数
5、初始聚类中心随机选择(代码自动随机)
作者:郭运凯
email: [email protected]
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include <math.h>
#include<time.h>
#include <vector>
#include <iostream>
using namespace std;
// FUNCTION PROTOTYPES
// DEFINES
// Status codes returned by ClusterData::ReadFile.
#define SUCCESS 1
#define FAILURE 0
// Integer booleans; used for the K-means convergence flag.
#define TRUE 1
#define FALSE 0
// Capacity of the fixed-size coordinate arrays (aCluster::Center, aVector::Center).
#define MAXVECTDIM 20
// Capacity of aCluster::Member, i.e. the most rows a single cluster can list.
#define MAXDB 1000
// Capacity of the ClusterData::Cluster array, i.e. the most clusters supported.
#define MAXCLUSTER 10
// ***** Defined structures & classes *****
// One cluster of a K-means run: its center and the rows currently assigned to it.
struct aCluster
{
double Center[MAXVECTDIM]; // center coordinates; ClusterData uses the first nDimensionNum entries
int Member[MAXDB]; // IDs of the patterns (rows) belonging to this cluster; indices start at 0
int NumMembers; // number of entries of Member that are currently in use
};
// Fixed-capacity coordinate vector. Nothing in the visible part of this file uses it.
struct aVector
{
double Center[MAXVECTDIM]; // coordinate storage
int Size; // presumably the number of valid entries in Center - TODO confirm against any user of this type
};
// One row of the data set, as read from the input file by ClusterData::ReadFile.
typedef struct DNode
{
vector<double> data; // the column values of the row, in the order they were read
}DNode;
// K-means clustering of a numeric data file.
// main() in this file drives it in this order: ReadFile, InitClusters,
// RunKMeans, DisplayCluster, SaveToFile.
// No constructor is declared, so the counters and the Cluster array hold
// indeterminate values until ReadFile and InitClusters have filled them in.
class ClusterData
{
private:
vector<DNode> DB; // the data set, one DNode per row
aCluster Cluster[MAXCLUSTER]; // cluster storage; entries 0..nClusterNum-1 are in use
int nTotalRowNum; // number of rows
int nDimensionNum; // number of columns
int nClusterNum; // number of clusters
void DistributeSamples(); // Step 2 of K-means algorithm: assign every row to its closest cluster
int CalcNewClustCenters();// Step 3 of K-means algorithm: recompute centers; returns TRUE when none changed
double EucNorm(int, int); // squared Euclidean distance between DB row (1st arg) and cluster center (2nd arg)
int FindClosestCluster(int); //ret indx of clust closest to DB
//whose index is arg
public:
//ClusterData();
int ReadFile(char *fname); // Get DB data to be clustered; returns SUCCESS or FAILURE
void InitClusters(); // Step 1 of K-means algorithm: ask for the cluster count and pick initial centers
void RunKMeans(); // Overall control K-means process
void DisplayCluster(); // Show results on screen
void DisplayCenter(); // Print the current center of every cluster
void SaveToFile(); // Feature 3: write the results to the file "result.txt"
};
// Appends at most `count` characters of the NUL-terminated string `src` to
// `dst`, never letting the result (terminator included) exceed `cap` bytes.
// `used` is the current length of `dst`; the new length is returned.
static size_t f2aAppend(char *dst, size_t cap, size_t used, const char *src, size_t count)
{
    while (count > 0 && *src != '\0' && used + 1 < cap)
    {
        dst[used++] = *src++;
        --count;
    }
    dst[used] = '\0';
    return used;
}

// Formats `x` in fixed-point notation with `width` digits after the decimal
// point, using the digit string produced by fcvt().
//
// Layout of the result: a sign column ('-' for negative values, ' '
// otherwise), the integer digits (omitted when fcvt() reports none, so 0.5
// becomes " .5000"), a '.', and the fractional digits.
//
// Returns a pointer to a static buffer that is overwritten by the next call.
// The function is therefore not re-entrant and the caller must copy the text
// if it has to survive another call. (The previous version returned the
// address of a local array, which was already dead when the caller read it.)
// Output that would not fit in the buffer is truncated instead of overflowing.
char *f2a(double x, int width)
{
    static char cbuf[512];
    const size_t cap = sizeof(cbuf);
    int decpt = 0;     // position of the decimal point relative to the digit string
    int negative = 0;  // non-zero when x is negative
    const char *digits = fcvt(x, width, &decpt, &negative);

    cbuf[0] = negative ? '-' : ' ';
    cbuf[1] = '\0';
    size_t used = 1;
    if (digits == NULL)
        return cbuf;

    if (decpt > 0)
    {
        // The first `decpt` digits form the integer part.
        const size_t before = used;
        used = f2aAppend(cbuf, cap, used, digits, (size_t)decpt);
        digits += used - before;
    }
    used = f2aAppend(cbuf, cap, used, ".", 1);
    // A negative decpt means that many zeros sit between the point and the
    // first digit returned by fcvt().
    for (int i = decpt; i < 0; ++i)
        used = f2aAppend(cbuf, cap, used, "0", 1);
    f2aAppend(cbuf, cap, used, digits, strlen(digits));
    return cbuf;
}
void ClusterData::DisplayCenter()
{
int i,j;
printf("Cluster centers:\n");
for (i=0; i<nClusterNum; i++)
{
//Cluster[i].Member[0]=i; //此条语句使得最后结果中,各个聚类的第一个结果分别是i,造成错误
//聚类的初始化工作已经在InitClusters()中完成,这里不能再写此语句,否则造成错误。郭运凯。2010.10.20 20:50修改
printf("ClusterCenter[%d]=(",i);
for (j = 0;j<nDimensionNum-1;j++)
{
printf("%f,",Cluster[i].Center[j]);
}
printf("%f)\n",Cluster[i].Center[j]);
} /* endfor */
}
int ClusterData::ReadFile(char *filename)
{
//1、用vector实现其存储
//2、直接在程序中读取数据集
FILE *fp;
int i,j;
double x;
if((fp = fopen(filename, "r")) == NULL)
return FAILURE;
char str[1000];
fgets(str,1000,fp);
int len = strlen(str);
i = 0;
nDimensionNum = 0;
while(str[i] != '\n')
{
if (' '== str[i] ||'\t'== str[i] )
{
nDimensionNum ++;
i++;
while(' '== str[i] && '\n' != str[i] )
{
i++;
}
}
i++;
}
nDimensionNum ++;
printf("%d,%d\n",len,nDimensionNum);
nTotalRowNum = 1;
while(fgets(str,1000,fp) != NULL)
{
nTotalRowNum ++;
}
printf("%d\n",nTotalRowNum);
fclose(fp);
fp = fopen(filename,"r");
for (i=0; i<nTotalRowNum; i++)
{ // For each vector
DNode t;
for (j=0; j<nDimensionNum; j++)
{ // create a DB
fscanf(fp,"%lg",&x); // consisting of all elements
t.data.push_back(x);
} /* endfor */
DB.push_back(t);
} /* endfor */
printf("Input DBs:\n");
for (i=0; i<nTotalRowNum; i++)
{
printf("DB[%d]=(",i);
for(int j = 0;j< nDimensionNum-1;j++)
printf("%3.2f,",DB[i].data[j]);
printf("%3.2f)\n",DB[i].data[j]);
} /* endfor */
printf("\n--------------------\n");
return SUCCESS;
}
//***************************************************************************
// InitClusters *
// Arbitrarily assign a vector to each of the K clusters *
// Randomly chosen data rows serve as the initial cluster centers *
//***************************************************************************
// Step 1 of K-means: asks the user for the number of clusters and seeds every
// cluster with a randomly chosen data row.
//
// Preconditions checked here: at least one row is loaded and the column count
// fits into aCluster::Center; otherwise the program terminates with an error,
// because continuing would write outside the fixed-size arrays.
// The cluster count is re-prompted until it lies between 1 and the smaller of
// the row count and the capacity of the Cluster array. End of input while
// prompting terminates the program.
//
// The rows are drawn without replacement, so two clusters never start from the
// same row (which would leave one of them empty after the first assignment).
// Each seeded cluster gets the chosen row as its only member.
void ClusterData::InitClusters()
{
    // Capacities are taken from the arrays themselves so the checks cannot
    // drift away from the declarations.
    const int clusterCapacity =
        static_cast<int>(sizeof(Cluster) / sizeof(Cluster[0]));
    const int dimensionCapacity =
        static_cast<int>(sizeof(Cluster[0].Center) / sizeof(Cluster[0].Center[0]));

    if (nTotalRowNum < 1 || nDimensionNum < 1 || nDimensionNum > dimensionCapacity)
    {
        fprintf(stderr,
                "InitClusters: need at least one row and 1..%d columns (have %d rows, %d columns)\n",
                dimensionCapacity, nTotalRowNum, nDimensionNum);
        exit(EXIT_FAILURE);
    }
    const int maxClusters = (nTotalRowNum < clusterCapacity) ? nTotalRowNum : clusterCapacity;

    for (;;)
    {
        printf("input the Cluster Num:"); // feature 4: the user chooses the number of clusters
        const int got = scanf("%d",&nClusterNum);
        if (got == 1 && nClusterNum >= 1 && nClusterNum <= maxClusters)
            break;
        if (got == EOF)
        {
            fprintf(stderr, "\nInitClusters: no cluster number supplied\n");
            exit(EXIT_FAILURE);
        }
        if (got != 1)
        {
            // Throw away the unparseable rest of the line before asking again.
            int c;
            while ((c = getchar()) != '\n' && c != EOF)
            {
            }
        }
        printf("The cluster number must be between 1 and %d\n", maxClusters);
    }

    printf("Initial cluster centers:\n");
    srand(static_cast<unsigned int>(time(0)));

    // Feature 5: random initial centers. A partial Fisher-Yates shuffle of the
    // row indices yields nClusterNum distinct rows.
    std::vector<int> candidates(nTotalRowNum);
    for (int row = 0; row < nTotalRowNum; row++)
        candidates[row] = row;

    for (int c = 0; c < nClusterNum; c++)
    {
        const int remaining = nTotalRowNum - c;
        // Scale in floating point: "nTotalRowNum*rand()" overflows int when
        // RAND_MAX is large.
        const int pick = c + static_cast<int>((rand() / (RAND_MAX + 1.0)) * remaining);
        const int randid = candidates[pick];
        candidates[pick] = candidates[c];
        candidates[c] = randid;

        printf("randid = %d\n",randid);
        Cluster[c].Member[0] = randid;
        Cluster[c].NumMembers = 1;
        for (int d = 0; d < nDimensionNum; d++)
            Cluster[c].Center[d] = DB[randid].data[d];
    }

    for (int c = 0; c < nClusterNum; c++)
    {
        printf("ClusterCenter[%d]=(",c);
        for (int d = 0; d < nDimensionNum - 1; d++)
            printf("%f,",Cluster[c].Center[d]);
        printf("%f)\n",Cluster[c].Center[nDimensionNum - 1]);
    }
    printf("\n");
}//end of InitClusters()
// Drives the K-means iteration: each pass assigns the rows to clusters,
// recomputes the centers and prints them. At least one pass is always run;
// iteration stops after the first pass in which CalcNewClustCenters()
// reports that no center changed.
void ClusterData::RunKMeans()
{
    int passNumber = 1;
    int finished = FALSE;
    do
    {
        printf("PASS=%d\n", passNumber);
        ++passNumber;
        DistributeSamples();
        finished = CalcNewClustCenters();
        DisplayCenter();
    } while (finished == FALSE);
}
// Returns the SQUARED Euclidean distance between data row `p` and the center
// of cluster `c`, taken over the first nDimensionNum coordinates.
// No square root is applied: callers that only compare distances do not need
// it, and a caller that wants the true distance must take sqrt() itself.
double ClusterData::EucNorm(int p, int c)
{
    const double *center = Cluster[c].Center;
    double sumOfSquares = 0;
    for (int dim = 0; dim < nDimensionNum; ++dim)
    {
        const double diff = center[dim] - DB[p].data[dim];
        sumOfSquares += diff * diff;
    }
    return sumOfSquares;
}
// Returns the index of the cluster whose center is closest to data row `pat`.
// Ties go to the cluster with the lowest index.
//
// Distances that are NaN cannot be ordered and are skipped. If no cluster
// yields a usable distance (no clusters defined, or every distance is NaN)
// the program terminates with an error message and a failure exit status.
int ClusterData::FindClosestCluster(int pat)
{
    int ClustID = -1;      // -1 until the first usable distance is seen
    double MinDist = 0.0;  // only meaningful once ClustID >= 0

    for (int i = 0; i < nClusterNum; i++)
    {
        const double d = EucNorm(pat, i);
        if (d != d)
            continue; // NaN
        // Accept the first usable distance unconditionally instead of relying
        // on a "huge" sentinel that a real distance could exceed.
        if (ClustID < 0 || d < MinDist)
        {
            MinDist = d;
            ClustID = i;
        }
    }

    if (ClustID < 0)
    {
        fprintf(stderr,
                "FindClosestCluster: no usable cluster for row %d (%d clusters defined)\n",
                pat, nClusterNum);
        exit(EXIT_FAILURE);
    }
    return ClustID;
}
void ClusterData::DistributeSamples()
{
int i,pat,Clustid,MemberIndex;
//Clear membership list for all current clusters
for (i=0; i<nClusterNum;i++)
{
Cluster[i].NumMembers=0;
}
for (pat=0; pat<nTotalRowNum; pat++)
{
//Find cluster center to which the DB is closest
Clustid= FindClosestCluster(pat);
// printf("DB %d assigned to cluster %d\n\n",pat,Clustid);
//post this DB to the cluster
// printf("Cluster[Clustid].NumMembers= %d\n",Cluster[Clustid].NumMembers);
MemberIndex=Cluster[Clustid].NumMembers;
Cluster[Clustid].Member[MemberIndex]=pat;
/*printf("cluster [%d] has [%d]\n",Clustid,MemberIndex+1);
for (int i = 0;i< MemberIndex+1;i++)
{
printf("%d,",Cluster[Clustid].Member[i]);
}
printf("\n");*/
Cluster[Clustid].NumMembers++;
} /* endfor */
}
// Step 3 of K-means: replaces the center of every cluster by the mean of its
// member rows.
//
// Returns TRUE when no coordinate of any center changed (the iteration has
// converged) and FALSE otherwise. The comparison is exact on purpose: an
// unchanged assignment produces bit-identical sums.
//
// A cluster without members keeps its previous center. Dividing by its member
// count of zero would turn the center into NaN, which never compares equal to
// itself and would keep RunKMeans() looping forever.
//
// Precondition: nDimensionNum does not exceed MAXVECTDIM.
int ClusterData::CalcNewClustCenters()
{
    int ConvFlag = TRUE;
    double tmp[MAXVECTDIM]; // per-coordinate sum of the member rows

    for (int i = 0; i < nClusterNum; i++)
    {
        const int memberCount = Cluster[i].NumMembers;
        if (memberCount <= 0)
            continue; // empty cluster: nothing to average

        for (int k = 0; k < nDimensionNum; k++)
            tmp[k] = 0.0;

        // Accumulate the member rows coordinate by coordinate.
        for (int j = 0; j < memberCount; j++)
        {
            const int VectID = Cluster[i].Member[j];
            for (int k = 0; k < nDimensionNum; k++)
                tmp[k] += DB[VectID].data[k];
        }

        // Turn the sums into means and record whether anything moved.
        for (int k = 0; k < nDimensionNum; k++)
        {
            tmp[k] = tmp[k] / memberCount;
            if (tmp[k] != Cluster[i].Center[k])
                ConvFlag = FALSE;
            Cluster[i].Center[k] = tmp[k];
        }
    }
    return ConvFlag;
}
// Prints every cluster to stdout: its center followed by the IDs of its
// member rows. A cluster without members is shown with an empty list "[]".
//
// The loop counters are no longer read after the `for` statement that
// declared them; that relied on a pre-standard scoping rule and does not
// compile as standard C++.
void ClusterData::DisplayCluster()
{
    for (int cl = 0; cl < nClusterNum; cl++)
    {
        printf("\nCLUSTER %d 's Center [",cl);
        int j = 0; // declared outside the loop because the last coordinate is printed after it
        for (; j < nDimensionNum - 1; j++)
        {
            printf("%f,",Cluster[cl].Center[j]);
        }
        printf("%f]\n",Cluster[cl].Center[j]);

        printf("It's %d Memnbers are :[",Cluster[cl].NumMembers);
        for (int i = 0; i < Cluster[cl].NumMembers; i++)
        {
            if (i > 0)
                printf(",");
            printf("%d",Cluster[cl].Member[i]);
        }
        printf("]\n");
    }
}
// Feature 3: writes the clustering result to "result.txt" in the current
// directory, in the same layout DisplayCluster() prints to the screen.
// A cluster without members is written with an empty list "[]".
//
// If the file cannot be opened or written, the reason is reported on stderr
// and the function returns; nothing else is affected.
void ClusterData::SaveToFile()
{
    FILE *fp = fopen("result.txt","w");
    if (fp == NULL)
    {
        perror("SaveToFile: cannot open result.txt");
        return;
    }

    for (int cl = 0; cl < nClusterNum; cl++)
    {
        fprintf(fp,"\nCLUSTER %d 's Center [",cl);
        int j = 0; // declared outside the loop because the last coordinate is written after it
        for (; j < nDimensionNum - 1; j++)
        {
            fprintf(fp,"%f,",Cluster[cl].Center[j]);
        }
        fprintf(fp,"%f]\n",Cluster[cl].Center[j]);

        fprintf(fp,"It's %d Memnbers are :[",Cluster[cl].NumMembers);
        for (int i = 0; i < Cluster[cl].NumMembers; i++)
        {
            if (i > 0)
                fprintf(fp,",");
            fprintf(fp,"%d",Cluster[cl].Member[i]);
        }
        fprintf(fp,"]\n");
    }

    // Buffered write errors only surface when the stream is closed.
    if (fclose(fp) != 0)
        perror("SaveToFile: error while writing result.txt");
}
// Program entry point: loads the data set, runs K-means, shows the result on
// screen and saves it to a file.
//
// The data file is the first command-line argument when one is given and
// "iris.txt" otherwise. Returns 0 on success and EXIT_FAILURE when the data
// file cannot be read.
int main(int argc, char *argv[])
{
    // ReadFile() takes a non-const char*, so the default name lives in a
    // writable array instead of being passed as a string literal.
    char defaultFile[] = "iris.txt";
    char *dbFile = (argc > 1 && argv[1] != NULL) ? argv[1] : defaultFile;

    ClusterData kmeans;
    if (kmeans.ReadFile(dbFile)==FAILURE )
    {
        // Report the name that was actually tried; argv[1] may not exist.
        printf("UNABLE TO READ DB_FILE:%s\n",dbFile);
        return EXIT_FAILURE;
    }
    kmeans.InitClusters();
    kmeans.RunKMeans();
    kmeans.DisplayCluster();
    kmeans.SaveToFile();
    printf("Press any key to continue");
    (void)getchar(); // wait for one character of input before exiting
    return 0;
}