ISODATA源代码
作者:liangdas
出处:简单点儿,通俗点儿,机器学习 http://blog.csdn.net/liangdas/article/details/39809845
下面是我写的ISODATA的源代码,共有三个文件,一个是ISODATA.h头文件,一个是ISODATA.c文件,另外一个是main.cpp文件。ISODATA.h和ISODATA.c文件中引用了系统的stdio.h,stdlib.h, math.h文件。
这个书写风格和前面的K-Means类似。
后面的main.cpp是介绍怎么使用的,输入是按txt格式存贮的,测试数据的存贮格式和前面的K-Means也类似:
sample number(样本总数)
feature number(特征维数)
intend class number(待分类的类别)
initial class center index(初始类别中心)
feature list as(特征列表):
feature1 feature2 ...
feature1 feature2 ...
......
当然可以自己定义数据的格式,并重新编写LoadPatterns()函数。
ISODATA.h
/***********************************
* Author: liangdas
* Time: 20140924
* Version: 0_20140924
* Contact:
* QQ: 358536026 Email: [email protected]
* Working place: Beijing Samsung Telecom R&D Center
************************************/
#ifndef __ISODATA_H__
#define __ISODATA_H__
#ifdef __cplusplus
extern "C"{
#endif
/* Status codes returned by the loader. */
#define SUCCESS 1
#define FAILURE 0
/* Compile-time capacity limits; the arrays inside CLASSCLUSTER are sized with them. */
#define MAX_SAMPLES 1000 // maximum number of samples
#define MAX_CLUSTER_NUM 40 // maximum number of clusters (set it to more than twice the expected cluster count)
#define MAX_DIM 10 // maximum sample dimension
/* NOTE(review): `h` is a one-letter object-like macro, so the preprocessor
 * replaces every identifier named `h` in any file that includes this header.
 * Consider a longer name once all users can be updated together. */
#define h 0.5 // ratio used when splitting a cluster
#define MAXDOUBLE 1.0e20 // stand-in for "largest double" (1.0e20, not DBL_MAX)
#define DIM 2 // actual sample dimension
/* One candidate pair of clusters together with the distance between them. */
typedef struct stTwoClusterDist
{
double dist; // distance between the two clusters of the pair
int nIndexI; // index of the first cluster of the pair
int nIndexJ; // index of the second cluster of the pair
}TWONEARCLUSTER, *PTWONEARCLUSTER;
/* State kept for one cluster. */
typedef struct stCluster
{
double Center[MAX_DIM]; // cluster center vector (the original comment read "sample data")
int pMemberIndex[MAX_SAMPLES]; // indices, into the whole data set, of the samples that belong to this cluster
int nSampleNum; // number of samples in this cluster
double nAveDistToCenter; // presumably the average distance from the member samples to Center -- confirm against CalAveDistInCluster()
double fDeltaOfFeature[MAX_DIM];// standard deviation of each feature component
int nMaxDeltaOfFeatureIndex; // index of the component with the largest within-cluster standard deviation
}CLASSCLUSTER, *PCLASSCLUSTER;
/********************************************************
* Function: LoadPatterns()
* Description: Loads the sample list from the file named by fname.
* Input&Output:
*   fname           - name of the text file to read
*   pSamples        - sample matrix, filled as pSamples[sample][feature]
*   pNumSamples     - receives the sample count read from the file
*   pClusterNum     - receives the number of clusters read from the file
*   pNumDim         - receives the feature dimension read from the file
*   pOrgCenterIndex - receives one initial-center index per cluster
* Return: SUCCESS when the file was loaded, FAILURE otherwise (at least
*   when the file cannot be opened).
* File format (in reading order): sample count, feature dimension,
*   number of clusters, one initial-center index per cluster, then the
*   feature values of every sample.
*********************************************************/
int LoadPatterns(char *fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex);
/***************************************************************
* Function: InitClusters()
* Description: Sets the initial cluster centers. The original note says
*   the first nCurClusterNum samples of the sample sequence are used as
*   the initial centers.
*   NOTE(review): the pCenterIndex parameter suggests the centers may
*   instead be selected by index -- confirm against the definition.
* Input&Output:
*   pSamples, NumSamples - sample matrix and its row count
*   pCluster             - cluster array to initialise
*   nCurClusterNum       - number of clusters to initialise
*   NumDim               - feature dimension
*   pCenterIndex         - initial-center indices
* Returns: nothing
****************************************************************/
void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex);
/***************************************************************
* Function: CalcuDistance()
* Description: Distance from the sample with index sampleID to the
*   cluster with index clusterID.
* Input&Output:
*   sampleID             - index of the sample in pSamples
*   clusterID            - index of the cluster in pCluster
*   pSamples, NumSamples - sample matrix and its row count
*   pCluster, nCurClusterNum - cluster array and its current length
*   NumDim               - feature dimension
* Returns: the distance as a double (the metric is not stated here --
*   confirm against the definition)
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: FindClosestCluster()
* Description: Finds the cluster that the sample with index sampleID
*   belongs to.
* Input&Output:
*   sampleID             - index of the sample in pSamples
*   pSamples, NumSamples - sample matrix and its row count
*   pCluster, nCurClusterNum - cluster array and its current length
*   NumDim               - feature dimension
* Returns: presumably the index of the closest cluster -- confirm
*   against the definition
****************************************************************/
int FindClosestCluster(int sampleID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: ReClassify()
* Description: Recomputes which cluster every sample belongs to.
* Input&Output:
*   pSamples, NumSamples - sample matrix and its row count
*   pCluster, nCurClusterNum - cluster array and its current length
*   NumDim               - feature dimension
* Returns: nothing
****************************************************************/
void ReClassify(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: RemoveCenterWithLessNum()
* Description: Applies the minimum-size threshold (theta_N, passed as
*   nThrelNum) to every cluster: a cluster whose nSampleNum is below the
*   threshold is dropped (per the function name -- confirm against the
*   definition). Only the cluster centers are updated here, because the
*   caller re-clusters by center right after this call.
* Input&Output:
*   pCluster    - cluster array, updated in place
*   pClusterNum - current cluster count, passed by pointer
*   nThrelNum   - minimum number of samples a cluster must have
*   nNumDim     - feature dimension
* Returns: a short whose meaning is not stated here -- TODO confirm
****************************************************************/
short RemoveCenterWithLessNum(PCLASSCLUSTER pCluster, int* pClusterNum, int nThrelNum, int nNumDim);
/***************************************************************
* Function: CalcNewClustCenters()
* Description: Recomputes the cluster centers.
* Input&Output:
*   pSamples, NumSamples - sample matrix and its row count
*   pCluster, nCurClusterNum - cluster array and its current length
*   NumDim               - feature dimension
* Returns: an int whose meaning is not stated here -- TODO confirm
****************************************************************/
int CalcNewClustCenters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: CalAveDistInCluster()
* Description: Computes, for every cluster, the average distance from
*   its samples to the cluster center, and also the overall average
*   distance.
* Input&Output:
*   pCluster, nClusterNum - cluster array and its current length
*   pSamples, nNumSamples - sample matrix and its row count
*   nNumDim               - feature dimension
* Returns: presumably the overall average distance -- confirm against
*   the definition
****************************************************************/
double CalAveDistInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);
/***************************************************************
* Function: CalDimSigmaInCluster()
* Description: Computes, for every cluster, the standard deviation of
*   each feature component.
* Input&Output:
*   pCluster, nClusterNum - cluster array and its current length
*   pSamples, nNumSamples - sample matrix and its row count
*   nNumDim               - feature dimension
* Returns: nothing
****************************************************************/
void CalDimSigmaInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);
/***************************************************************
* Function: CalAveDistBetween2Centers()
* Description: Computes the pairwise distances between all clusters.
* Input&Output:
*   pCluster, nClusterNum - cluster array and its current length
*   nNumDim               - feature dimension
*   pDistBetTwoClusters   - distance table, indexed as [i][j] by its
*                           consumer
* Returns: nothing
****************************************************************/
void CalAveDistBetween2Centers(PCLASSCLUSTER pCluster, int nClusterNum, int nNumDim, double** pDistBetTwoClusters);
/***************************************************************
* Function: DivideClusters()
* Description: Split step. At most one split is performed per call: the
*   first cluster that satisfies the split condition is split.
* Input&Output:
*   pCluster                - cluster array, updated in place
*   pClusterNum             - current cluster count, passed by pointer
*   nNumDim                 - feature dimension
*   dAveTotalCluster        - overall average sample-to-center distance
*   SIGMA_THRELD            - standard-deviation threshold for splitting
*   MAX_SAMPLES_ONE_CLUSTER - sample-count parameter (role not stated
*                             here -- TODO confirm)
*   EXEPECT_CLUSTER_NUM     - expected number of clusters
*   MIN_SAMPLES_ONE_CLUSTER - minimum number of samples per cluster
* Returns: an int; the visible caller compares it with 1 before going
*   back to re-classification, so 1 presumably means "a split happened"
*   -- confirm against the definition.
* NOTE(review): the visible call in ISODATA.c passes the same value for
*   MAX_SAMPLES_ONE_CLUSTER and MIN_SAMPLES_ONE_CLUSTER -- confirm that
*   this is intended.
****************************************************************/
int DivideClusters(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double dAveTotalCluster, double SIGMA_THRELD, int MAX_SAMPLES_ONE_CLUSTER, int EXEPECT_CLUSTER_NUM, int MIN_SAMPLES_ONE_CLUSTER);
/***************************************************************
* Function: UnionByLessDistBetwCenter()
* Description: Merge step. Clusters are merged when their centers are
*   very close to each other.
* Input&Output:
*   pCluster            - cluster array, updated in place
*   pClusterNum         - current cluster count, passed by pointer
*   nNumDim             - feature dimension
*   MIN_CLUSTER_DIST    - center-distance threshold for merging
*   MERGE_CLUSTER_NUM   - presumably the number of pairs that may be
*                         merged per call -- TODO confirm
*   pDistBetTwoClusters - pairwise center-distance table
* Returns: an int whose meaning is not stated here -- TODO confirm
* NOTE(review): MIN_CLUSTER_DIST is declared int while the distance
*   table holds doubles; with this prototype in scope a fractional
*   threshold is truncated at the call -- confirm that is intended.
****************************************************************/
int UnionByLessDistBetwCenter(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, int MIN_CLUSTER_DIST, int MERGE_CLUSTER_NUM, double** pDistBetTwoClusters);
/***************************************************************
* Function: SaveCenters()
* Description: Saves the resulting cluster centers.
* Input&Output:
*   pFilePath      - presumably the path of the output file -- confirm
*   Cluster        - cluster array
*   nCurClusterNum - number of clusters in Cluster
*   NumDim         - feature dimension
* Returns: nothing
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: SaveClusters()
* Description: Saves the clustered samples (the clustering result).
* Input&Output:
*   pFilePath      - presumably the path of the output file -- confirm
*   Pattern        - sample matrix
*   NumSamples     - number of samples in Pattern
*   Cluster        - cluster array
*   nCurClusterNum - number of clusters in Cluster
*   NumDim         - feature dimension
* Returns: nothing
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int nCurClusterNum, int NumDim);
/***************************************************************
* Function: RunISODATA()
* Description: Entry point that runs the ISODATA clustering on the
*   samples (per the function name; the header previously found here
*   was a copy of the UnionByLessDistBetwCenter() one).
* Input&Output:
*   pSamples, nNumSamples - sample matrix and its row count
*   pCluster              - cluster array to work on
*   nNumDim               - feature dimension
*   pCenterIndex          - initial-center indices
*   nCurClusterNum        - cluster count, passed by pointer; presumably
*                           updated as clusters are split and merged --
*                           confirm against the definition
* Returns: nothing
****************************************************************/
void RunISODATA(double** pSamples, int nNumSamples, PCLASSCLUSTER pCluster, int nNumDim, int* pCenterIndex, int* nCurClusterNum);
#ifdef __cplusplus
}
#endif
#endif
ISODATA.c
/***********************************
* Author: liangdas
* Time: 20140924
* Version: 0_20140924
* Contact:
* QQ: 358536026 Email: [email protected]
* Working place:Beijing Samsung Telecom R&D Center
************************************/
#include "ISODATA.h"
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __cplusplus
extern "C"{
#endif
/********************************************************
* Function: LoadPatterns()
* Description: Loads the sample set from a whitespace-separated text file.
* Input&Output:
*   fname           - [in]  path of the file to read
*   pSamples        - [out] caller-allocated rows; pSamples[i][j] receives
*                           feature j of sample i
*   pNumSamples     - [out] sample count read from the file
*   pClusterNum     - [out] number of clusters read from the file
*   pNumDim         - [out] feature dimension read from the file
*   pOrgCenterIndex - [out] one initial-center index per cluster
* Return: SUCCESS when every value was read. FAILURE when an argument is
*   NULL, the file cannot be opened, a value is missing or malformed, or
*   one of the three counts is smaller than 1. On FAILURE the output
*   buffers may be partially filled.
* File format (in reading order):
*   sample count, feature dimension, number of clusters,
*   one initial-center index per cluster,
*   then the feature values of every sample, sample by sample.
* NOTE(review): the counts come straight from the file and are not
*   compared with the capacity of pSamples / pOrgCenterIndex, because that
*   capacity is not passed in. If the caller sizes its buffers from fixed
*   limits, the counts should also be checked against those limits.
*********************************************************/
int LoadPatterns(char* fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex)
{
    FILE* InFilePtr;
    int nSamples = 0;
    int nDim = 0;
    int nClusters = 0;
    int nResult = FAILURE;
    int i, j;
    double x;

    if (fname == NULL || pSamples == NULL || pNumSamples == NULL
        || pClusterNum == NULL || pNumDim == NULL || pOrgCenterIndex == NULL)
    {
        return FAILURE;
    }

    /* "r" is the portable spelling of text mode; "rt" is an extension. */
    InFilePtr = fopen(fname, "r");
    if (InFilePtr == NULL)
    {
        return FAILURE;
    }

    /* Header: every fscanf must convert exactly one item, otherwise the
     * counts would be indeterminate and drive the loops below. */
    if (fscanf(InFilePtr, "%d", &nSamples) != 1
        || fscanf(InFilePtr, "%d", &nDim) != 1
        || fscanf(InFilePtr, "%d", &nClusters) != 1)
    {
        goto cleanup;
    }
    if (nSamples < 1 || nDim < 1 || nClusters < 1)
    {
        goto cleanup;
    }

    *pNumSamples = nSamples;
    *pNumDim = nDim;
    *pClusterNum = nClusters;

    for (i = 0; i < nClusters; i++)
    {
        if (fscanf(InFilePtr, "%d", &pOrgCenterIndex[i]) != 1)
        {
            goto cleanup;
        }
    }

    for (i = 0; i < nSamples; i++)
    {
        for (j = 0; j < nDim; j++)
        {
            if (fscanf(InFilePtr, "%lg", &x) != 1)
            {
                goto cleanup;
            }
            pSamples[i][j] = x;
        }
    }

    nResult = SUCCESS;

cleanup:
    /* Single exit so the stream is closed on every path after fopen. */
    fclose(InFilePtr);
    return nResult;
}
/***************************************************************
* Function: InitClusters()
* Description: 指定初始类别中心,这个函数取的是样本序列的前nCurClusterNum样本作为聚类的初始类别中心
* Input&Output:
* Returns:
****************************************************************/
void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex)
{
int i, j;
printf("Initial cluster centers:\n");
if(nCurClusterNum > NumSamples)
{
printf("class number exceed to sample number\n");
}
for (i=0; i= SIGMA_THRELD)
{
if((pCluster[i].nAveDistToCenter > dAveTotalCluster)
&&(pCluster[i].nSampleNum > 2*(MIN_SAMPLES_ONE_CLUSTER+1))
||(*pClusterNum <= EXEPECT_CLUSTER_NUM/2))
{
j = pCluster[i].nMaxDeltaOfFeatureIndex;
for(l=*pClusterNum; l>i; l--)
{
for(k=0; kk; l--)
{
pTwoNearCluster[l] = pTwoNearCluster[l-1];
}
#endif
pTwoNearCluster[k].dist = pDistBetTwoClusters[i][j];
pTwoNearCluster[k].nIndexI = i;
pTwoNearCluster[k].nIndexJ = j;
k++;
break;
}
//}
}
}
}
k = (kpTwoNearCluster[j].dist)
{
double tmpval = pTwoNearCluster[i].dist;
int tmpIndexI = pTwoNearCluster[i].nIndexI;
int tmpIndexJ = pTwoNearCluster[i].nIndexJ;
pTwoNearCluster[i].dist = pTwoNearCluster[j].dist;
pTwoNearCluster[i].nIndexI = pTwoNearCluster[j].nIndexI;
pTwoNearCluster[i].nIndexJ = pTwoNearCluster[j].nIndexJ;
pTwoNearCluster[j].dist = tmpval;
pTwoNearCluster[j].nIndexI = tmpIndexI;
pTwoNearCluster[j].nIndexJ = tmpIndexJ;
}
}
}
//合并类别
for(i=0; i-1 && nIndexJ>-1)
{
if((pCluster[nIndexI].nSampleNum<0)||(pCluster[nIndexJ].nSampleNum<0))
{
continue;
}
for(j=0; j= 2*EXPECT_CLUSTER_NUM)
{
goto step8;
}
else
{
if(iter%2 == 1)
{
goto step7; //分裂操作
}
else
{
goto step8; //合并操作
}
}
step7: //分裂操作
CalDimSigmaInCluster(pCluster, *pCurClusterNum, pSamples, nNumSamples, nNumDim);
if(1 == DivideClusters(pCluster, pCurClusterNum, nNumDim, dAveTotalCluster, SIGMA_THRELD, MIN_SAMPLES_ONE_CLUSTER, EXPECT_CLUSTER_NUM, MIN_SAMPLES_ONE_CLUSTER))
{
iter++;
goto step2;
}
step8: //合并操作
//计算所有类别中,两两之间的距离
CalAveDistBetween2Centers(pCluster, *pCurClusterNum, nNumDim, pDistBetTwoClusters);
//根据类别间的两两之间的距离,合并一些类别
UnionByLessDistBetwCenter(pCluster, pCurClusterNum, nNumDim, MIN_CLUSTER_DIST,
ALLOW_MERGE_CLUSTER_NUM, pDistBetTwoClusters);
step9:
if(iter >= I) //判断循环还是退出
{
printf("---------------经过 %d 次迭代,达到迭代次数--------------\n",iter);
return;
}
else
{
char ch = 0;
iter++;
printf("本次迭代完成,是否需要改变参数(Y/N)??:\n");
while(!isspace(ch = getchar()));
if(ch == 'y'||ch == 'Y')
{
goto start;
}
else goto step2;
}
//delete memory
for(i=0; i
main.cpp
#include
#include
#include
#include
#include "ISODATA.h"
int main(int argc, char *argv[])
{
double** pSamples;
int nNumSamples; // Number of samples
CLASSCLUSTER pCluster[MAX_CLUSTER_NUM];
int nCurClusterNum; // Number of clusters
int pOrgCenterIndex[MAX_CLUSTER_NUM];
int nNumDim; // Number of dimensions in vector
int i = 0;
pSamples = (double**)malloc(sizeof(double*)*MAX_SAMPLES);
for(i=0; i
ps:使用或者转载请标明出处,禁止以商业为目的的使用。
如果有需要word版,或者是pdf版的,请与我联系,QQ:358536026