ISODATA源代码

                                                             ISODATA源代码

作者:liangdas

出处:简单点儿,通俗点儿,机器学习     http://blog.csdn.net/liangdas/article/details/39809845

 

 

        下面是我写的ISODATA的源代码,共有三个文件:ISODATA.h头文件、ISODATA.c文件和main.cpp文件。ISODATA.h和ISODATA.c文件中引用了系统的stdio.h、stdlib.h、math.h文件。

        这个书写风格和前面的K-Means类似。

        后面的main.cpp是介绍怎么使用的,输入是按txt格式存贮的,测试数据的存贮格式和前面的K-Means也类似:

sample number(样本总数)

feature number(特征维数)

intend class number(待分类的类别)

initial class center index(初始类别中心)

feature list as(特征列表):

feature1 feature2 ...

feature1 feature2 ...
 

......

       当然可以自己定义数据的格式,并重新写LoadPatterns()函数。

      

       ISODATA.h

/***********************************
*	Author: liangdas 
*	Time: 20140924
*	Version: 0_20140924
*	Contact: 
*		QQ: 358536026  Email: [email protected]
*	Working place: Beijing Samsung Telecom R&D Center
************************************/

#ifndef __ISODATA_H__
#define __ISODATA_H__

#ifdef __cplusplus
extern "C"{
#endif

/* Status codes returned by functions such as LoadPatterns(). */
#define         SUCCESS         1
#define         FAILURE         0


#define MAX_SAMPLES  1000       //maximum number of samples
#define MAX_CLUSTER_NUM	 40		//maximum number of clusters (set this to more than twice the expected number of clusters)
#define MAX_DIM   10           //maximum sample dimension
// NOTE(review): one-letter macro name -- every identifier `h` in code that
// includes this header is replaced by 0.5; consider a longer name.
#define h  0.5                 //ratio used when splitting a cluster
#define MAXDOUBLE   1.0e20     //value used as the "largest" double
#define DIM  2                 //actual sample dimension

	/* One entry describing the distance between a pair of clusters. */
	typedef  struct stTwoClusterDist
	{
		double dist;	/* distance between the two clusters, copied from the pairwise distance table */
		int nIndexI;	/* index of the first cluster of the pair */
		int nIndexJ;	/* index of the second cluster of the pair */
	}TWONEARCLUSTER, *PTWONEARCLUSTER;

	/* State kept for one cluster. */
	typedef struct stCluster
	{
		double Center[MAX_DIM];			//center vector, one value per feature dimension (original note: "sample data")
		int pMemberIndex[MAX_SAMPLES];  //indices pointing into the whole data set (presumably the members of this cluster -- confirm)
		int nSampleNum;					//number of samples in this cluster
		double nAveDistToCenter;		//presumably the average member-to-center distance (see CalAveDistInCluster) -- confirm
		double fDeltaOfFeature[MAX_DIM];//standard deviation of each feature component
		int nMaxDeltaOfFeatureIndex;	//index of the largest component of the within-cluster standard-deviation vector

	}CLASSCLUSTER, *PCLASSCLUSTER;
	/********************************************************
	* Function: LoadPatterns()
	* Description: Load the sample list from the text file named fname.
	* Input&Output:
	*	fname           - path of the input file
	*	pSamples        - out: pSamples[i][j] receives feature j of sample i
	*	pNumSamples     - out: number of samples
	*	pClusterNum     - out: number of clusters to produce
	*	pNumDim         - out: feature dimension
	*	pOrgCenterIndex - out: one initial-center index per cluster
	* Return: SUCCESS, or FAILURE when the file cannot be opened
	* File format: sample count, feature dimension, number of target
	*	clusters, the initial center indices, then the samples ...
	*********************************************************/
	int LoadPatterns(char *fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex);

	/***************************************************************
	* Function: InitClusters()
	* Description: Set the initial cluster centers. According to the
	*	original note, this function takes the first nCurClusterNum
	*	samples of the sample sequence as the initial cluster centers.
	*	NOTE(review): the signature also receives pCenterIndex --
	*	confirm whether the centers are actually chosen through it.
	* Input&Output:
	* Returns:
	****************************************************************/
	void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex);

	/***************************************************************
	* Function: CalcuDistance()
	* Description: Distance from the sample with index sampleID to the
	*	cluster with index clusterID.
	* Input&Output:
	* Returns: the distance (the metric is defined by the implementation)
	****************************************************************/
	double CalcuDistance(int sampleID, int clusterID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: FindClosestCluster()
	* Description: Find the cluster that the sample with index sampleID
	*	belongs to.
	* Input&Output:
	* Returns: presumably the index of the closest cluster -- confirm
	*	against the definition
	****************************************************************/
	int FindClosestCluster(int sampleID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: ReClassify()
	* Description: Recompute which cluster each sample belongs to.
	* Input&Output:
	* Returns:
	****************************************************************/
	void ReClassify(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: RemoveCenterWithLessNum()
	* Description: Decide by the threshold theta_N: handles clusters whose
	*	sample count (nSampleNum) is below theta_N (the original sentence
	*	is unfinished; presumably such clusters are removed -- confirm).
	*	Only the cluster centers are updated here, because the caller
	*	re-clusters by center right after calling this function.
	* Input&Output:
	* Returns:
	****************************************************************/
	short  RemoveCenterWithLessNum(PCLASSCLUSTER pCluster, int* pClusterNum, int nThrelNum, int nNumDim);

	/***************************************************************
	* Function: CalcNewClustCenters()
	* Description: Recompute the cluster centers.
	* Input&Output:
	* Returns:
	****************************************************************/
	int  CalcNewClustCenters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: CalAveDistInCluster()
	* Description: Compute, for every cluster, the average distance from
	*	its samples to the cluster center, and the overall average distance.
	* Input&Output:
	* Returns: presumably the overall average distance -- confirm against
	*	the definition
	****************************************************************/
	double CalAveDistInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);

	/***************************************************************
	* Function: CalDimSigmaInCluster()
	* Description: Compute, for every cluster, the standard deviation of
	*	each feature dimension.
	* Input&Output:
	* Returns:
	****************************************************************/
	void CalDimSigmaInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);

	/***************************************************************
	* Function: CalAveDistBetween2Centers()
	* Description: Compute the pairwise distances between all clusters.
	* Input&Output:
	*	pDistBetTwoClusters - presumably receives the distance table,
	*	indexed [i][j] by cluster -- confirm against the definition
	* Returns:
	****************************************************************/
	void CalAveDistBetween2Centers(PCLASSCLUSTER pCluster, int nClusterNum, int nNumDim, double** pDistBetTwoClusters);

	/***************************************************************
	* Function: DivideClusters()
	* Description: Split a cluster. Only one split is performed per call,
	*	and the first cluster found that satisfies the split condition is
	*	the one that gets split.
	* Input&Output:
	* Returns: the caller compares the result with 1 and restarts the
	*	iteration when it matches (presumably 1 means a split happened --
	*	confirm against the definition)
	****************************************************************/
	int DivideClusters(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double dAveTotalCluster, double SIGMA_THRELD, int MAX_SAMPLES_ONE_CLUSTER, int EXEPECT_CLUSTER_NUM, int MIN_SAMPLES_ONE_CLUSTER);

	/***************************************************************
	* Function: UnionByLessDistBetwCenter()
	* Description: Merge operation; two clusters are merged when their
	*	cluster centers are very close to each other.
	*	NOTE(review): MIN_CLUSTER_DIST is declared int, so a fractional
	*	distance threshold passed by the caller is truncated -- confirm
	*	this is intended.
	* Input&Output:
	* Returns:
	****************************************************************/
	int UnionByLessDistBetwCenter(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, int MIN_CLUSTER_DIST, int MERGE_CLUSTER_NUM, double** pDistBetTwoClusters);

	/***************************************************************
	* Function: SaveCenters()
	* Description: Save the resulting cluster centers.
	* Input&Output:
	* Returns:
	****************************************************************/
	void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: SaveClusters()
	* Description: Save the clustered samples (the clustering result).
	* Input&Output:
	* Returns:
	****************************************************************/
	void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
		int nCurClusterNum, int NumDim);

	/***************************************************************
	* Function: RunISODATA()
	* Description: NOTE(review): the original header here was a copy of
	*	the UnionByLessDistBetwCenter() description. Presumably this is
	*	the top-level driver that runs the ISODATA iteration on pSamples
	*	and reports the final cluster count through nCurClusterNum --
	*	confirm against the definition.
	* Input&Output:
	* Returns:
	****************************************************************/
	void RunISODATA(double** pSamples, int nNumSamples, PCLASSCLUSTER pCluster, int nNumDim, int* pCenterIndex, int* nCurClusterNum);


#ifdef __cplusplus
}
#endif

#endif

 

ISODATA.c

 

 

/***********************************
*	Author: liangdas 
*	Time: 20140924
*	Version: 0_20140924
*	Contact: 
*		QQ: 358536026  Email: [email protected]
*	Working place: Beijing Samsung Telecom R&D Center
************************************/
#include "ISODATA.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>

#ifdef __cplusplus
extern "C"{
#endif

	/********************************************************
	* Function: LoadPatterns()
	* Description: Load the sample list from a whitespace-separated
	*	text file.
	* File format (in reading order):
	*	sample count, feature dimension, number of target clusters,
	*	one initial-center index per cluster,
	*	then sample count * feature dimension feature values.
	* Input&Output:
	*	fname           - path of the file to read
	*	pSamples        - out: pSamples[i][j] receives feature j of
	*	                  sample i; the rows must already be allocated
	*	                  by the caller
	*	pNumSamples     - out: number of samples
	*	pClusterNum     - out: number of clusters
	*	pNumDim         - out: feature dimension
	*	pOrgCenterIndex - out: initial center index of each cluster
	* Return: SUCCESS when the whole file was parsed; FAILURE when the
	*	file cannot be opened, a count is negative, or the file is
	*	malformed or truncated. On FAILURE the output arguments may
	*	be partially written.
	*********************************************************/
	int LoadPatterns(char* fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex)
	{
		FILE* InFilePtr;
		int    i, j;
		int    nStatus = FAILURE;
		double x;

		if((InFilePtr = fopen(fname, "rt")) == NULL)
		{
			return FAILURE;
		}

		/* Header: the three counts, read in the order the file stores them. */
		if(fscanf(InFilePtr, "%d", pNumSamples) != 1
			|| fscanf(InFilePtr, "%d", pNumDim) != 1
			|| fscanf(InFilePtr, "%d", pClusterNum) != 1)
		{
			goto cleanup;
		}

		/* NOTE(review): the counts are not bounded against the capacity of
		   the caller's buffers because this function does not receive it;
		   the caller must size pSamples and pOrgCenterIndex accordingly. */
		if(*pNumSamples < 0 || *pNumDim < 0 || *pClusterNum < 0)
		{
			goto cleanup;
		}

		for(i=0; i<*pClusterNum; i++)
		{
			if(fscanf(InFilePtr, "%d ", &pOrgCenterIndex[i]) != 1)
			{
				goto cleanup;
			}
		}

		for(i=0; i<*pNumSamples; i++)
		{
			for(j=0; j<*pNumDim; j++)
			{
				/* Stop at the first missing or non-numeric value instead of
				   storing an indeterminate x into the sample table. */
				if(fscanf(InFilePtr, "%lg", &x) != 1)
				{
					goto cleanup;
				}
				pSamples[i][j] = x;
			}
		}
		nStatus = SUCCESS;

cleanup:
		/* Single exit path so the stream is closed on success and on error. */
		fclose(InFilePtr);
		return nStatus;
	}

	/***************************************************************
	* Function: InitClusters()
	* Description: 指定初始类别中心,这个函数取的是样本序列的前nCurClusterNum样本作为聚类的初始类别中心
	* Input&Output:
	* Returns:
	****************************************************************/
	void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex)
	{
		int i, j;

		printf("Initial cluster centers:\n");
		if(nCurClusterNum > NumSamples)
		{
			printf("class number exceed to sample number\n");
		}

		for (i=0; i= SIGMA_THRELD)
			{
				if((pCluster[i].nAveDistToCenter > dAveTotalCluster)
				&&(pCluster[i].nSampleNum > 2*(MIN_SAMPLES_ONE_CLUSTER+1))
				||(*pClusterNum <= EXEPECT_CLUSTER_NUM/2))
				{
					j = pCluster[i].nMaxDeltaOfFeatureIndex;
					for(l=*pClusterNum; l>i; l--)
					{
						for(k=0; kk; l--)
						{
							pTwoNearCluster[l] = pTwoNearCluster[l-1];
						}
#endif
						pTwoNearCluster[k].dist = pDistBetTwoClusters[i][j];
						pTwoNearCluster[k].nIndexI = i;
						pTwoNearCluster[k].nIndexJ = j;
						k++;
						break;
					}
					//}
				}
			}
		}
		k = (kpTwoNearCluster[j].dist)
				{
					double tmpval = pTwoNearCluster[i].dist;
					int tmpIndexI = pTwoNearCluster[i].nIndexI;
					int tmpIndexJ = pTwoNearCluster[i].nIndexJ;

					pTwoNearCluster[i].dist = pTwoNearCluster[j].dist;
					pTwoNearCluster[i].nIndexI = pTwoNearCluster[j].nIndexI;
					pTwoNearCluster[i].nIndexJ = pTwoNearCluster[j].nIndexJ;

					pTwoNearCluster[j].dist = tmpval;
					pTwoNearCluster[j].nIndexI = tmpIndexI;
					pTwoNearCluster[j].nIndexJ = tmpIndexJ;
				}
			}
		}
		//合并类别
		for(i=0; i-1 && nIndexJ>-1)
			{

				if((pCluster[nIndexI].nSampleNum<0)||(pCluster[nIndexJ].nSampleNum<0))
				{
					continue;
				}
				for(j=0; j= 2*EXPECT_CLUSTER_NUM)
		{
			goto step8; 
		}
		else
		{
			if(iter%2 == 1)
			{
				goto step7;  //分裂操作
			}
			else
			{
				goto step8;  //合并操作
			}
		}

step7: //分裂操作
		CalDimSigmaInCluster(pCluster, *pCurClusterNum, pSamples, nNumSamples, nNumDim);
		if(1 == DivideClusters(pCluster, pCurClusterNum, nNumDim, dAveTotalCluster, SIGMA_THRELD, MIN_SAMPLES_ONE_CLUSTER, EXPECT_CLUSTER_NUM, MIN_SAMPLES_ONE_CLUSTER))
		{
			iter++;
			goto step2;
		}

step8: //合并操作
		//计算所有类别中,两两之间的距离
		CalAveDistBetween2Centers(pCluster, *pCurClusterNum, nNumDim, pDistBetTwoClusters);
		//根据类别间的两两之间的距离,合并一些类别
		UnionByLessDistBetwCenter(pCluster, pCurClusterNum, nNumDim, MIN_CLUSTER_DIST,
			ALLOW_MERGE_CLUSTER_NUM, pDistBetTwoClusters);

step9:
		if(iter >= I)         //判断循环还是退出
		{
			printf("---------------经过 %d 次迭代,达到迭代次数--------------\n",iter);
			return;
		}
		else
		{
			char ch = 0;
			iter++;
			printf("本次迭代完成,是否需要改变参数(Y/N)??:\n");
			while(!isspace(ch = getchar()));
			if(ch == 'y'||ch == 'Y')
			{
				goto start;
			}
			else goto step2;

		}
		//delete memory
		for(i=0; i


main.cpp

 

 

#include 
#include 
#include 
#include 
#include "ISODATA.h"


int main(int argc, char *argv[])
{	
	double** pSamples;
	int nNumSamples;			// Number of samples

	CLASSCLUSTER pCluster[MAX_CLUSTER_NUM];
	int nCurClusterNum;			// Number of clusters

	int pOrgCenterIndex[MAX_CLUSTER_NUM];
	int nNumDim;				// Number of dimensions in vector

	int i = 0;

	pSamples = (double**)malloc(sizeof(double*)*MAX_SAMPLES);
	for(i=0; i

 

 

 

ps:使用或者转载请标明出处,禁止以商业为目的的使用。

如果有需要word版,或者是pdf版的,请与我联系,QQ358536026

 

 

 

 

 

 

 

 

你可能感兴趣的:(机器学习,源代码,ISODATA)