机器学习分为有监督学习和无监督学习两种,有监督学习包括分类和回归,无监督学习中一个重要的部分就是聚类了。在我看来,聚类主要有两个准则和一个思想。两个准则是:类内距离最小,类间距离最大;一个思想是:EM(Estimation andMaximization)思想。类内距离最小准则表现在如K-均值法、模糊C-均值法(fussy c-means,FCM)等算法中;类间距离最大准则则表现在分层聚类算法中。仔细研究基于这两个准则的算法的理论,可以看到EM思想的影子,于是直觉告诉我聚类的两个准则和EM思想是等价或包含的关系,对于这样的猜想的证明不是在我能力范围内的,这里只是提出这样的一个观点。
EM思想来源于极大似然估计理论,极大似然估计本质是这样的:对于现有的一个样本集,已知其总体分布,现在要对这个总体分布的参数进行估计,使得在该参数下的总体分布模型产生的总体中,进行采样得到现有样本集的概率最大。要说明的是已有的样本集中的样本不是独立同分布的,而是不同类别的样本服从不同的分布,这也是监督学习和无监督学习的一个区别,监督学习的样本一般是独立同分布的。这就涉及到这样一个问题:是不是标记了的训练样本就一定要采用有监督的学习方法呢?一般情况是这样的,但是有时候对于类间交叉部分样本而言,无监督学习所得到的错误率比有监督学习得到的错误率要低,产生这种情况的本质就是监督学习和无监督学习在假设训练样本总体分布时的不同。因此对于有标记的训练样本集,可以先对不同类别的训练样本进行假设检验,如不同类别训练样本的方差有无显著性差异等,再决定选用有监督学习还是无监督学习。
对EM思想介绍较好的可以参考博客:http://blog.csdn.net/zouxy09。
再说说混合高斯模型(GMM)吧,GMM实质上式EM算法的一个特例,该算法假设训练样本集中不同类别的样本服从不同的高斯分布。通过构造极大似然函数L(P,Z)来进行参数估计,该似然函数有两个未知参数:P,样本的总体分布参数;Z样本的类别。如果直接对L求导数是不能直接同时求出P和Z的,但是如果知道其中的任意一个参数都可以将另外一个参数求出来,于是先随机假设训练样本的类别,然后求出估计参数P,再由在估计参数P下的总体分布更新训练样本的新类别,就在这样的一个循环迭代的过程中使得极大似然函数L(P,X)值达到极大,详细理论可以参看EM算法推导过程,不过这可能是一个局部极大值。EM算法是一个非常伟大的算法,其推导过程也展现了数学的无穷魅力,这里不详述了,有时间慢慢品味吧!
最后给出一个用C++实现的GMM算法的源码吧,虽然这样的源码也容易找到,找了几个,不过都觉得他们的代码太难理解了,于是花了两天时间自己写了一个,希望您有所帮助!GMM算法理论参考博文:
http://blog.csdn.net/junnan321/article/details/8483351
头文件: #ifndef MYGMM_H #define MYGMM_H class MyGmm{ public: MyGmm():m_dtNum(0), m_dtDim(0), m_numCluster(0), m_data(0), m_weight(0), m_p(0), m_mean(0), m_covariance(0){} ~MyGmm(){ Dispose(); } /************************************** *The most important interface for users. ***************************************/ void Train(double** data, int dtNum, int dtDim, int numCluster, int maxiter); /************************************ * Show those internal data interface. *************************************/ void ShowWeight(); void ShowP(); void ShowMean(); void ShowCovariance(); void ShowData(); private: /************************ *Prevent copy and assign. *************************/ MyGmm(const MyGmm&); const MyGmm& operator=(const MyGmm&); /*************************************************** * Repeat the E step and M step untile it reaches * a certain condition according to your requierment. ****************************************************/ void Iterate(); /*********** *E step. ************/ void ComputeEstimationWeight(); /******** *M step. *********/ void ComputeMaximizationCondition(); /*********************************************** *Set those dynamically allocated memory to zero, *such as m_weight,m_p, etc. ************************************************/ void SetM_CovrianceZero(); void SetM_PZero(); void SetM_WeightZero(); void SetM_MeanZero(); /*********************************************************** *Initialize random cluster, and compute probability of each *class which cosumes the whole samples,mean of each class' *samples,covariance of each class. ************************************************************/ void InitRandomCluster(); /********************** *Allocate temp space *such as m_weight,m_p, etc. ***********************/ bool AllocateTempSpace(double** data, int dtNum, int dtDim, int numCluster, int maxiter); /******************************************************************** *Release those dynamically allocated memory,such as m_weight,m_p, etc. *********************************************************************/ bool Dispose(); private: int m_dtNum; //number of samples int m_dtDim; //dimentions of a sample int m_numCluster; //number of clusters you want to classify double** m_data; //dataset of samples /*E step use*/ double** m_weight; //estimation of probility of each sample that blongs to each class /*M step use*/ double* m_p; //probability of each class which cosumes the whole samples double** m_mean; //mean of each class' samples double*** m_covariance; //covariance of each class int m_maxiter; //for the max iterate times }; #endif 实现文件: #include <iostream> #include <assert.h> #include <math.h> #include <string.h> #include <cstdlib> #include "MyGmm.h" /*Train interface*/ void MyGmm::Train(double** data, int dtNum, int dtDim, int numCluster, int maxiter){ assert(data!=0 && dtNum>0 && dtDim>0 && numCluster>0 && maxiter>0); AllocateTempSpace(data, dtNum, dtDim, numCluster, maxiter); InitRandomCluster(); Iterate(); } /*Allocate temp space for compute use*/ bool MyGmm::AllocateTempSpace(double** data, int dtNum, int dtDim, int numCluster, int maxiter){ m_data = data; m_dtNum = dtNum; m_dtDim = dtDim; m_numCluster = numCluster; m_maxiter = maxiter; m_weight = new double*[m_dtNum]; int i = 0; while( i<m_dtNum ){ m_weight[i] = new double[m_numCluster]; memset(m_weight[i], 0, sizeof(double)*m_numCluster); i++; } m_p = new double[m_numCluster]; memset(m_p, 0, sizeof(double)*m_numCluster); m_mean = new double*[m_numCluster]; i = 0; while( i<m_numCluster ){ m_mean[i] = new double[m_dtDim]; memset(m_mean[i], 0, sizeof(double)*m_dtDim); i++; } m_covariance = new double**[m_numCluster]; for(i=0; i<m_numCluster; i++){ m_covariance[i] = new double*[m_dtDim]; int j = 0; while( j<m_dtDim ){ m_covariance[i][j] = new double[m_dtDim]; memset(m_covariance[i][j], 0, sizeof(double)*m_dtDim); j++; } } return true; } /*Initialize clusters*/ void MyGmm::InitRandomCluster(){ int* randomClass = new int[m_dtNum]; //random class matrix int i; /*initial E step*/ for(i=0; i<m_dtNum; i++){ randomClass[i] = rand() % m_numCluster; std::cout<<randomClass[i]<<" "; } std::cout<<std::endl; /*initial M step,compute m_p,m_mean,m_convariance*/ for(i=0; i<m_dtNum; i++){ m_p[randomClass[i]]++; //count number of samples for each class for(int j=0; j<m_dtDim; j++){ m_mean[randomClass[i]][j] += m_data[i][j]; //sum of each class's samples } } /*mean of each class' samples, m_mean*/ for(i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtDim; j++){ m_mean[i][j] /= m_p[i]; } } /*covariance of each class, m_covariance*/ double* temp = new double[m_dtDim]; memset(temp, 0, sizeof(double)*m_dtDim); for(i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtNum; j++){ for(int k=0; k<m_dtDim; k++){ temp[k] = m_data[j][k] - m_mean[randomClass[j]][k]; } for(int q=0; q<m_dtDim; q++){ for(int p=0; p<m_dtDim; p++){ m_covariance[randomClass[j]][q][p] += temp[q]*temp[p]; } } } for(int q=0; q<m_dtDim; q++){ for(int p=0; p<m_dtDim; p++){ m_covariance[i][q][p] /= m_p[i]; } } } /*probility of each sample that blongs to each class, m_p*/ for(i=0; i<m_numCluster; i++){ m_p[i] /= m_dtNum; } /*release memory*/ delete[] temp; delete[] randomClass; } /*Iterate clustering*/ void MyGmm::Iterate(){ bool loop = true; int iter = 1; while(loop){ ComputeEstimationWeight(); ComputeMaximizationCondition(); iter++; /* Add your condition here!*/ if(iter>m_maxiter) loop = false; } } /*Compute estimation weight*/ void MyGmm::ComputeEstimationWeight(){ int i; double* each_dis = new double[m_numCluster]; //for compute each sample's distribution probility in each Gaussion use for(i=0; i<m_dtNum; i++){ memset(each_dis, 1, sizeof(double)*m_numCluster); double full_dis = 0; //for compute each sample's full distribution use /*compute each sample's distribution probability in each Gaussion*/ for(int j=0; j<m_numCluster; j++){ double all_cov = 1.0; for(int k=0; k<m_dtDim; k++){ /* just regard each dimention of a sample is irrelavant if you want to compute more specificly, for example they are some sort of relavant, you can change the following expression*/ all_cov *= m_covariance[j][k][k]; each_dis[j] *= exp(-0.5*(m_data[i][k]-m_mean[j][k])*(m_data[i][k]-m_mean[j][k])/m_covariance[j][k][k]); } each_dis[j] *= 1 / sqrt(2 * 3.14159 * all_cov); full_dis += each_dis[j]*m_p[j]; } for(int k=0; k<m_numCluster; k++){ m_weight[i][k] = (each_dis[k]*m_p[k]) / full_dis; } } delete[] each_dis; } /*Compute maximization condition according to estimation weight*/ void MyGmm::ComputeMaximizationCondition(){ int i; /*compute the new m_p*/ SetM_PZero(); double total = 0.0; for(i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtNum; j++){ m_p[i] += m_weight[j][i]; } total += m_p[i]; } /*compute the new m_mean*/ SetM_MeanZero(); for(int k=0; k<m_numCluster; k++){ for(int l=0; l<m_dtNum; l++){ for(int m=0; m<m_dtDim; m++){ m_mean[k][m] += m_weight[l][k]*m_data[l][m]; } } for(int m=0; m<m_dtDim; m++){ m_mean[k][m] /= m_p[k]; } } /*compute the m_covariance*/ double* temp = new double[m_dtDim]; memset(temp, 0, sizeof(double)*m_dtDim); SetM_CovrianceZero(); for(i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtNum; j++){ for(int k=0; k<m_dtDim; k++){ temp[k] = m_data[j][k] - m_mean[i][k]; } for(int q=0; q<m_dtDim; q++){ for(int p=0; p<m_dtDim; p++){ m_covariance[i][q][p] += m_weight[j][i]*temp[q]*temp[p]; } } } for(int q=0; q<m_dtDim; q++){ for(int p=0; p<m_dtDim; p++){ m_covariance[i][q][p] /= m_p[i]; } } } /*standarize the m_p*/ for(int j=0; j<m_numCluster; j++){ m_p[j] /= total; } /*release memory*/ delete[] temp; } /*Delete those temp space*/ bool MyGmm::Dispose(){ if(m_weight!=0){ int i = 0; while( i<m_dtNum ){ delete[] m_weight[i]; i++; } delete[] m_weight; } if(m_p!=0){ delete[] m_p; } if(m_mean!=0){ int i = 0; while( i<m_numCluster ){ delete[] m_mean[i]; i++; } delete[] m_mean; } if(m_covariance!=0){ int i = 0; while( i<m_numCluster ){ int j = 0; while( j<m_dtDim ){ delete[] m_covariance[i][j]; j++; } i++; } for(i=0; i<m_numCluster; i++){ delete[] m_covariance[i] } delete[] m_covariance; } return true; } void MyGmm::SetM_CovrianceZero(){ for(int i=0; i<m_numCluster; i++){ int j = 0; while( j<m_dtDim ){ memset(m_covariance[i][j], 0.0, sizeof(double)*m_dtDim); j++; } } } /*Set those temp space to zero*/ void MyGmm::SetM_PZero(){ memset(m_p, 0.0, sizeof(double)*m_numCluster); } void MyGmm::SetM_WeightZero(){ int i = 0; while( i<m_dtNum ){ memset(m_weight[i], 0.0, sizeof(double)*m_numCluster); i++; } } void MyGmm::SetM_MeanZero(){ int i = 0; while( i<m_numCluster ){ memset(m_mean[i], 0.0, sizeof(double)*m_dtDim); i++; } } /*Show data in those temp space, also for debug use*/ void MyGmm::ShowWeight(){ for(int i=0; i<m_dtNum; i++){ for(int j=0; j<m_numCluster; j++){ std::cout<<m_weight[i][j]<<" "; } std::cout<<std::endl; } } void MyGmm::ShowP(){ for(int i=0; i<m_numCluster; i++) std::cout<<m_p[i]<<" "; std::cout<<"\n"; } void MyGmm::ShowMean(){ for(int i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtDim; j++){ std::cout<<m_mean[i][j]<<" "; } std::cout<<"\n"; } } void MyGmm::ShowCovariance(){ for(int i=0; i<m_numCluster; i++){ for(int j=0; j<m_dtDim; j++){ for(int k=0; k<m_dtDim; k++){ std::cout<<m_covariance[i][j][k]<<" "; } std::cout<<"\n"; } std::cout<<"\n"; } } void MyGmm::ShowData(){ for(int i=0; i<m_dtNum; i++){ for(int j=0; j<m_dtDim; j++){ std::cout<<m_data[i][j]<<" "; } std::cout<<std::endl; } } 测试文件: #include <iostream> #include <fstream> #include "MyGmm.h" using namespace std; int main() { double** test_data1 = new double*[10]; int i; for(i=0; i<10; i++) test_data1[i] = new double[3]; test_data1[0][0] = 8; test_data1[0][1] = 4; test_data1[0][2] = 9; test_data1[1][0]=2; test_data1[1][1]=3; test_data1[1][2]=5; test_data1[2][0]=7; test_data1[2][1]=6; test_data1[2][2]=1; test_data1[3][0]=45; test_data1[3][1]=10; test_data1[3][2]=12; test_data1[4][0]=14; test_data1[4][1]=18; test_data1[4][2]=7; test_data1[5][0]=1; test_data1[5][1]=18; test_data1[5][2]=22; test_data1[6][0]=4; test_data1[6][1]=8; test_data1[6][2]=17; test_data1[7][0]=11; test_data1[7][1]=18; test_data1[7][2]=27; test_data1[8][0]=24; test_data1[8][1]=18; test_data1[8][2]=27; test_data1[9][0]=19; test_data1[9][1]=18; test_data1[9][2]=32; MyGmm mg; mg.Train(test_data1, 10, 3, 3, 7); mg.ShowWeight(); for(i=0; i<5; i++) delete[] test_data1[i]; delete[] test_data1; return 0; }