我们使用k个多元高斯分布的混合高斯分布GMM来对数据进行聚类,其中每一个分布代表一个数据簇。首先,随机选择k个对象代表各个簇的均值(中心),猜测每一个簇的协方差矩阵,并假定初始状态时每个簇的概率相等;然后,根据多元高斯密度函数求出每一个对象属于每一个簇的概率,并求出数据的似然函数值;最后,根据每一个数据点属于每一个簇的概率,来更新每一个簇的均值、协方差矩阵和每一个簇的概率。不断迭代以上两步,直到算法收敛。这时我们根据每一个对象属于每一个簇的概率,将对象指派到概率最高的簇中。
关键部分就是EM算法部分。
算法中只知道每个向量的坐标,要将这些向量聚类到k个gauss分布中,但这k个cluster的高斯分布的参数未知,当然另一个未知量是各个向量所归属的类别。
首先初始化各个gauss分布的参数
E-step:
根据这k个gauss分布的参数求得各个向量归属各个cluster的概率矩阵
M-step:
根据E-step概率矩阵更新gauss分布参数
以上两步交替进行,直到收敛。
这样看来GMM聚类和k均值聚类是不是有些像。不过k均值给出的结果要么属于这一类,要么属于那一类,GMM给出的则是软性的。
下面是我的实现,先用kmeans初始化了聚类中心。
#include <set>
#include <vector>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <ctime>
#include <cassert>
#include <iostream>
using namespace std;

#define PI 3.14159265358979323846

class GMM;

/**
 * Plain k-means clustering. Used only to seed the GMM with reasonable
 * initial cluster centers and hard assignments.
 */
class kmeans {
    friend class GMM;
private:
    double** dataset;                  // datanums x dim matrix of points (not owned)
    int datanums;                      // number of data points
    unsigned int k;                    // number of clusters
    unsigned int dim;                  // dimensionality of each point
    typedef vector<double> Centroid;
    vector<Centroid> center;           // current centers
    vector<set<int> > cluster_ID;      // final assignment: cluster index -> point indices
    vector<Centroid> new_center;       // centers recomputed this iteration
    vector<set<int> > new_cluster_ID;  // assignment recomputed this iteration
    double threshold;                  // convergence: total squared center movement
private:
    void init();
    void assign();
    double distance(Centroid cen, int k2);
    void split(vector<set<int> >& clusters, int kk);
    void update_centers();
    bool isfinish();
public:
    kmeans() { threshold = 0.0001; }
    void apply(double** data, int datanum, int numofcluster, int dim);
};

/** Pick k distinct random data points as the initial centers. */
void kmeans::init() {
    center.resize(k);
    set<int> used;   // ids already chosen as centers
    for (unsigned int i = 0; i < k; i++) {
        int id = (int)(double(rand()) / double(RAND_MAX + 1.0) * datanums);
        while (used.find(id) != used.end()) {
            id = (int)(double(rand()) / double(RAND_MAX + 1.0) * datanums);
        }
        used.insert(id);
        center[i].assign(dataset[id], dataset[id] + dim);
    }
}

/** Converged when the total squared movement of all centers is below threshold. */
bool kmeans::isfinish() {
    double error = 0;
    for (unsigned int i = 0; i < k; i++)
        for (unsigned int j = 0; j < dim; j++)
            error += pow(center[i][j] - new_center[i][j], 2);
    return error < threshold;
}

/** Assign every point to its nearest center; refill any empty cluster. */
void kmeans::assign() {
    for (int j = 0; j < datanums; j++) {
        double mindis = 10000000;
        int belongto = -1;
        for (unsigned int i = 0; i < k; i++) {
            double dis = distance(center[i], j);
            if (dis < mindis) {
                mindis = dis;
                belongto = i;
            }
        }
        new_cluster_ID[belongto].insert(j);
    }
    // An empty cluster would make update_centers() divide by zero: split
    // the largest cluster into it.
    for (unsigned int i = 0; i < k; i++)
        if (new_cluster_ID[i].empty())
            split(new_cluster_ID, i);
}

/** Euclidean distance between center cen and data point k2. */
double kmeans::distance(Centroid cen, int k2) {
    double dis = 0;
    for (unsigned int i = 0; i < dim; i++)
        dis += pow(cen[i] - dataset[k2][i], 2);
    return sqrt(dis);
}

/**
 * Move part of the largest cluster into the empty cluster kk: perturb the
 * largest cluster's center by +/-DELTA and hand the points closer to the
 * "-DELTA" copy over to cluster kk.
 */
void kmeans::split(vector<set<int> >& clusters, int kk) {
    size_t maxsize = 0;
    int th = -1;   // index of the largest cluster
    for (unsigned int i = 0; i < k; i++) {
        if (clusters[i].size() > maxsize) {
            maxsize = clusters[i].size();
            th = i;
        }
    }
    const double DELTA = 1.0;
    vector<double> tpc1(dim), tpc2(dim);
    for (unsigned int i = 0; i < dim; i++) {
        tpc1[i] = center[th][i] + DELTA;
        tpc2[i] = center[th][i] - DELTA;
    }
    for (set<int>::iterator it = clusters[th].begin(); it != clusters[th].end(); it++) {
        if (distance(tpc2, *it) < distance(tpc1, *it))
            clusters[kk].insert(*it);
    }
    assert(!clusters[kk].empty());
    for (set<int>::iterator it = clusters[kk].begin(); it != clusters[kk].end(); it++)
        clusters[th].erase(*it);
}

/** Recompute each center as the mean of the points assigned to it. */
void kmeans::update_centers() {
    for (unsigned int i = 0; i < k; i++) {
        Centroid temp(dim, 0.0);
        for (set<int>::iterator j = new_cluster_ID[i].begin(); j != new_cluster_ID[i].end(); j++)
            for (unsigned int m = 0; m < dim; m++)
                temp[m] += dataset[*j][m];
        for (unsigned int m = 0; m < dim; m++)
            temp[m] /= new_cluster_ID[i].size();
        new_center[i] = temp;
    }
}

/** Run k-means until the centers stop moving. */
void kmeans::apply(double** data, int datanum, int numofcluster, int dim) {
    this->dim = dim;
    datanums = datanum;
    dataset = data;
    k = numofcluster;
    init();
    new_center.resize(k);
    new_cluster_ID.resize(k);
    assign();
    update_centers();
    while (!isfinish()) {
        center = new_center;
        new_center.clear();
        new_center.resize(k);
        new_cluster_ID.clear();
        new_cluster_ID.resize(k);
        assign();
        update_centers();
    }
    // BUGFIX: publish the final iteration's result. The original only copied
    // inside the loop, so on exit center/cluster_ID were one iteration stale —
    // and completely empty if the very first pass already converged, which
    // made GMM::initCluster index an empty cluster_ID (undefined behavior).
    center = new_center;
    cluster_ID = new_cluster_ID;
}

/**
 * Gaussian-mixture-model clustering via EM with diagonal covariance
 * matrices. Means and initial labels are seeded by k-means.
 */
class GMM {
private:
    int numVec = 0;                 // number of vectors to cluster
    int numDim = 0;                 // dimensionality of each vector
    int numClusters = 0;            // number of mixture components
    int maxIteration = 500;         // hard cap on EM iterations
    double** data = nullptr;        // input vectors (not owned)
    double** probabilities = nullptr; // probabilities[i][j] = P(cluster j | vector i)
    double** uVectors = nullptr;    // mean vector of each cluster
    double* priorProb = nullptr;    // prior (mixing) probability of each cluster
    double*** convMatrix = nullptr; // per-cluster covariance matrix, kept diagonal
    int* result = nullptr;          // result[i] = label of vector i (freed by caller)
    const double SMALLNUMBER = 0.000000000000001; // convergence tolerance / density floor
    double log_likely;              // log-likelihood, recomputed by each E-step

public:
    ~GMM() {
        // Counters are zero and pointers null until GMM_Cluster runs, so the
        // destructor is safe even if clustering was never performed.
        for (int i = 0; i < numVec; i++)
            delete[] probabilities[i];
        for (int i = 0; i < numClusters; i++) {
            for (int j = 0; j < numDim; j++)
                delete[] convMatrix[i][j];
            delete[] convMatrix[i];
            delete[] uVectors[i];
        }
        delete[] convMatrix;
        delete[] probabilities;
        delete[] priorProb;
        delete[] uVectors;
        // result[] is handed to the caller of GMM_Cluster, who frees it.
    }

    /**
     * Cluster fdata (datanums x dim) into fnumClusters groups with EM.
     * Returns an array of cluster labels, one per vector; caller owns it.
     */
    int* GMM_Cluster(double** fdata, int datanums, int dim, int fnumClusters) {
        initCluster(fdata, datanums, dim, fnumClusters);
        expectation();   // E-step
        maximization();  // M-step
        double prev = log_likely;
        int iter = 1;    // renamed from `time`, which shadowed ::time
        // Iterate E/M until the log-likelihood stops improving or we hit the cap.
        do {
            prev = log_likely;
            expectation();
            maximization();
            iter++;
            // BUGFIX: use fabs. Plain abs() can bind to the integer overload
            // (no <cmath> was included), truncating the convergence delta to 0.
        } while (fabs(prev - log_likely) > SMALLNUMBER && iter < maxIteration);
        // Hard assignment: each vector goes to its most probable cluster.
        for (int i = 0; i < numVec; i++) {
            int best = 0;
            for (int j = 1; j < numClusters; j++)
                if (probabilities[i][j] > probabilities[i][best])
                    best = j;
            result[i] = best;
        }
        return result;
    }

    /** Cluster means, numClusters x numDim; valid after GMM_Cluster. */
    double** get_mean() { return uVectors; }

private:
    /** Determinant of a covariance matrix; diagonal, so the product of the diagonal. */
    double determinant(double** fconvMatrix) {
        double det = 1.0;
        for (int i = 0; i < numDim; i++)
            det *= fconvMatrix[i][i];
        return det;
    }

    /**
     * Inverse of a covariance matrix; diagonal, so invert the diagonal
     * entries in a copy. Caller frees the returned matrix.
     */
    double** inverse(double** fconvMatrix) {
        double** a = new double*[numDim];
        for (int i = 0; i < numDim; i++) {
            a[i] = new double[numDim];
            for (int j = 0; j < numDim; j++)
                a[i][j] = fconvMatrix[i][j];
            a[i][i] = 1 / a[i][i];
        }
        return a;
    }

    /**
     * Multivariate normal density at fvector for mean fuVec and diagonal
     * covariance fconvMatrix, floored by SMALLNUMBER so it is never zero.
     */
    double gauss(double* fvector, double* fuVec, double** fconvMatrix) {
        // BUGFIX: the original new[]'d two scratch arrays and never deleted
        // them, leaking on every call; vectors clean up automatically.
        vector<double> diff(numDim);   // (x - u)
        for (int i = 0; i < numDim; i++)
            diff[i] = fvector[i] - fuVec[i];
        double** a = inverse(fconvMatrix);
        // Quadratic form (x-u)' * Sigma^-1 * (x-u); Sigma is diagonal.
        double quad = 0.0;
        for (int i = 0; i < numDim; i++)
            quad += diff[i] * a[i][i] * diff[i];
        for (int i = 0; i < numDim; i++)
            delete[] a[i];
        delete[] a;
        double density = exp(quad / -2.0)
                       / sqrt(pow(2 * PI, numDim) * determinant(fconvMatrix));
        return density + SMALLNUMBER;   // floor to avoid division by zero later
    }

    /** Allocate all model state and seed means/labels from a k-means run. */
    void initCluster(double** fdata, int datanums, int dim, int fnumClusters) {
        numVec = datanums;
        numDim = dim;
        numClusters = fnumClusters;
        data = fdata;
        // Seed centers and initial labels with k-means.
        kmeans km;
        km.apply(data, datanums, fnumClusters, dim);
        // P(cluster j | vector i), starts at zero.
        probabilities = new double*[numVec];
        for (int i = 0; i < numVec; i++) {
            probabilities[i] = new double[numClusters];
            memset(probabilities[i], 0, sizeof(double) * numClusters);
        }
        // Equal priors to begin with; refined by each M-step.
        priorProb = new double[numClusters];
        for (int i = 0; i < numClusters; i++)
            priorProb[i] = 1.0 / (double)numClusters;
        // Initial hard labels come from the k-means assignment.
        result = new int[numVec];
        for (int i = 0; i < fnumClusters; i++)
            for (set<int>::iterator it = km.cluster_ID[i].begin();
                 it != km.cluster_ID[i].end(); it++)
                result[*it] = i;
        // Initial means are the k-means centers; refined by each M-step.
        uVectors = new double*[numClusters];
        for (int i = 0; i < numClusters; i++) {
            uVectors[i] = new double[numDim];
            for (int j = 0; j < numDim; j++)
                uVectors[i][j] = km.center[i][j];
        }
        // Covariances start as broad diagonal matrices (variance 100.01,
        // as in the original); refined by each M-step.
        convMatrix = new double**[numClusters];
        for (int i = 0; i < numClusters; i++) {
            convMatrix[i] = new double*[numDim];
            for (int j = 0; j < numDim; j++) {
                convMatrix[i][j] = new double[numDim];
                for (int m = 0; m < numDim; m++)
                    convMatrix[i][j][m] = (j == m) ? 100.01 : 0.0;
            }
        }
    }

    /** E-step: recompute membership probabilities and the log-likelihood. */
    void expectation() {
        // BUGFIX: the original reset log_likely inside the per-vector loop,
        // so the "log-likelihood" only ever contained the last vector's term
        // and the convergence test in GMM_Cluster was meaningless.
        log_likely = 0.0;
        for (int i = 0; i < numVec; i++) {
            // Total mixture density of vector i under the current parameters.
            double total = 0.0;
            for (int k = 0; k < numClusters; k++)
                total += gauss(data[i], uVectors[k], convMatrix[k]) * priorProb[k];
            // Posterior P(cluster j | vector i) = prior_j * N_j(x_i) / total.
            for (int j = 0; j < numClusters; j++)
                probabilities[i][j] =
                    priorProb[j] * gauss(data[i], uVectors[j], convMatrix[j]) / total;
            log_likely += log(total);
        }
    }

    /** M-step: re-estimate priors, means and (diagonal) covariances. */
    void maximization() {
        for (int j = 0; j < numClusters; j++) {
            // Effective number of points belonging to cluster j.
            double weight = 0.0;
            for (int i = 0; i < numVec; i++)
                weight += probabilities[i][j];
            priorProb[j] = weight / (double)numVec;
            // Weighted mean of cluster j.
            for (int m = 0; m < numDim; m++)
                uVectors[j][m] = 0.0;
            for (int i = 0; i < numVec; i++)
                for (int m = 0; m < numDim; m++)
                    uVectors[j][m] += data[i][m] * probabilities[i][j];
            for (int m = 0; m < numDim; m++)
                uVectors[j][m] /= weight;
            // Weighted diagonal covariance of cluster j.
            for (int m = 0; m < numDim; m++)
                convMatrix[j][m][m] = 0.0;
            for (int i = 0; i < numVec; i++) {
                for (int m = 0; m < numDim; m++) {
                    double d = data[i][m] - uVectors[j][m];
                    convMatrix[j][m][m] += probabilities[i][j] * d * d;
                }
            }
            for (int m = 0; m < numDim; m++)
                convMatrix[j][m][m] /= weight;
        }
    }
};

/** Demo: cluster 100 random 2-D points into 5 groups and print each cluster. */
int main() {
    srand((unsigned)time(nullptr));
    int datanums = 100;
    int dim = 2;
    int clusterNums = 5;
    // Random 2-D points in [0, 500).
    double** data = new double*[datanums];
    for (int i = 0; i < datanums; i++) {
        data[i] = new double[dim];
        for (int j = 0; j < dim; j++)
            data[i][j] = double(rand()) / RAND_MAX * 500;
    }
    GMM gmm;
    int* result = gmm.GMM_Cluster(data, datanums, dim, clusterNums);
    // Group point indices by cluster label.
    vector<vector<int> > clusters(clusterNums);
    for (int i = 0; i < datanums; i++)
        clusters[result[i]].push_back(i);
    for (int i = 0; i < clusterNums; i++) {
        cout << "第" << i << "个簇的中心为(" << gmm.get_mean()[i][0] << ","
             << gmm.get_mean()[i][1] << ")。包涵下列数据:" << endl;
        for (size_t j = 0; j < clusters[i].size(); j++)
            cout << "(" << data[clusters[i][j]][0] << ","
                 << data[clusters[i][j]][1] << "), ";
        cout << endl << endl;
    }
    for (int i = 0; i < datanums; i++)
        delete[] data[i];
    delete[] data;
    delete[] result;
    // Portability fix: dropped the Windows-only system("pause").
    return 0;
}
下图是结果