高斯混合模型 EM算法 聚类问题 Python实现

初始化模型参数 $\{(\alpha_i, \mu_i, \Sigma_i) \mid 1 \le i \le k\}$,迭代公式如下:

$$\gamma_{ji} = \frac{\alpha_i \cdot p(x_j \mid \mu_i, \Sigma_i)}{\sum_{l=1}^{k} \alpha_l \cdot p(x_j \mid \mu_l, \Sigma_l)}$$
$$\mu'_i = \frac{\sum_{j=1}^{m} \gamma_{ji} x_j}{\sum_{j=1}^{m} \gamma_{ji}}$$
$$\Sigma'_i = \frac{\sum_{j=1}^{m} \gamma_{ji} (x_j - \mu'_i)(x_j - \mu'_i)^T}{\sum_{j=1}^{m} \gamma_{ji}}$$
$$\alpha'_i = \frac{\sum_{j=1}^{m} \gamma_{ji}}{m}$$

简化起见,考虑一个例子:将 30 个二维样本聚为 3 类,迭代 50 次,代码如下:

import numpy
from scipy.stats import multivariate_normal
from matplotlib import pyplot

# Sample set for the demo: 30 two-dimensional points, one [x0, x1] pair per
# sample. GMM_EM seeds its three initial means from samples 5, 21 and 26.
x = [
    [0.697, 0.460], [0.774, 0.376], [0.634, 0.264],
    [0.608, 0.318], [0.556, 0.215], [0.403, 0.237],
    [0.481, 0.149], [0.437, 0.211], [0.666, 0.091],
    [0.243, 0.267], [0.245, 0.057], [0.343, 0.099],
    [0.639, 0.161], [0.657, 0.198], [0.360, 0.370],
    [0.593, 0.042], [0.719, 0.103], [0.359, 0.188],
    [0.339, 0.241], [0.282, 0.257], [0.748, 0.232],
    [0.714, 0.346], [0.483, 0.312], [0.478, 0.437],
    [0.525, 0.369], [0.751, 0.489], [0.532, 0.472],
    [0.473, 0.376], [0.725, 0.445], [0.446, 0.459],
]

def _gmm_responsibilities(data, alpha, mu, Sigma):
    """E-step: return the (m, k) matrix gamma[j, i] of posterior weights."""
    weighted = numpy.column_stack([
        # atleast_1d: scipy squeezes the result to a scalar for a single sample.
        a * numpy.atleast_1d(multivariate_normal.pdf(data, mean=mean, cov=cov))
        for a, mean, cov in zip(alpha, mu, Sigma)
    ])
    return weighted / weighted.sum(axis=1, keepdims=True)


def _gmm_update(data, gamma):
    """M-step: return new (alpha, mu, Sigma) from the responsibilities."""
    weight = gamma.sum(axis=0)  # sum_j gamma[j, i] for every component i
    mu = numpy.dot(gamma.T, data) / weight[:, None]
    Sigma = []
    for i in range(gamma.shape[1]):
        centered = data - mu[i]
        # sum_j gamma[j, i] * outer(x_j - mu_i, x_j - mu_i), normalised.
        Sigma.append(
            numpy.dot((gamma[:, i, None] * centered).T, centered) / weight[i]
        )
    return weight / data.shape[0], mu, Sigma


def GMM_EM(x, k, n, init_indices=None, plot=True):
    """Cluster samples with a Gaussian mixture model fitted by EM.

    Runs exactly ``n`` EM iterations (there is no convergence test). Every
    component starts with mixing weight ``1 / k`` and covariance ``0.1 * I``.

    Args:
        x: Sequence of ``m`` samples, each a sequence of ``d`` numbers.
        k: Number of mixture components (>= 1).
        n: Number of EM iterations to run.
        init_indices: Indices into ``x`` of the ``k`` samples used as initial
            means. When omitted, samples 5, 21 and 26 are used if ``k == 3``
            and ``x`` has more than 26 samples (the original hard-coded
            seeds); otherwise ``k`` indices evenly spaced over ``x``.
        plot: When true, draw one subplot per iteration showing the hard
            assignment (largest responsibility) and the current means, using
            the first two coordinates of each sample, then call
            ``pyplot.show()``. Requires the module-level ``pyplot`` import.

    Returns:
        Tuple ``(alpha, mu, Sigma)``: a list of ``k`` mixing weights, a list
        of ``k`` mean vectors and a list of ``k`` covariance matrices.

    Raises:
        ValueError: If ``x`` is not a non-empty 2-D array of samples, if
            ``k < 1``, if ``init_indices`` does not contain ``k`` entries, or
            if plotting is requested for data with fewer than two dimensions.
    """
    data = numpy.asarray(x, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("x must be a non-empty sequence of equal-length samples")
    if k < 1:
        raise ValueError("k must be at least 1")
    m, dim = data.shape
    if plot and dim < 2:
        raise ValueError("plotting needs samples with at least two dimensions")

    if init_indices is None:
        if k == 3 and m > 26:
            # Keep the seeds the function has always used for its demo data.
            init_indices = (5, 21, 26)
        else:
            init_indices = numpy.linspace(0, m - 1, k).astype(int)
    elif len(init_indices) != k:
        raise ValueError("init_indices must contain exactly k indices")

    alpha = [1.0 / k] * k
    mu = [data[i].copy() for i in init_indices]
    Sigma = [0.1 * numpy.eye(dim) for _ in range(k)]

    if plot:
        fig = pyplot.figure(num='Gaussian Mixture Model')
        colors = ['r', 'g', 'b', 'c', 'm', 'y']
        cols = 10
        # 5 x 10 as before; grow the grid when more than 50 iterations are run.
        rows = max(5, -(-n // cols))

    for step in range(n):
        gamma = _gmm_responsibilities(data, alpha, mu, Sigma)

        if plot:
            # Snapshot taken before the M-step: assignments and means both
            # reflect the parameters that produced this iteration's gamma.
            ax = fig.add_subplot(rows, cols, step + 1)
            ax.set_title(str(step))
            ax.axis('off')
            labels = gamma.argmax(axis=1)
            for i in range(k):
                members = data[labels == i]
                ax.scatter(members[:, 0], members[:, 1],
                           c=colors[i % len(colors)], marker='o')
                ax.scatter(mu[i][0], mu[i][1], c='k', marker='x')

        alpha, mu, Sigma = _gmm_update(data, gamma)

    if plot:
        pyplot.show()
    return (list(alpha), list(mu), Sigma)

# Demo run: fit 3 Gaussian components to the samples in `x` with 50 EM
# iterations; the returned parameters are discarded, only the plot is shown.
GMM_EM(x, k=3, n=50)

你可能感兴趣的:(算法)