初始化模型参数 $\{(\alpha_i, \mu_i, \Sigma_i) \mid 1 \le i \le k\}$,迭代公式如下:
$$\gamma_{ji} = \frac{\alpha_i \cdot p(x_j \mid \mu_i, \Sigma_i)}{\sum_{l=1}^{k} \alpha_l \cdot p(x_j \mid \mu_l, \Sigma_l)}$$
$$\mu'_i = \frac{\sum_{j=1}^{m} \gamma_{ji}\, x_j}{\sum_{j=1}^{m} \gamma_{ji}}$$
$$\Sigma'_i = \frac{\sum_{j=1}^{m} \gamma_{ji}\, (x_j - \mu'_i)(x_j - \mu'_i)^T}{\sum_{j=1}^{m} \gamma_{ji}}$$
$$\alpha'_i = \frac{\sum_{j=1}^{m} \gamma_{ji}}{m}$$
简化起见,考虑一个例子:对 30 个样本聚类(迭代 50 轮),样本维度为 2,代码如下:
import numpy
from scipy.stats import multivariate_normal
from matplotlib import pyplot
# Thirty two-dimensional samples, one [first, second] coordinate pair per row.
x = [
    [0.697, 0.460],
    [0.774, 0.376],
    [0.634, 0.264],
    [0.608, 0.318],
    [0.556, 0.215],
    [0.403, 0.237],
    [0.481, 0.149],
    [0.437, 0.211],
    [0.666, 0.091],
    [0.243, 0.267],
    [0.245, 0.057],
    [0.343, 0.099],
    [0.639, 0.161],
    [0.657, 0.198],
    [0.360, 0.370],
    [0.593, 0.042],
    [0.719, 0.103],
    [0.359, 0.188],
    [0.339, 0.241],
    [0.282, 0.257],
    [0.748, 0.232],
    [0.714, 0.346],
    [0.483, 0.312],
    [0.478, 0.437],
    [0.525, 0.369],
    [0.751, 0.489],
    [0.532, 0.472],
    [0.473, 0.376],
    [0.725, 0.445],
    [0.446, 0.459],
]
def GMM_EM(x, k, n, plot=True, init_indices=None):
    """Fit a k-component Gaussian mixture to the samples ``x`` with EM.

    Every iteration performs an E-step (posterior responsibility of each
    component for each sample) followed by an M-step (re-estimation of the
    mixing weights, means and covariances from those responsibilities).

    Args:
        x: Sequence of m samples, each a sequence of d numbers.
        k: Number of mixture components (at least 1).
        n: Number of EM iterations to run.
        plot: When true (the default), draw one subplot per iteration showing
            every sample coloured by its most responsible component and the
            current means as black crosses, then call ``pyplot.show()``.
            Only the first two coordinates are drawn. Pass ``False`` to run
            the fit without touching matplotlib.
        init_indices: Optional sequence of k sample indices whose samples are
            used as the initial means. When omitted, the historical choice
            (samples 5, 21 and 26, truncated to k) is used if it fits the
            data; otherwise k indices spread evenly over the samples are used.

    Returns:
        A tuple ``(alpha, mu, Sigma)``: a list of k mixing weights, a list of
        k mean vectors and a list of k covariance matrices.

    Raises:
        ValueError: If ``x`` is not a non-empty 2-D collection, ``k`` is
            smaller than 1, ``init_indices`` does not hold k entries, or
            plotting is requested for samples with fewer than 2 coordinates.
    """
    data = numpy.asarray(x, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("x must be a non-empty sequence of equal-length samples")
    if k < 1:
        raise ValueError("k must be at least 1")
    m, dim = data.shape

    if init_indices is None:
        # Keep the original starting points whenever they are usable so that
        # existing calls reproduce their previous results.
        legacy = (5, 21, 26)[:k]
        if k <= 3 and legacy[-1] < m:
            init_indices = legacy
        else:
            init_indices = numpy.linspace(0, m - 1, k).round().astype(int)
    elif len(init_indices) != k:
        raise ValueError("init_indices must contain exactly k sample indices")

    alpha = [1.0 / k] * k
    mu = [data[int(j)].copy() for j in init_indices]
    Sigma = [0.1 * numpy.eye(dim) for _ in range(k)]
    # NOTE(review): no covariance regularisation is applied; if a component
    # collapses onto a single sample scipy will presumably reject its
    # covariance matrix -- confirm before relying on tiny clusters.

    if plot:
        if dim < 2:
            raise ValueError("plotting needs samples with at least 2 coordinates")
        colors = ['r', 'g', 'b', 'c', 'm', 'y']
        columns = 10
        # 5 rows reproduces the original 5x10 grid; grow it only when n > 50.
        rows = max(5, -(-n // columns))
        figure = pyplot.figure(num='Gaussian Mixture Model')

    for step in range(n):
        # E-step: gamma[j, i] is the posterior probability that sample j was
        # generated by component i.
        weighted = numpy.empty((m, k))
        for i in range(k):
            weighted[:, i] = alpha[i] * multivariate_normal.pdf(
                data, mean=mu[i], cov=Sigma[i])
        gamma = weighted / weighted.sum(axis=1, keepdims=True)

        if plot:
            # Drawn before the M-step, so the crosses are the means that
            # produced this iteration's assignment.
            axes = figure.add_subplot(rows, columns, step + 1)
            axes.set_title(str(step))
            axes.axis('off')
            labels = gamma.argmax(axis=1)
            for i in range(k):
                members = data[labels == i]
                axes.scatter(members[:, 0], members[:, 1],
                             c=colors[i % len(colors)], marker='o')
                axes.scatter(mu[i][0], mu[i][1], c='k', marker='x')

        # M-step: responsibility-weighted mean, covariance and weight.
        for i in range(k):
            resp = gamma[:, i]
            total = resp.sum()
            mu[i] = numpy.dot(resp, data) / total
            centered = data - mu[i]
            Sigma[i] = numpy.dot((resp[:, None] * centered).T, centered) / total
            alpha[i] = float(total / m)

    if plot:
        pyplot.show()
    return (alpha, mu, Sigma)
# Run the mixture fit on the samples in x with k = 3 components and n = 50 iterations.
GMM_EM(x, 3, 50)