Solving Gaussian Mixture Models (GMM) with the EM Algorithm: A Python Implementation
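The script below fits a K-component one-dimensional GMM by maximum likelihood. As a quick sketch of the updates the code implements (standard EM for GMM, following Li Hang's Statistical Learning Methods):

E-step, responsibilities:

    \gamma_{ik} = \frac{\alpha_k \, \mathcal{N}(x_i \mid \mu_k, \sigma_k^2)}{\sum_{j=1}^{K} \alpha_j \, \mathcal{N}(x_i \mid \mu_j, \sigma_j^2)}

M-step, parameter re-estimation:

    \mu_k = \frac{\sum_i \gamma_{ik} x_i}{\sum_i \gamma_{ik}}, \qquad \sigma_k^2 = \frac{\sum_i \gamma_{ik} (x_i - \mu_k)^2}{\sum_i \gamma_{ik}}, \qquad \alpha_k = \frac{1}{N} \sum_i \gamma_{ik}

Convergence is monitored through the Q function Q(\theta \mid \theta^{old}) = \sum_k \sum_i \gamma_{ik} \log\big( \alpha_k \, \mathcal{N}(x_i \mid \mu_k, \sigma_k^2) \big); the loop stops once its change between consecutive parameter sets falls below \epsilon.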

import numpy as np
from scipy.stats import norm

def gmm_em(X, K=2, max_iter=100, epsilon=1e-4, T=5):
    """Fit a K-component 1-D GMM to X with EM, using T random restarts."""
    N = len(X)
    print('-----EM optimization for GMM-----')
    # EM is only guaranteed to converge to a stationary point of the
    # likelihood, not to a global maximum. A common remedy is to run the
    # algorithm several times from different initializations and compare
    # the results; here the restarts are aggregated after outlier removal.
    Mu = []     # per-restart mean estimates
    Sigma = []  # per-restart standard-deviation estimates
    for t in range(T):
        # Initialization: seed each component with the sample statistics
        # of a small random subset of X
        alpha = np.ones([K, 1]) / K  # mixture weights
        mu = np.ones([K, 1])         # component means
        sigma = np.ones([K, 1])      # component standard deviations
        for k in range(K):
            n = np.random.randint(K*3, K*30)
            x = np.random.choice(X, n)
            mu[k][0] = np.mean(x)
            sigma[k][0] = np.std(x, ddof=1)
        alpha_, mu_, sigma_ = alpha, mu, sigma

        for _iter in range(max_iter):
            # E-step: responsibilities gamma[k, i] = P(component k | x_i)
            gamma = np.zeros([K, N])
            for k in range(K):
                gamma[k] = alpha[k] * norm.pdf(X, loc=mu[k][0], scale=sigma[k][0])
            s = np.sum(gamma, axis=0, keepdims=True)
            s[s == 0] = np.inf  # avoid 0/0 for points with vanishing density
            gamma /= s

            # M-step: re-estimate the parameters from the responsibilities;
            # the variance update uses the newly estimated mean mu_
            mu_ = np.sum(gamma*X, axis=1, keepdims=True) / np.sum(gamma, axis=1, keepdims=True)
            sigma_ = np.sqrt( np.sum(gamma*(X-mu_)**2, axis=1, keepdims=True) / np.sum(gamma, axis=1, keepdims=True) )
            alpha_ = np.sum(gamma, axis=1, keepdims=True) / N

            # Q function at the old parameters: Q(theta_old | theta_old)
            q = np.zeros([K,])
            for k in range(K):
                # clip the joint density to avoid log(0)
                joint_dist = np.clip(alpha[k]*norm.pdf(X, loc=mu[k][0], scale=sigma[k][0]), 1e-300, 1)
                q[k] = np.sum( gamma[k]*np.log( joint_dist ) )
            Q = np.sum(q)
            # Q function at the new parameters: Q(theta_new | theta_old)
            q_ = np.zeros([K,])
            for k in range(K):
                joint_dist_ = np.clip(alpha_[k]*norm.pdf(X, loc=mu_[k][0], scale=sigma_[k][0]), 1e-300, 1)
                q_[k] = np.sum( gamma[k]*np.log( joint_dist_ ) )
            Q_ = np.sum(q_)

            # Log-likelihood (optional alternative convergence monitor)
            # L = np.zeros([K, N])
            # for k in range(K):
            #     L[k] = alpha_[k] * norm.pdf(X, loc=mu_[k][0], scale=sigma_[k][0])
            # L = np.sum(L, axis=0)
            # L = np.sum(np.log(np.clip(L, 1e-300, 1)))

            # if (_iter+1) % 10 == 0:  # uncomment to log every 10th iteration only
            print('T:', t+1, '  Iteration:', _iter+1, '  Q value:', Q_)
            alpha = alpha_
            mu = mu_
            sigma = sigma_

            # Convergence check: stop when the Q-function gain falls below epsilon
            if np.abs(Q_-Q) < epsilon:
                break
        
        # sort components by mean so estimates are comparable across restarts
        idx = mu[:,0].argsort()
        mu = mu[idx]
        sigma = sigma[idx]
        Mu.append(mu)
        Sigma.append(sigma)

    Mu = np.concatenate(Mu, axis=1)       # shape [K, T]: one column per restart
    Sigma = np.concatenate(Sigma, axis=1)

    # Discard outlier restarts: repeatedly drop the restart whose mean
    # estimates deviate most from the average over the remaining restarts
    index = np.array([True]*T)
    for _ in range(T // K):
        cols = np.where(index)[0]  # original column ids of the kept restarts
        Mu_ = Mu[:, index]
        mean = np.mean(Mu_, axis=1, keepdims=True)
        Mu_centered = np.abs(Mu_ - mean)
        idx = np.argmax(Mu_centered, axis=1)
        index[cols[idx]] = False   # map positions in the filtered array back to restart ids
    
    # Average the surviving restarts
    Mu = np.mean(Mu[:,index], axis=1)
    Sigma = np.mean(Sigma[:,index], axis=1)
    return Mu, Sigma

if __name__ == '__main__':

    # synthetic data: a 70/30 mixture of two Gaussians
    X1 = norm.rvs(loc=5, scale=10, size=7000)
    X2 = norm.rvs(loc=65, scale=16, size=3000)
    X = np.concatenate([X1, X2])
    np.random.shuffle(X)

    # ground-truth statistics of each component, for comparison
    mu1, sigma1 = np.mean(X1), np.std(X1)
    mu2, sigma2 = np.mean(X2), np.std(X2)
    mu, sigma = gmm_em(X)

    # compare the true statistics with the EM estimates
    print('true mu:', [mu1, mu2], '  true sigma:', [sigma1, sigma2])
    print('EM   mu:', mu, '  EM   sigma:', sigma)
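As a sanity check, the same data can also be fitted with scikit-learn's GaussianMixture. A minimal sketch, assuming scikit-learn is installed and reusing the array X from the demo above; the fitted parameters are exposed through the means_, covariances_, and weights_ attributes:

# Optional cross-check with scikit-learn
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=2, n_init=5).fit(X.reshape(-1, 1))
print('sklearn mu:', gm.means_.ravel())
print('sklearn sigma:', np.sqrt(gm.covariances_.ravel()))
print('sklearn alpha:', gm.weights_)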

References:

Li Hang, Statistical Learning Methods (《统计学习方法》)
