Gaussian mixture clustering is a probabilistic, model-based clustering method: it represents the cluster structure of the data as a linear combination of several Gaussian distributions. Because every sample is modeled as a weighted combination of multiple Gaussians rather than assigned outright to one center, the algorithm adapts flexibly to clusters of different shapes.

The model is specified by $k$ components, each with a mean vector $\mu_i$, a covariance matrix $\Sigma_i$, and a mixing coefficient $\pi_i$:

$$\{(\mu_1, \Sigma_1, \pi_1),\ (\mu_2, \Sigma_2, \pi_2),\ \ldots,\ (\mu_k, \Sigma_k, \pi_k)\}$$
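Together these parameters define the mixture density; the mixing coefficients are non-negative and sum to 1:

$$p_{\mathcal{M}}(x) = \sum_{i=1}^{k} \pi_i \, \mathcal{N}(x \mid \mu_i, \Sigma_i), \qquad \pi_i \ge 0, \quad \sum_{i=1}^{k} \pi_i = 1$$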
Iteration (the EM algorithm):

E-step: compute the posterior probability (responsibility) $\gamma_{ij}$ that sample $X_j$ was generated by the $i$-th component:

$$\gamma_{ij} = \frac{\pi_i \, \mathcal{N}(X_j \mid \mu_i, \Sigma_i)}{\sum_{l=1}^{k} \pi_l \, \mathcal{N}(X_j \mid \mu_l, \Sigma_l)}$$
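M-step: re-estimate the parameters from the responsibilities. These are the standard closed-form updates for a data set of $m$ samples, and they are exactly what the `maximize` function below implements:

$$\mu_i = \frac{\sum_{j=1}^{m} \gamma_{ij} X_j}{\sum_{j=1}^{m} \gamma_{ij}}, \qquad \Sigma_i = \frac{\sum_{j=1}^{m} \gamma_{ij} (X_j - \mu_i)(X_j - \mu_i)^{\mathsf{T}}}{\sum_{j=1}^{m} \gamma_{ij}}, \qquad \pi_i = \frac{1}{m} \sum_{j=1}^{m} \gamma_{ij}$$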
Stopping condition:
Stop according to a preset criterion, for example when a maximum number of iterations is reached or when the change in the model parameters falls below some threshold.
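A common alternative criterion tracks the log-likelihood of the data and stops when its improvement per iteration drops below a tolerance:

$$LL(D) = \sum_{j=1}^{m} \ln \left( \sum_{i=1}^{k} \pi_i \, \mathcal{N}(X_j \mid \mu_i, \Sigma_i) \right)$$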
Cluster assignment:
Using the resulting posterior probabilities $\gamma_{ij}$, determine each sample's cluster label by assigning the sample to the cluster with the highest responsibility.
$$C_i = \{\, X_j \mid \arg\max_{1 \le l \le k} \gamma_{lj} = i \,\}, \qquad 1 \le i \le k$$
Output:
Return the final cluster partition $C = \{C_1, C_2, \ldots, C_k\}$.
Gaussian mixture clustering optimizes iteratively: by repeatedly updating the mean vectors, covariance matrices, and mixing coefficients, the model fits the data progressively better. The E-step of the EM algorithm computes the posterior probabilities, the M-step updates the model parameters, and the two steps alternate until the stopping condition is met. Finally, each sample is placed in the cluster with the highest posterior probability.
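For comparison, scikit-learn ships this exact procedure as `sklearn.mixture.GaussianMixture`. A minimal sketch on the Iris data used later in this lab (the `random_state` value here is arbitrary and only fixes the initialization):

```python
from sklearn.datasets import load_iris
from sklearn.mixture import GaussianMixture

X = load_iris().data
gmm = GaussianMixture(n_components=3, random_state=0).fit(X)  # EM runs inside fit()
labels = gmm.predict(X)        # hard assignment: argmax of the responsibilities
resp = gmm.predict_proba(X)    # soft assignment: the responsibilities gamma_ij
print(gmm.weights_)            # mixing coefficients pi_i, summing to 1
```

The rest of this lab implements the same algorithm by hand. The environment is set up as follows: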
conda create -n ML python=3.9
conda activate ML
conda install scikit-learn matplotlib
(`scipy`, which the code below uses for `multivariate_normal`, is installed automatically as a dependency of scikit-learn.)
| Package | Version used in this lab |
| --- | --- |
| matplotlib | 3.5.2 |
| numpy | 1.21.5 |
| python | 3.9.13 |
| scikit-learn | 1.0.2 |
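To confirm the environment matches the table, the installed versions can be printed (a throwaway check, not part of the experiment itself):

```python
import sys
import matplotlib
import numpy
import sklearn

print("python      ", sys.version.split()[0])
print("matplotlib  ", matplotlib.__version__)
print("numpy       ", numpy.__version__)
print("scikit-learn", sklearn.__version__)
```

With the environment in place, the implementation below builds the GMM from scratch with EM.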
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from sklearn.datasets import load_iris
DEBUG = True
def debug(*args, **kwargs):
    if DEBUG:
        print(*args, **kwargs)
def phi(Y, mu_k, cov_k):
    # Density of every sample in Y under the k-th Gaussian N(mu_k, cov_k)
    norm = multivariate_normal(mean=mu_k, cov=cov_k)
    return norm.pdf(Y)
def getExpectation(Y, mu, cov, alpha):
    # E-step: responsibility gamma[j, k] of component k for sample j
    N = Y.shape[0]
    K = alpha.shape[0]
    assert N > 1, "There must be more than one sample!"
    assert K > 1, "There must be more than one gaussian model!"
    gamma = np.mat(np.zeros((N, K)))
    prob = np.zeros((N, K))
    # Weighted density of each sample under each component: alpha_k * N(x | mu_k, cov_k)
    for k in range(K):
        prob[:, k] = phi(Y, mu[k], cov[k]) * alpha[k]
    prob = np.mat(prob)
    # Normalize across components so that each row of gamma sums to 1
    for k in range(K):
        gamma[:, k] = prob[:, k] / np.sum(prob, axis=1)
    return gamma
The matrix `gamma` stores the responsibilities. For each component, the density of every sample under that component's Gaussian (computed by the `phi` function) is multiplied by the component's mixing coefficient; normalizing these weighted densities across the components yields `gamma`.
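A quick sanity check on toy data, assuming the functions above are in scope: every row of `gamma` is a probability distribution over the components, so it must sum to 1. The toy samples and initial parameters below are made up purely for illustration.

```python
rng = np.random.default_rng(0)
Y_demo = rng.random((10, 2))              # 10 toy samples, 2 features
mu0 = np.array([[0.2, 0.2], [0.8, 0.8]])  # hand-picked initial means
cov0 = np.array([np.eye(2)] * 2)          # identity covariances
alpha0 = np.array([0.5, 0.5])             # uniform mixing coefficients
g = getExpectation(Y_demo, mu0, cov0, alpha0)
print(np.allclose(g.sum(axis=1), 1.0))    # expected: True
```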
def maximize(Y, gamma):
    # M-step: re-estimate mu, cov and alpha from the responsibilities
    N, D = Y.shape
    K = gamma.shape[1]
    mu = np.zeros((K, D))
    cov = []
    alpha = np.zeros(K)
    for k in range(K):
        Nk = np.sum(gamma[:, k])  # effective number of samples in component k
        mu[k, :] = np.sum(np.multiply(Y, gamma[:, k]), axis=0) / Nk
        diff = Y - mu[k]
        cov_k = np.dot(diff.T, np.multiply(diff, gamma[:, k])) / Nk
        cov_k += 1e-6 * np.identity(D)  # small diagonal jitter for numerical stability
        cov.append(cov_k)
        alpha[k] = Nk / N
    cov = np.array(cov)
    return mu, cov, alpha
It returns the updated mean vectors `mu`, the covariance matrices `cov` (collected in a list and stacked into one array), and the mixing coefficients `alpha`.
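Again as a quick check, reusing the toy data `Y_demo` and responsibilities `g` from the previous snippet: the mixing coefficients must sum to 1, and each covariance matrix must be symmetric.

```python
mu1, cov1, alpha1 = maximize(Y_demo, g)
print(np.isclose(alpha1.sum(), 1.0))    # expected: True
print(np.allclose(cov1[0], cov1[0].T))  # expected: True (covariances are symmetric)
```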
def scale_data(Y):
    # Min-max scale each feature to [0, 1]; note that Y is modified in place
    for i in range(Y.shape[1]):
        max_ = Y[:, i].max()
        min_ = Y[:, i].min()
        Y[:, i] = (Y[:, i] - min_) / (max_ - min_)
    debug("Data scaled.")
    return Y
def init_params(shape, K):
    # Random means in [0, 1), identity covariances, uniform mixing coefficients
    N, D = shape
    mu = np.random.rand(K, D)
    cov = np.array([np.eye(D)] * K)
    alpha = np.array([1.0 / K] * K)
    debug("Parameters initialized.")
    debug("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha
def GMM_EM(Y, K, times):
    # Alternate E- and M-steps for `times` iterations; scale_data modifies
    # Y in place, so the caller's array ends up scaled as well
    Y = scale_data(Y)
    mu, cov, alpha = init_params(Y.shape, K)
    for i in range(times):
        gamma = getExpectation(Y, mu, cov, alpha)
        mu, cov, alpha = maximize(Y, gamma)
    debug("{sep} Result {sep}".format(sep="-" * 20))
    debug("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha
if __name__ == '__main__':
    # Load the Iris dataset (4 features per sample)
    iris = load_iris()
    Y = iris.data
    # Model parameters
    K = 3  # number of clusters
    iterations = 100
    # Run the EM algorithm for the GMM
    mu, cov, alpha = GMM_EM(Y, K, iterations)
    # Clustering with the trained model: Y was scaled in place by GMM_EM,
    # so the learned parameters match the scaled data
    N = Y.shape[0]
    gamma = getExpectation(Y, mu, cov, alpha)
    category = gamma.argmax(axis=1).flatten().tolist()[0]  # hard labels via argmax
    # Plot the clusters on the first two features
    for i in range(K):
        cluster_data = np.array([Y[j] for j in range(N) if category[j] == i])
        if cluster_data.size == 0:  # guard against an empty cluster
            continue
        plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {i + 1}')
    plt.legend()
    plt.title("GMM Clustering By EM Algorithm")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()