The basic idea: model the data as a mixture of k Gaussian distributions and run EM for N iterations. At the end, each point is assigned to the Gaussian under which it has the highest posterior probability.
The computeOmega function computes $\omega_j^{(i)}$, the probability that sample $i$ belongs to component $j$:

$$\omega_j^{(i)} = Q_i(z^{(i)}=j) = p(z^{(i)}=j \mid x^{(i)};\phi,\mu,\Sigma) = \frac{\phi_j\,\mathcal{N}(x^{(i)} \mid \mu_j,\Sigma_j)}{\sum_{l=1}^{K}\phi_l\,\mathcal{N}(x^{(i)} \mid \mu_l,\Sigma_l)}$$
In computeOmega:
X has shape (n_samples, n_features): the input data
mu has shape (n_clusters, n_features): the component means
sigma has shape (n_clusters, n_features, n_features): the covariance matrices
phi has shape (n_clusters,): the mixture coefficients
the returned omega has shape (n_samples, n_clusters)
import numpy as np
import copy                      # needed by MyGMM.Kmeans below
import matplotlib.pyplot as plt
# Multivariate Gaussian probability density
def Gaussian(data, mean, cov):
    dim = np.shape(cov)[0]              # dimensionality
    covdet = np.linalg.det(cov)         # determinant |cov|
    covinv = np.linalg.pinv(cov)        # (pseudo-)inverse of cov
    if covdet == 0:                     # guard against a singular covariance
        covdet = np.linalg.det(cov + np.eye(dim) * 0.01)
        covinv = np.linalg.pinv(cov + np.eye(dim) * 0.01)
    m = data - mean
    z = -0.5 * np.dot(np.dot(m, covinv), m)   # exponent of the Gaussian
    return 1.0 / np.power(np.power(2 * np.pi, dim) * abs(covdet), 0.5) * np.exp(z)  # density value
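As a quick sanity check (my addition, assuming SciPy is installed), the hand-rolled density can be compared against scipy.stats.multivariate_normal:

# Sanity check against SciPy's reference implementation (assumed extra dependency)
from scipy.stats import multivariate_normal
x = np.array([0.5, -0.2])
mean = np.zeros(2)
cov = np.array([[1.0, 0.3], [0.3, 2.0]])
print(Gaussian(x, mean, cov))                 # hand-rolled density
print(multivariate_normal(mean, cov).pdf(x))  # SciPy's value; the two should agree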
def computeOmega(X, mu, sigma, phi, multiGaussian):
    n_samples = X.shape[0]
    n_clusters = len(phi)
    omega = np.zeros((n_samples, n_clusters))
    p = np.zeros(n_clusters)
    g = np.zeros(n_clusters)
    for i in range(n_samples):
        for j in range(n_clusters):
            p[j] = multiGaussian(X[i], mu[j], sigma[j])   # density of sample i under component j
            g[j] = phi[j] * p[j]                          # weighted by the mixture coefficient
        for k in range(n_clusters):
            omega[i, k] = g[k] / np.sum(g)                # normalize to get the responsibility
    return omega
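The double loop above is easy to read but slow for larger data sets. A vectorized sketch (my rewrite, not part of the original code; computeOmegaVec is a hypothetical name) fills one column per component and normalizes each row in a single step:

# Hypothetical vectorized variant: one column per component, then row-normalize
def computeOmegaVec(X, mu, sigma, phi, multiGaussian):
    n_samples = X.shape[0]
    n_clusters = len(phi)
    g = np.zeros((n_samples, n_clusters))
    for j in range(n_clusters):
        # weighted density of every sample under component j
        g[:, j] = phi[j] * np.array([multiGaussian(x, mu[j], sigma[j]) for x in X])
    return g / g.sum(axis=1, keepdims=True)   # each row now sums to 1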
class MyGMM():
    def __init__(self, n_clusters, ITER=50):
        self.n_clusters = n_clusters
        self.ITER = ITER
        self.mu = 0
        self.sigma = 0
        self.phi = 0
    # Get the initial cluster centers
    def GetInitialMeans(self, data, criterion):
        dim = data.shape[1]                 # data dimensionality
        K = self.n_clusters
        means = [[] for k in range(K)]      # holds the means
        minmax = []
        for i in range(dim):
            minmax.append(np.array([min(data[:, i]), max(data[:, i])]))  # min/max of each dimension
        minmax = np.array(minmax)
        while True:
            for i in range(K):
                means[i] = []
                for j in range(dim):
                    means[i].append(np.random.random() * (minmax[j][1] - minmax[j][0]) + minmax[j][0])  # draw random means
                means[i] = np.array(means[i])
            if self.isdistance(means, criterion):
                break
        return means
    # Check that no two initial means are too close to each other
    def isdistance(self, means, criterion=0.03):
        K = len(means)
        for i in range(K):
            for j in range(i + 1, K):
                if criterion > np.linalg.norm(means[i] - means[j]):
                    return False
        return True
    def Kmeans(self, data):
        N = data.shape[0]       # number of samples
        dim = data.shape[1]     # sample dimensionality
        K = self.n_clusters
        means = self.GetInitialMeans(data, 0.03)
        means_old = [np.zeros(dim) for k in range(K)]
        # convergence condition
        while np.sum([np.linalg.norm(means_old[k] - means[k]) for k in range(K)]) > 0.0001:
            means_old = copy.deepcopy(means)
            numlog = [1] * K                            # sample count per cluster
            sumlog = [np.zeros(dim) for k in range(K)]
            # E-step
            for i in range(N):
                dislog = [np.linalg.norm(data[i] - means[k]) for k in range(K)]
                tok = dislog.index(np.min(dislog))
                numlog[tok] += 1                        # one more sample in this cluster
                sumlog[tok] += data[i]                  # accumulate the samples of this cluster
            # M-step
            for k in range(K):
                means[k] = 1.0 / numlog[k] * sumlog[k]
        return means
    def fit(self, data):
        n_samples = data.shape[0]
        n_features = data.shape[1]
        K = self.n_clusters
        phi = np.ones(self.n_clusters) / self.n_clusters
        # mu = data[np.random.choice(range(n_samples), self.n_clusters)]
        mu = self.GetInitialMeans(data, 0.03)
        sigma = np.full((self.n_clusters, n_features, n_features), np.diag(np.full(n_features, 0.1)))
        loglikelihood = 0
        oldloglikelihood = 1
        while np.abs(loglikelihood - oldloglikelihood) > 0.00001:
            oldloglikelihood = loglikelihood
            # while self.ITER:
            omega = computeOmega(data, mu, sigma, phi, Gaussian)   # update omega (E-step)
            phi = np.sum(omega, axis=0) / n_samples                # update phi
            # update mu and sigma below (M-step)
            for i in range(self.n_clusters):
                mu[i] = np.sum(data * omega[:, i].reshape((n_samples, 1)), axis=0) / np.sum(omega, axis=0)[i]
                sigma[i] = 0
                for j in range(n_samples):
                    sigma[i] += (data[j].reshape((1, n_features)) - mu[i]).T.dot((data[j] - mu[i]).reshape((1, n_features))) * omega[j, i]
                sigma[i] = sigma[i] / np.sum(omega, axis=0)[i]
            # compute the log-likelihood of the data
            loglikelihood = np.sum(
                [np.log(np.sum([phi[k] * Gaussian(data[n], mu[k], sigma[k]) for k in range(K)])) for n in range(n_samples)])
        self.mu = mu
        self.sigma = sigma
        self.phi = phi
    def predict(self, data):
        pred = computeOmega(data, self.mu, self.sigma, self.phi, Gaussian)
        cluster_results = np.argmax(pred, axis=1)
        return cluster_results
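One caveat: fit computes the log-likelihood as the log of a sum of raw densities, which can underflow in higher dimensions. A numerically safer sketch (my addition, assuming SciPy; logGaussian and loglikelihood_stable are hypothetical helpers) works in log space via scipy.special.logsumexp:

# Log-space variant of the likelihood computation (assumes SciPy)
from scipy.special import logsumexp

def logGaussian(data, mean, cov):
    dim = np.shape(cov)[0]
    covinv = np.linalg.pinv(cov)
    sign, logdet = np.linalg.slogdet(cov)   # log|cov| without under/overflow
    m = data - mean
    return -0.5 * (dim * np.log(2 * np.pi) + logdet + m @ covinv @ m)

def loglikelihood_stable(data, phi, mu, sigma):
    N, K = data.shape[0], len(phi)
    logp = np.array([[np.log(phi[k]) + logGaussian(data[n], mu[k], sigma[k])
                      for k in range(K)] for n in range(N)])
    return np.sum(logsumexp(logp, axis=1))  # log-sum-exp over components, summed over samples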
Running the clustering on the Iris dataset:
import numpy as np
from sklearn.datasets import load_iris
d=load_iris()
features = d.data
feature_names = d.feature_names
target = d.target
target_names = d.target_names
data=features
model1=MyGMM(3,50)
model1.fit(data)
result=model1.predict(data)
The contents of result:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)
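Since Iris ships with ground-truth labels, the clustering can be scored with the adjusted Rand index, which ignores the arbitrary numbering of the clusters (my addition; scikit-learn is already a dependency here):

# Score the clustering against the true labels; the ARI is permutation-invariant
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(target, result))   # 1.0 would be a perfect recovery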
import numpy as np
# Load and preprocess the data
def loadData(filename):
    dataSet = []
    fr = open(filename)
    for line in fr.readlines():
        curLine = line.strip().split(' ')
        fltLine = list(map(float, curLine))
        dataSet.append(fltLine)
    return dataSet
# Multivariate Gaussian probability density
def Gaussian(data, mean, cov):
    dim = np.shape(cov)[0]              # dimensionality
    covdet = np.linalg.det(cov)         # determinant |cov|
    covinv = np.linalg.pinv(cov)        # (pseudo-)inverse of cov
    if covdet == 0:                     # guard against a singular covariance
        covdet = np.linalg.det(cov + np.eye(dim) * 0.01)
        covinv = np.linalg.pinv(cov + np.eye(dim) * 0.01)
    m = data - mean
    z = -0.5 * np.dot(np.dot(m, covinv), m)   # exponent of the Gaussian
    return 1.0 / np.power(np.power(2 * np.pi, dim) * abs(covdet), 0.5) * np.exp(z)  # density value
# Get the initial cluster centers
def GetInitialMeans(data, K, criterion):
    dim = data.shape[1]                 # data dimensionality
    means = [[] for k in range(K)]      # holds the means
    minmax = []
    for i in range(dim):
        minmax.append(np.array([min(data[:, i]), max(data[:, i])]))  # min/max of each dimension
    minmax = np.array(minmax)
    while True:
        for i in range(K):
            means[i] = []
            for j in range(dim):
                means[i].append(np.random.random() * (minmax[j][1] - minmax[j][0]) + minmax[j][0])  # draw random means
            means[i] = np.array(means[i])
        if isdistance(means, criterion):
            break
    return means
# Check that no two initial means are too close to each other
def isdistance(means, criterion=0.03):
    K = len(means)
    for i in range(K):
        for j in range(i + 1, K):
            if criterion > np.linalg.norm(means[i] - means[j]):
                return False
    return True
# K-means, used to get a rough grouping of the samples and initial means for the GMM
import copy
def Kmeans(data, K):
    N = data.shape[0]       # number of samples
    dim = data.shape[1]     # sample dimensionality
    means = GetInitialMeans(data, K, 0.03)
    means_old = [np.zeros(dim) for k in range(K)]
    # convergence condition
    while np.sum([np.linalg.norm(means_old[k] - means[k]) for k in range(K)]) > 0.0001:
        means_old = copy.deepcopy(means)
        numlog = [1] * K                            # sample count per cluster
        sumlog = [np.zeros(dim) for k in range(K)]
        # E-step
        for i in range(N):
            dislog = [np.linalg.norm(data[i] - means[k]) for k in range(K)]
            tok = dislog.index(np.min(dislog))
            numlog[tok] += 1                        # one more sample in this cluster
            sumlog[tok] += data[i]                  # accumulate the samples of this cluster
        # M-step
        for k in range(K):
            means[k] = 1.0 / numlog[k] * sumlog[k]
    return means
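If scikit-learn is acceptable as a dependency, the initialization can also be delegated to sklearn.cluster.KMeans instead of the hand-rolled loop above; a minimal sketch (sklearn_means is a hypothetical helper, not part of the original code):

# Optional: initial means via scikit-learn's K-means
from sklearn.cluster import KMeans

def sklearn_means(data, K):
    km = KMeans(n_clusters=K, n_init=10).fit(data)
    return [center for center in km.cluster_centers_]   # list of K mean vectors, like Kmeans(data, K)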
The main GMM routine:
def GMM(data, K, ITER):
    N = data.shape[0]
    dim = data.shape[1]
    means = Kmeans(data, K)
    # means = GetInitialMeans(data, K, 0.03)
    convs = [0] * K
    # initialize every covariance to the covariance of the whole data set
    for i in range(K):
        convs[i] = np.cov(data.T)
    # convs = np.full((K, dim, dim), np.diag(np.full(dim, 0.1)))
    phi = [1.0 / K] * K
    omega = [np.zeros(K) for i in range(N)]
    loglikelihood = 0
    oldloglikelihood = 1
    while np.abs(loglikelihood - oldloglikelihood) > 0.00001:
        # print(np.abs(loglikelihood - oldloglikelihood))
        # while ITER:
        oldloglikelihood = loglikelihood
        # E-step
        for i in range(N):
            res = [phi[k] * Gaussian(data[i], means[k], convs[k]) for k in range(K)]
            sumres = np.sum(res)
            for k in range(K):   # omega[i][k] is the probability that sample i belongs to component k
                omega[i][k] = res[k] / sumres
        # M-step
        for k in range(K):
            Nk = np.sum([omega[n][k] for n in range(N)])   # effective number of samples in component k
            phi[k] = 1.0 * Nk / N
            means[k] = (1.0 / Nk) * np.sum([omega[n][k] * data[n] for n in range(N)], axis=0)
            xdiffs = data - means[k]
            convs[k] = (1.0 / Nk) * np.sum([omega[n][k] * xdiffs[n].reshape(dim, 1) * xdiffs[n] for n in range(N)], axis=0)
        # compute the log-likelihood of the data
        loglikelihood = np.sum(
            [np.log(np.sum([phi[k] * Gaussian(data[n], means[k], convs[k]) for k in range(K)])) for n in range(N)])
        ITER -= 1   # decremented but unused: convergence is checked on the likelihood instead
        # print(oldloglikelihood, loglikelihood)
    return phi, means, convs
Loading the Iris dataset:
# dataSet=loadData('d:/watermelon4.txt')
# means=GetInitialMeans(np.array(dataSet),3,0.03)
import numpy as np
from sklearn.datasets import load_iris
d=load_iris()
features = d.data
feature_names = d.feature_names
target = d.target
target_names = d.target_names
data=features
Predicting on the data:
def computeOmega(X, mu, sigma, phi, multiGaussian):
    n_samples = X.shape[0]
    n_clusters = len(phi)
    gamma = np.zeros((n_samples, n_clusters))
    p = np.zeros(n_clusters)
    g = np.zeros(n_clusters)
    for i in range(n_samples):
        for j in range(n_clusters):
            p[j] = multiGaussian(X[i], mu[j], sigma[j])
            g[j] = phi[j] * p[j]
        for k in range(n_clusters):
            gamma[i, k] = g[k] / np.sum(g)
    return gamma
def predict(data, p, m, c):
    pred = computeOmega(data, m, c, p, Gaussian)
    cluster_results = np.argmax(pred, axis=1)
    return cluster_results
p,m,c=GMM(data,3,50)
predict(data,p,m,c)
The result:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
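As a final cross-check (my addition, not part of the original post), scikit-learn's built-in sklearn.mixture.GaussianMixture can be fitted to the same data; since the cluster indices may come out permuted, the adjusted Rand index is again the fair comparison:

# Cross-check against scikit-learn's GMM
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

gm = GaussianMixture(n_components=3, max_iter=50).fit(data)
sk_labels = gm.predict(data)
print(adjusted_rand_score(target, sk_labels))                   # sklearn vs. ground truth
print(adjusted_rand_score(sk_labels, predict(data, p, m, c)))   # sklearn vs. this post's GMM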