# EM: expectation maximization

import xlwt
import xlrd
import os
import numpy as np
import pandas as pd
from collections import OrderedDict
from pathlib import Path
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from time import time
from sklearn.metrics.pairwise import pairwise_distances
from numpy.linalg import inv
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score, mutual_info_score
from sklearn.neighbors import NearestNeighbors
from numpy.linalg import LinAlgError
from scipy.spatial.distance import pdist, squareform
from scipy.special import gammaln
# from ycimpute import EM
import copy
np.seterr(divide='ignore',invalid='ignore')


class EM():
    def __init__(self,max_iter=100,theta=1e-5):
        self.max_iter = max_iter
        self.theta = theta
    def _init_parameters(self, X):
        rows, cols = X.shape
        mu_init = np.nanmean(X, axis=0)
        sigma_init = np.zeros((cols, cols))
        for i in range(cols):
            for j in range(i, cols):
                vec_col = X[:, [i, j]]
                vec_col = vec_col[~np.any(np.isnan(vec_col), axis=1), :].T
                if len(vec_col) > 0:
                    cov = np.cov(vec_col)
                    cov = cov[0, 1]
                    sigma_init[i, j] = cov
                    sigma_init[j, i] = cov

                else:
                    sigma_init[i, j] = 1.0
                    sigma_init[j, i] = 1.0

        return mu_init, sigma_init

    def _e_step(self, mu,sigma, X):
        samples,_ = X.shape
        for sample in range(samples):
            if np.any(np.isnan(X[sample,:])):
                loc_nan = np.isnan(X[sample,:])
                new_mu = np.dot(sigma[loc_nan, :][:, ~loc_nan],
                                np.dot(np.linalg.inv(sigma[~loc_nan, :][:, ~loc_nan]),
                                       (X[sample, ~loc_nan] - mu[~loc_nan])[:,np.newaxis]))
                nan_count = np.sum(loc_nan)
                X[sample, loc_nan] = mu[loc_nan] + new_mu.reshape(1,nan_count)

        return X

    def _m_step(self,X):
        rows, cols = X.shape
        mu = np.mean(X, axis=0)
        sigma = np.cov(X.T)
        tmp_theta = -0.5 * rows * (cols * np.log(2 * np.pi) +
                                   np.log(np.linalg.det(sigma)))

        return mu, sigma,tmp_theta

    def solve(self, X, missing_mask):
        mu, sigma = self._init_parameters(X)
        complete_X,updated_X = None, None
        rows,_ = X.shape
        theta = -np.inf
        for iter in range(self.max_iter):
            updated_X = self._e_step(mu=mu, sigma=sigma, X=copy.copy(X))
            mu, sigma, tmp_theta = self._m_step(updated_X)
            for i in range(rows):
                tmp_theta -= 0.5 * np.dot((updated_X[i, :] - mu),
                                          np.dot(np.linalg.inv(sigma), (updated_X[i, :] - mu)[:, np.newaxis]))
            if abs(tmp_theta-theta)

你可能感兴趣的:(python,numpy,开发语言)