Python: IRLS-based Matrix Completion

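The script below is a small IRLS (iteratively reweighted least squares) style matrix completion routine: missing entries of a data matrix are filled in by alternating gradient steps on the unobserved entries with a reweighting based on the smoothed spectrum of the current estimate. As a rough sketch of one outer iteration of imputation() (a summary of the code in the class attributes' notation, not a full derivation), with the thin SVD X_k = U_k diag(sigma_i) V_k^T:

    gradient step:  X <- (X - s * W_k X) on the missing entries, observed entries clamped to X_obr
    weight update:  W_k = U_k diag( 1 / sqrt(sigma_i^2 + epsilon_k^2) ) U_k^T
    smoothing:      epsilon_{k+1} = min(epsilon_k, gamma * sigma_min(X_k))

The epsilon term keeps W_k well defined when X_k is (nearly) low rank, and shrinking epsilon drives the smoothed surrogate towards a low-rank completion.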
"""
Daniel He
at CQUPT
"""
import os
import xlrd
from time import time
from copy import deepcopy
import numpy as np
import pandas as pd
from scipy.linalg import svd
from sklearn.preprocessing import StandardScaler

class mc_IRLS():
    def __init__(self, X, mask):
        self.X = X
        self.nSample, self.nDim = self.X.shape
        self.mask = mask                  # 1 = observed entry, 0 = missing entry
        self.un_mask = 1 - mask           # complement of the mask (missing entries)
        self.W = np.eye(self.nSample)     # IRLS weight matrix, initialised to the identity
        self.max_inner_iter = 100
        self.max_outer_iter = 100
        self.tol = 1e-6
        self.epsilon = 1.0                # smoothing parameter for the singular values
        self.gamma = 0.5                  # shrink factor applied to epsilon
        self.X_obr = self.X * self.mask   # observed part of X (missing entries set to zero)
        self.X_mc = None                  # completed matrix, filled in by imputation()


    def svd_perturbation(self, X):
        """Return X with its singular values smoothed to sqrt(s_i^2 + epsilon^2)."""
        U, S, Vt = svd(X, full_matrices=False)
        S_perturbation = np.sqrt(S**2 + self.epsilon**2)
        A = U.dot(np.diag(S_perturbation)).dot(Vt)
        return A


    def imputation(self):
        X_t = self.X_obr
        count = 0
        for i in range(self.max_outer_iter):
            count += 1
            print("count=", count)
            # -------------------- step size s
            # s = 0.5 / np.linalg.norm(self.W, ord='fro')
            s = 0.001
            # -------------------- gradient descent on the weighted surrogate,
            # updating only the missing entries and clamping the observed ones to X_obr
            X_t_old = deepcopy(X_t)
            for j in range(self.max_inner_iter):
                X_t = (X_t - s * self.W.dot(X_t)) * self.un_mask + self.X_obr
                self.X_mc = X_t
                if np.sum(abs(X_t - X_t_old)) < self.tol:
                    print("======================= inner loop converged ===")
                    break
                X_t_old = deepcopy(X_t)
            # -------------------- update the weight matrix W
            U, S, Vt = svd(X_t, full_matrices=False)

            # shrink the smoothing parameter towards the smallest singular value
            self.epsilon = np.min([self.epsilon, self.gamma * S[-1]])
            # -------------------- perturbed singular values; W is the inverse square root
            # of the perturbed Gram matrix X_t X_t^T + epsilon^2 I on the range of X_t
            S_perturbation = np.sqrt(S**2 + self.epsilon**2)
            self.W = U.dot(np.diag(1.0 / S_perturbation)).dot(U.T)
            if self.epsilon == 0:
                print("count=", count)
                break


if __name__ == '__main__':

    name_list = ["Abalone-5bin","Abalone-10bin","ARWU2020-5bin","ARWU2020-10bin",
                 "Automobile","Automobile1","Balance-scale","Bank-5bin","Bank-10bin",
                 "California-10bin","Car","Cleveland","Connect",
                 "Computer-5bin","Computer-10bin","CTGs","Eucalyptus",
                 "Glass","Housing-5bin","Housing-10bin","Knowledge",
                 "Marketing","Melanoma","Newthyroid","Nursery",
                 "Obesity1","Obesity2","PowerPlant-10bin","SWD",
                 "Stock-5bin","Stock-10bin","Thyroid1","Thyroid2",
                 "Toy","Winequality-red"]

    name_list = ["Stock-5bin",]

    data_path = r"D:\Chapter5\DataSet"
    partition_path = r"D:\Chapter5\PartitionMissing\20"
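    # expected on-disk layout (an assumption inferred from the reads below):
    #   <data_path>\<name>.csv             raw data, last column is the class label
    #   <partition_path>\<name>-index.xls  one sheet per split: column 0 = train ids, column 1 = test ids
    #   <partition_path>\<name>-mask.xls   0/1 observed-entry mask for the training rows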
    for name in name_list:
        read_data_path = os.path.join(data_path, name+".csv")
        data = np.array(pd.read_csv(read_data_path, header=None))
        scaler = StandardScaler()
        X = np.asarray(data[:,:-1])
        X = scaler.fit_transform(X)
        y = data[:,-1].astype(int)
        y -= y.min()
        nSample, nDim = X.shape

        '''read the data partition and mask information'''
        index_path = os.path.join(partition_path, name + "-index.xls")
        mask_path = os.path.join(partition_path, name + "-mask.xls")
        book_index = xlrd.open_workbook(index_path)
        book_mask = xlrd.open_workbook(mask_path)

        for sn in book_index.sheet_names():
            print("================{}".format(sn))
            S_Time = time()
            table_index = book_index.sheet_by_name(sn)
            table_mask = book_mask.sheet_by_name(sn)

            train_ids = []
            test_ids = []
            # labeled_index = []
            for idx in table_index.col_values(0):
                if isinstance(idx, float):
                    train_ids.append(int(idx))
            for idx in table_index.col_values(1):
                if isinstance(idx, float):
                    test_ids.append(int(idx))


            mask = np.zeros((len(train_ids),nDim))
            for i in range(len(train_ids)):
                for j in range(nDim):
                    mask[i,j] = table_mask.cell_value(i,j)

            model = mc_IRLS(X[train_ids], mask)
            model.imputation()
            X_mc = model.X_mc
            recon_error = np.linalg.norm(X_mc - X[train_ids], ord='fro')
            print("Reconstruction error:", recon_error)


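The driver above depends on local CSV files and xlrd partition/mask workbooks. As a quick self-contained check, the class can also be exercised on synthetic data; the snippet below is only an illustrative sketch (the low-rank matrix, the roughly 20% missing rate and the seed are arbitrary choices, not part of the pipeline above), and how well the fixed step size s = 0.001 reconstructs the matrix will depend on the data.

import numpy as np

# synthetic low-rank data: 200 samples, 10 features, rank 3
rng = np.random.default_rng(0)
X_true = rng.standard_normal((200, 3)) @ rng.standard_normal((3, 10))

# hide roughly 20% of the entries uniformly at random (1 = observed, 0 = missing)
mask = (rng.random(X_true.shape) > 0.2).astype(float)

model = mc_IRLS(X_true, mask)
model.imputation()
X_mc = model.X_mc

# report the error only on the entries that were hidden
missing = mask == 0
print("RMSE on missing entries:",
      np.sqrt(np.mean((X_mc[missing] - X_true[missing]) ** 2)))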