"""
Daniel He
at CQUPT
"""
import os
import xlrd
import xlwt
import copy
from time import time
from copy import deepcopy
import numpy as np
import pandas as pd
from scipy.linalg import svd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, mutual_info_score
class mc_IRLS():
    """Matrix completion driver in the iteratively-reweighted-least-squares style.

    The model keeps the observed part of ``X`` (``X * mask``) fixed and
    alternates between an inner update of the current estimate and an outer
    update of the smoothing parameter ``epsilon`` and the weight matrix ``W``.

    Attributes set by ``__init__``:
        X, mask      -- the inputs, stored as given.
        un_mask      -- ``1 - mask`` (complement of ``mask``).
        X_obr        -- ``X * mask``; entries where ``mask`` is 0 are zeroed.
        W            -- weight matrix, initialised to ones of shape
                        ``(nSample, nDim)``; ``imputation`` replaces it with an
                        ``(nSample, nSample)`` matrix.
        epsilon      -- smoothing added to the singular values (starts at 1.0).
        gamma        -- shrink factor applied to the smallest singular value
                        when ``epsilon`` is updated.
        tol          -- inner-loop stopping threshold on the summed absolute
                        change of the estimate.
        X_mc         -- completed matrix; ``None`` until ``imputation`` runs.
    """

    def __init__(self, X, mask):
        """Store the data and mask and initialise the solver state.

        Args:
            X: 2-D array of shape ``(nSample, nDim)``.
            mask: array broadcastable against ``X``; it multiplies ``X``
                element-wise to form the observed matrix.
                # NOTE(review): presumably a 0/1 indicator with 1 = observed
                # -- confirm against the mask files.
        """
        self.X = X
        self.nSample, self.nDim = self.X.shape
        self.mask = mask
        self.un_mask = 1 - mask
        self.W = np.ones((self.nSample, self.nDim))
        self.max_inner_iter = 100
        self.max_outer_iter = 100
        self.tol = 1e-6
        self.epsilon = 1.0
        self.gamma = 0.5
        self.X_obr = self.X * self.mask
        self.X_mc = None

    def svd_perturbation(self, X):
        """Return ``X`` with every singular value ``s`` replaced by ``sqrt(s**2 + epsilon**2)``.

        Args:
            X: 2-D array.

        Returns:
            Array of the same shape as ``X`` rebuilt from the thin SVD with
            the smoothed singular values.
        """
        U, S, Vt = svd(X, full_matrices=False)
        # The two squares must be summed; passing epsilon**2 as a second
        # positional argument would be interpreted as the ufunc's ``out``.
        S_perturbation = np.sqrt(S**2 + self.epsilon**2)
        return U.dot(np.diag(S_perturbation)).dot(Vt)

    def imputation(self):
        """Run the outer/inner iterations and store the result in ``self.X_mc``.

        Side effects: updates ``self.X_mc``, ``self.epsilon`` and ``self.W``
        and prints progress messages. Returns ``None``.
        """
        X_t = self.X_obr
        count = 0
        for i in range(self.max_outer_iter):
            count += 1
            print("count=",count)
            # -------------------- step size s (fixed)
            s = 0.001
            # -------------------- inner descent loop
            X_t_old = deepcopy(X_t)
            for j in range(self.max_inner_iter):
                # NOTE(review): this update does not reference self.W; it only
                # shrinks the current estimate by (1 - s) on the un_mask
                # entries and re-imposes the observed values. Confirm the
                # intended gradient before relying on the output.
                X_t = (X_t - s * X_t) * self.un_mask + self.X_obr
                self.X_mc = X_t
                if np.sum(abs(X_t - X_t_old)) < self.tol:
                    print("=======================断开===")
                    break
                X_t_old = deepcopy(X_t)
            # -------------------- update epsilon and the weights W
            U, S, Vt = svd(X_t, full_matrices=False)
            # S is sorted in non-increasing order and has min(nSample, nDim)
            # entries, so the smallest singular value is S[-1] for any shape.
            self.epsilon = np.min([self.epsilon, self.gamma * S[-1]])
            if self.epsilon == 0:
                # The smallest smoothed singular value is zero here, so the
                # weight matrix below would not be finite; stop first and keep
                # the previous W.
                print("count=",count)
                break
            # -------------------- smoothed singular values
            S_perturbation = np.sqrt(S**2 + self.epsilon**2)
            # With X_p = U diag(S_p) Vt we have X_p X_p^T = U diag(S_p^2) U^T,
            # whose (pseudo-)inverse matrix square root is U diag(1/S_p) U^T.
            # Building it from the SVD avoids inverting a rank-deficient
            # (nSample x nSample) matrix and taking an element-wise root.
            self.W = (U / S_perturbation) @ U.T
if __name__ == '__main__':
    # Catalogue of the data set names this script knows about.
    name_list = [
        "Abalone-5bin", "Abalone-10bin", "ARWU2020-5bin", "ARWU2020-10bin",
        "Automobile", "Automobile1", "Balance-scale", "Bank-5bin", "Bank-10bin",
        "California-10bin", "Car", "Cleveland", "Connect",
        "Computer-5bin", "Computer-10bin", "CTGs", "Eucalyptus",
        "Glass", "Housing-5bin", "Housing-10bin", "Knowledge",
        "Marketing", "Melanoma", "Newthyroid", "Nursery",
        "Obesity1", "Obesity2", "PowerPlant-10bin", "SWD",
        "Stock-5bin", "Stock-10bin", "Thyroid1", "Thyroid2",
        "Toy", "Winequality-red",
    ]
    # The catalogue above is overridden: only this subset is processed.
    name_list = ["Stock-5bin",]

    script_dir = os.path.dirname(__file__)
    data_path = os.path.join(script_dir, r"D:\Chapter5\DataSet")
    patition_path = os.path.join(script_dir, r"D:\Chapter5\PartitionMissing\20")

    for name in name_list:
        # Load the CSV (no header row); the last column is the label.
        csv_file = os.path.join(data_path, name + ".csv")
        data = np.array(pd.read_csv(csv_file, header=None))
        scaler = StandardScaler()
        X = scaler.fit_transform(np.asarray(data[:, :-1]))
        y = data[:, -1].astype(int)
        y -= y.min()  # shift labels so the smallest one is 0
        nSample, nDim = X.shape

        # Read the data partition and mask information.
        book_index = xlrd.open_workbook(os.path.join(patition_path, name + "-index.xls"))
        book_mask = xlrd.open_workbook(os.path.join(patition_path, name + "-mask.xls"))

        for sn in book_index.sheet_names():
            print("================{}".format(sn))
            S_Time = time()  # start timestamp (not read later in this block)
            table_index = book_index.sheet_by_name(sn)
            table_mask = book_mask.sheet_by_name(sn)

            # Column 0 holds training ids, column 1 test ids; only float
            # cells are taken, other cell types are skipped.
            train_ids = [int(cell) for cell in table_index.col_values(0)
                         if isinstance(cell, float)]
            test_ids = [int(cell) for cell in table_index.col_values(1)
                        if isinstance(cell, float)]

            # One mask row per training sample, one column per feature.
            mask = np.zeros((len(train_ids), nDim))
            for row in range(mask.shape[0]):
                mask[row, :] = [table_mask.cell_value(row, col) for col in range(nDim)]

            X_train = X[train_ids]
            model = mc_IRLS(X_train, mask)
            model.imputation()
            X_mc = model.X_mc

            # Frobenius norm of the difference between the completed and the
            # original training matrix.
            mse = np.linalg.norm(X_mc - X_train, ord='fro')
            print("重构误差::",mse)