Public functions — imputena documentation: https://imputena.readthedocs.io/en/latest/functions.html#multiple-imputation-by-chained-equations
https://github.com/macarro/imputena/blob/master/test/multiple_imputation/test_mice.py
import logging
import unittest
import numpy as np
import pandas as pd
from imputena import mice
def generate_df_breast_cancer():
    """
    Build the example dataframe used to test logistic regression.

    Adapted from: Breast Cancer Wisconsin (Diagnostic) Data Set (UCI Machine
    Learning Repository).

    The frame has 30 rows and 7 columns and contains 15 missing values
    (2 in ``thickness``, 3 in ``uniformity``, 3 in ``size`` and 7 in
    ``class``).

        thickness  uniformity  adhesion  size  nucleoli  mitoses  class
    0         1.0         1.0         1   1.0         1        1      B
    1         1.0         1.0         1   2.0         1        1      B
    2         8.0         4.0         3   3.0         3        1      B
    3         4.0         1.0         1   2.0         6        1      B
    4        10.0         8.0         4   4.0        10        4   None
    5         5.0         1.0         1   2.0         2        1      B
    6         NaN        10.0        10   3.0         6        1   None
    7         3.0         3.0         1   2.0         1        1      B
    8         3.0         NaN         1   2.0         1        1   None
    9         2.0         3.0         1   5.0         1        1      B
    10        NaN         1.0         1   NaN         1        1   None
    11        5.0         2.0         2   1.0         1        1      B
    12       10.0         NaN         2   NaN         7        1   None
    13        7.0         8.0         2   4.0         8        2      M
    14        8.0         4.0         1   3.0         9        2   None
    15        1.0         1.0         1   2.0         1        1      B
    16        4.0         1.0         1   2.0         1        1   None
    17        1.0         2.0         1   2.0         1        1      B
    18       10.0         NaN         4   NaN        10        1      M
    19        1.0         1.0         1   2.0         1        1      B
    20        3.0         1.0         1   2.0         1        1      B
    21        5.0         1.0         1   2.0         1        1      B
    22        4.0         1.0         1   2.0         1        1      B
    23        8.0         4.0         1   2.0         3        1      M
    24        8.0         7.0         4   5.0        10        1      M
    25       10.0         4.0        10   4.0         1        1      M
    26        8.0         3.0         9   3.0         3        1      M
    27        8.0        10.0         8   7.0         7        1      M
    28        6.0         1.0         1   2.0         1        1      B
    29        4.0         1.0         1   2.0         1        1      B

    Returns:
        pandas.DataFrame: a freshly constructed frame on every call, so the
        caller may modify it freely.
    """
    return pd.DataFrame({
        'thickness': np.array(
            [1.0, 1.0, 8.0, 4.0, 10.0, 5.0, np.nan, 3.0, 3.0, 2.0, np.nan, 5.0,
             10.0, 7.0, 8.0, 1.0, 4.0, 1.0, 10.0, 1.0, 3.0, 5.0, 4.0, 8.0, 8.0,
             10.0, 8.0, 8.0, 6.0, 4.0]),
        'uniformity': np.array(
            [1.0, 1.0, 4.0, 1.0, 8.0, 1.0, 10.0, 3.0, np.nan, 3.0, 1.0, 2.0,
             np.nan, 8.0, 4.0, 1.0, 1.0, 2.0, np.nan, 1.0, 1.0, 1.0, 1.0, 4.0,
             7.0, 4.0, 3.0, 10.0, 1.0, 1.0]),
        'adhesion': np.array(
            [1, 1, 3, 1, 4, 1, 10, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 4, 1, 1, 1,
             1, 1, 4, 10, 9, 8, 1, 1]),
        'size': np.array(
            [1.0, 2.0, 3.0, 2.0, 4.0, 2.0, 3.0, 2.0, 2.0, 5.0, np.nan, 1.0,
             np.nan, 4.0, 3.0, 2.0, 2.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0,
             5.0, 4.0, 3.0, 7.0, 2.0, 2.0]),
        'nucleoli': np.array(
            [1, 1, 3, 6, 10, 2, 6, 1, 1, 1, 1, 1, 7, 8, 9, 1, 1, 1, 10, 1, 1,
             1, 1, 3, 10, 1, 3, 7, 1, 1]),
        'mitoses': np.array(
            [1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1]),
        # None marks a missing label; the column is stored with object dtype.
        'class': ['B', 'B', 'B', 'B', None, 'B', None, 'B', None, 'B', None,
                  'B', None, 'M', None, 'B', None, 'B', 'M', 'B', 'B', 'B',
                  'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B'],
    })
# Multiple imputation with imputena's mice(): impute three times, then
# average the three results element-wise.
#
# To try the built-in example frame instead of the CSV file, use:
# df = generate_df_breast_cancer()
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)

# The following line is required: with header=None the column labels are
# integers, and they are renamed here to the strings "s0", "s1", ...
# (presumably mice() needs string column names -- TODO confirm).
df.columns = [f"s{i}" for i in df.columns]

dfs = mice(df, imputations=3)

# For debugging, each dfs[k] can be printed together with
# dfs[k].isna().sum().sum() to check how many values are still missing.
data_0, data_1, data_2 = dfs[0], dfs[1], dfs[2]

# Element-wise mean of the three imputed data sets.
data_mean = (data_0 + data_1 + data_2) / 3
print(data_mean)
非调包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
class MiceImputer(object):
    """Impute missing values column by column with chained regressions.

    Missing entries are first given starting values (either with
    ``SimpleImputer`` or by regressing on the fully observed columns) and
    are then repeatedly re-predicted, one column at a time, from all the
    other columns.

    Attributes set by ``fit_transform``:
        model_scores_: dict mapping the iteration number to the mean
            validation score of the models fitted in that iteration
            (0 when no model was fitted).
    """

    # Values accepted for the ``method`` argument of ``fit_transform``.
    _METHODS = ('Linear', 'Ridge')

    def __init__(self, seed_values=True, seed_strategy="mean", copy=True):
        """Configure how missing values are initialised.

        Args:
            seed_values: if True, starting values come from ``SimpleImputer``;
                if False, they are predicted from the fully observed columns.
            seed_strategy: strategy passed to ``SimpleImputer``
                ('mean', 'median', 'most_frequent' or 'constant').
            copy: if False and ``seed_values`` is False, the frame passed to
                ``fit_transform`` is modified in place. Also forwarded to
                ``SimpleImputer``.
        """
        self.strategy = seed_strategy
        self.seed_values = seed_values
        self.copy = copy
        self.imp = SimpleImputer(strategy=self.strategy, copy=self.copy)

    @staticmethod
    def _make_model(method, regression):
        """Return an unfitted regressor (or classifier) for *method*."""
        if method == 'Linear':
            if regression:
                return LinearRegression(n_jobs=-1)
            return LogisticRegression(n_jobs=-1, solver='lbfgs')
        if method == 'Ridge':
            if regression:
                return Ridge()
            return RidgeClassifier()
        raise ValueError(
            "method must be 'Linear' or 'Ridge', got " + repr(method))

    def fit_transform(self, X, method='Linear', iter=5, verbose=True):
        """Return *X* with its missing values imputed.

        Args:
            X: pandas DataFrame containing the data to impute.
            method: 'Linear' (LinearRegression / LogisticRegression) or
                'Ridge' (Ridge / RidgeClassifier), used in the iterations.
            iter: number of passes over the columns that have missing values.
            verbose: if True, print progress and per-column model scores.

        Returns:
            pandas.DataFrame with the same index and columns as *X*.

        Raises:
            ValueError: if *method* is not supported, or if
                ``seed_values`` is False and values are missing but no
                column is fully observed.
        """
        # Fail early instead of hitting an unbound (or stale) model later on.
        if method not in self._METHODS:
            raise ValueError(
                "method must be 'Linear' or 'Ridge', got " + repr(method))

        # Remember where the gaps were: only those cells are ever overwritten.
        null_cols = X.columns[X.isna().any()].tolist()
        null_X = X.isna()[null_cols]

        ### Initialize missing values
        if self.seed_values:
            if verbose:
                print('Initilization of missing-values using SimpleImputer')
            new_X = pd.DataFrame(self.imp.fit_transform(X))
            new_X.columns = X.columns
            new_X.index = X.index
        else:
            new_X = X.copy() if self.copy else X
            # Predictors must be complete, hence all() rather than any().
            not_null_cols = X.columns[X.notna().all()].tolist()
            if verbose:
                print('Initilization of missing-values using regression on non-null columns')
            if null_cols and not not_null_cols:
                raise ValueError(
                    'seed_values=False needs at least one column without '
                    'missing values')
            for column in null_cols:
                null_rows = null_X[column]
                n_unique = X[column].nunique()
                if n_unique < 2:
                    # Nothing to learn from a constant or empty column.
                    continue
                train_x = new_X.loc[~null_rows, not_null_cols]
                test_x = new_X.loc[null_rows, not_null_cols]
                train_y = new_X.loc[~null_rows, column]
                m = self._make_model('Linear', regression=n_unique > 2)
                m.fit(train_x, train_y)
                # Assign the raw array: a pd.Series would be aligned on its
                # own 0..k-1 index and miss the target rows.
                new_X.loc[null_rows, column] = m.predict(test_x)
                # The column is complete now and can serve as a predictor.
                not_null_cols.append(column)

        ### Begin iterations of MICE
        model_score = {}
        for i in range(iter):
            if verbose:
                print('Beginning iteration ' + str(i) + ':')
            scores = []
            for column in null_cols:
                null_rows = null_X[column]
                # NOTE(review): the model type is chosen from the current,
                # partly imputed values, so a mean-seeded binary column is
                # treated as continuous -- confirm this is intended.
                n_unique = new_X[column].nunique()
                if n_unique < 2:
                    continue
                features = new_X.drop(column, axis=1)
                train_x, val_x, train_y, val_y = train_test_split(
                    features.loc[~null_rows],
                    new_X.loc[~null_rows, column],
                    test_size=0.33,
                    random_state=42,
                )
                m = self._make_model(method, regression=n_unique > 2)
                m.fit(train_x, train_y)
                score = m.score(val_x, val_y)
                scores.append(score)
                # Predict only the originally missing rows and write the
                # array positionally, so any index on X is handled.
                new_X.loc[null_rows, column] = m.predict(
                    features.loc[null_rows])
                if verbose:
                    print('Model score for ' + str(column) + ': ' + str(score))
            model_score[i] = sum(scores) / len(scores) if scores else 0

        self.model_scores_ = model_score
        return new_X
# Demo: impute the incomplete data set with the hand-written MiceImputer and
# plot the first two columns, coloured by the last column of the complete
# data set.
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)
# Integer column labels (from header=None) become "s0", "s1", ...
df.columns = [f"s{i}" for i in df.columns]

mice = MiceImputer()
Z = mice.fit_transform(df)
print(Z)

# The last column of the complete file supplies the colour of each point.
data_complete = np.array(
    pd.read_csv(
        r"F:\Latex_IDA\example_orginal.csv",
        header=None,
        index_col=None,
    )
)
y = data_complete[:, -1]

# Quarter circles of radius 5 and radius 9, drawn as dashed reference curves.
n1 = np.linspace(0, 5, 1000)
m1 = np.sqrt(25 - n1 ** 2)
n2 = np.linspace(0, 9, 1000)
m2 = np.sqrt(81 - n2 ** 2)

plt.figure(figsize=(6, 6))
plt.scatter(Z["s0"], Z["s1"], c=y, edgecolors='k', linewidths=0.5)
plt.plot(n1, m1, ls='--', lw=1, color='k')
plt.plot(n2, m2, ls='--', lw=1, color='k')
# To save the figure instead of only showing it:
# plt.savefig(r"F:\Latex_IDA\example_ICkNNI.pdf",dpi=400,bbox_inches='tight')
plt.show()
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from colorama import Fore, Style, init
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Demo: impute the same data set with scikit-learn's IterativeImputer
# (linear-regression estimator, mean initialisation) and draw the same plot.
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)
# Integer column labels (from header=None) become "s0", "s1", ...
df.columns = [f"s{i}" for i in df.columns]

imp = IterativeImputer(
    estimator=LinearRegression(),
    initial_strategy="mean",
    max_iter=10,
    tol=1e-10,
    random_state=0,
)
Z = imp.fit_transform(df)
print(Z)

# The last column of the complete file supplies the colour of each point.
data_complete = np.array(
    pd.read_csv(
        r"F:\Latex_IDA\example_orginal.csv",
        header=None,
        index_col=None,
    )
)
y = data_complete[:, -1]

# Quarter circles of radius 5 and radius 9, drawn as dashed reference curves.
n1 = np.linspace(0, 5, 1000)
m1 = np.sqrt(25 - n1 ** 2)
n2 = np.linspace(0, 9, 1000)
m2 = np.sqrt(81 - n2 ** 2)

plt.figure(figsize=(6, 6))
# Z is indexed positionally here, unlike the label-based Z["s0"] used with
# the MiceImputer result.
plt.scatter(Z[:, 0], Z[:, 1], c=y, edgecolors='k', linewidths=0.5)
plt.plot(n1, m1, ls='--', lw=1, color='k')
plt.plot(n2, m2, ls='--', lw=1, color='k')
plt.show()
显然这样的填充看起来很不地道!