import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from fancyimpute import KNN
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC
train_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛训练集.csv', encoding='gbk')
test_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛测试集.csv', encoding='gbk')
print(train_df.shape, test_df.shape)
print(train_df.dtypes, test_df.dtypes)
def do_nan(train_df, test_df, method=None):
    # 口服耐糖量测试 values of -1 (and 0) are really missing, so turn them into NaN
    train_df['口服耐糖量测试'] = train_df['口服耐糖量测试'].replace([-1, 0], np.nan)
    test_df['口服耐糖量测试'] = test_df['口服耐糖量测试'].replace([-1, 0], np.nan)
    # A 体重指数 (BMI) of 0 is impossible, so treat it as missing as well
    train_df['体重指数'] = train_df['体重指数'].replace(0, np.nan)
    test_df['体重指数'] = test_df['体重指数'].replace(0, np.nan)
    if method is not None:
        # Optionally also treat zeros in the two high-missing columns as NaN
        train_df['胰岛素释放实验'] = train_df['胰岛素释放实验'].replace(0, np.nan)
        test_df['胰岛素释放实验'] = test_df['胰岛素释放实验'].replace(0, np.nan)
        train_df['肱三头肌皮褶厚度'] = train_df['肱三头肌皮褶厚度'].replace(0, np.nan)
        test_df['肱三头肌皮褶厚度'] = test_df['肱三头肌皮褶厚度'].replace(0, np.nan)
    # Missing-value ratios of the training set
    print(train_df.isnull().mean(0))
    # Missing-value ratios of the test set
    print(test_df.isnull().mean(0))
    # Map 糖尿病家族史 (family history) to a categorical code
    dict_糖尿病家族史 = {
        '无记录': 0,
        '叔叔或姑姑有一方患有糖尿病': 1,
        '叔叔或者姑姑有一方患有糖尿病': 1,
        '父母有一方患有糖尿病': 2
    }
    train_df['糖尿病家族史'] = train_df['糖尿病家族史'].map(dict_糖尿病家族史)
    test_df['糖尿病家族史'] = test_df['糖尿病家族史'].map(dict_糖尿病家族史)
    # Convert 出生年份 (birth year) to age
    train_df['年龄'] = 2022 - train_df['出生年份']
    test_df['年龄'] = 2022 - test_df['出生年份']
    # These three columns have low missing ratios, so fill them directly
    train_df['舒张压'].fillna(89, inplace=True)
    test_df['舒张压'].fillna(89, inplace=True)
    train_df['口服耐糖量测试'].fillna(6, inplace=True)
    test_df['口服耐糖量测试'].fillna(6, inplace=True)
    train_df['体重指数'].fillna(38, inplace=True)
    test_df['体重指数'].fillna(38, inplace=True)
    # 胰岛素释放实验 and 肱三头肌皮褶厚度 have high missing ratios; try a
    # statistical fill (multiple linear regression) trained on complete rows
    all_df = pd.concat([train_df, test_df])
    all_df.drop(['患有糖尿病标识'], axis=1, inplace=True)
    all_df.dropna(inplace=True)
    if method == 'lg':
        lg_g3 = LinearRegression()
        lg_yd = LinearRegression()
        lg_features = [f for f in all_df.columns
                       if f not in ['编号', '肱三头肌皮褶厚度', '胰岛素释放实验']]
        lg_g3.fit(all_df[lg_features], all_df['肱三头肌皮褶厚度'])
        lg_yd.fit(all_df[lg_features], all_df['胰岛素释放实验'])
        # Fill each missing cell with the regression prediction
        # (relies on the default RangeIndex, so position == label)
        for df in (train_df, test_df):
            for col, reg in [('胰岛素释放实验', lg_yd), ('肱三头肌皮褶厚度', lg_g3)]:
                for idx, is_na in enumerate(df[col].isnull()):
                    if is_na:
                        df.loc[idx, col] = reg.predict(
                            np.array(df.loc[idx, lg_features]).reshape(1, -1))[0]
    # Or try a KNN fill (fancyimpute)
    if method == 'knn':
        train_df = pd.DataFrame(KNN(k=5).fit_transform(train_df), columns=train_df.columns)
        test_df = pd.DataFrame(KNN(k=5).fit_transform(test_df), columns=test_df.columns)
    return train_df, test_df
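Typical usage (method=None is the plain mean-fill variant used for the final runs; 'lg' and 'knn' switch on the model-based fills):
# Usage sketch for do_nan; method=None matches the final pipeline below
train_df, test_df = do_nan(train_df, test_df, method=None)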
Tested treating zeros in 胰岛素释放实验 and 肱三头肌皮褶厚度 as missing and imputing them with KNN and a linear fit, but the model results got worse, so in the end I only mean-filled 口服耐糖量测试, 舒张压 and 体重指数.
I'll think of another angle later; this alone took two or three hours.
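As an aside, sklearn ships its own imputer, which would avoid the fancyimpute dependency; a minimal sketch (k=5 kept from above, all-numeric columns assumed):
# Hedged alternative to fancyimpute.KNN using sklearn's KNNImputer;
# fit on the training set only, then reuse the fitted imputer on the test set
from sklearn.impute import KNNImputer

num_cols = train_df.select_dtypes('number').columns.drop('患有糖尿病标识')
imputer = KNNImputer(n_neighbors=5)
train_df[num_cols] = imputer.fit_transform(train_df[num_cols])
test_df[num_cols] = imputer.transform(test_df[num_cols])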
# Correlation of every feature with the label
train_df.corr().loc['患有糖尿病标识', :]
# Per-class histograms for every feature, saved under fig/
for column in train_df.columns:
    if column in ('编号', '患有糖尿病标识'):
        continue
    plt.figure()
    for i in [0, 1]:
        train_df[column][train_df['患有糖尿病标识'] == i].hist()
    plt.title("{}".format(column))
    plt.savefig("fig/{}.png".format(column))
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Fill remaining nulls (the constants are the training-set means)
train_df['舒张压'].fillna(89.42, inplace=True)
test_df['舒张压'].fillna(89.42, inplace=True)
train_df['口服耐糖量测试'].fillna(5.95, inplace=True)
test_df['口服耐糖量测试'].fillna(5.95, inplace=True)
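The two constants above are the training-set means; a sketch that derives them instead of hard-coding (assuming the means are taken before any filling):
# Sketch: compute fill values from the training set rather than hard-coding
for col in ['舒张压', '口服耐糖量测试']:
    fill = train_df[col].mean()
    train_df[col].fillna(fill, inplace=True)
    test_df[col].fillna(fill, inplace=True)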
# Split into training and validation sets
features = [f for f in train_df.columns if f not in ['编号','出生年份','患有糖尿病标识']]
x_train, x_test, y_train, y_test = train_test_split(train_df[features],train_df['患有糖尿病标识'],test_size=0.2, random_state=2022)
# Build the models (these scaled pipelines are defined here; note the C
# sweep below re-fits plain LogisticRegression without the scaler)
lr_l1 = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(penalty="l1", C=1, solver="liblinear")
)
lr_l2 = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(penalty="l2", C=1, solver="liblinear")
)
# Accuracy on the training split
l1_train_predict = []
l2_train_predict = []
# Accuracy on the validation split
l1_test_predict = []
l2_test_predict = []
for c in np.linspace(0.01, 10, 100):
    lr_l1 = LogisticRegression(penalty="l1", C=c, solver="liblinear", max_iter=2000)
    lr_l2 = LogisticRegression(penalty="l2", C=c, solver="liblinear", max_iter=2000)
    # Fit and record the L1-regularized model on both splits
    lr_l1.fit(x_train, y_train)
    l1_train_predict.append(accuracy_score(y_train, lr_l1.predict(x_train)))
    l1_test_predict.append(accuracy_score(y_test, lr_l1.predict(x_test)))
    # Record the L2-regularized model
    lr_l2.fit(x_train, y_train)
    l2_train_predict.append(accuracy_score(y_train, lr_l2.predict(x_train)))
    l2_test_predict.append(accuracy_score(y_test, lr_l2.predict(x_test)))
data = [l1_train_predict, l2_train_predict, l1_test_predict, l2_test_predict]
label = ['l1_train', 'l2_train', 'l1_test', "l2_test"]
color = ['red', 'green', 'orange', 'blue']
plt.figure(figsize=(12, 6))
for i in range(4):
    plt.plot(np.linspace(0.01, 10, 100), data[i], label=label[i], color=color[i])
plt.legend(loc="best")
plt.show()
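GridSearchCV is imported above but never used; the same sweep could be written with it, something like (a sketch, cv=5 picked arbitrarily):
# Equivalent hyperparameter sweep via GridSearchCV (sketch)
param_grid = {'penalty': ['l1', 'l2'], 'C': np.linspace(0.01, 10, 100)}
gs = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=2000),
    param_grid, scoring='accuracy', cv=5, n_jobs=-1,
)
gs.fit(x_train, y_train)
print(gs.best_params_, gs.best_score_)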
# Take the best parameters and retrain on the full training set
best_model = LogisticRegression(penalty="l1", C=0.01, solver="liblinear", max_iter=2000)
best_model.fit(train_df[features], train_df['患有糖尿病标识'])
# Predict with the same feature list used for training (test_df still
# contains 出生年份, which the model never saw, so don't just drop 编号)
test_df['label'] = best_model.predict(test_df[features])
test_df.rename({'编号': 'uuid'}, axis=1)[['uuid', 'label']].to_csv('submit.csv', index=None)
Accuracy on the held-out validation split: 82.05%.
# Per-sex group means, and each sample's offset from its group mean
df_sex = train_df[['体重指数', '舒张压']].groupby(train_df['性别']).mean()
train_df['体重指数_diff'] = [train_df.loc[i, '体重指数'] - df_sex.loc[train_df.loc[i, '性别'], '体重指数'] for i in range(len(train_df))]
train_df['舒张压_diff'] = [train_df.loc[i, '舒张压'] - df_sex.loc[train_df.loc[i, '性别'], '舒张压'] for i in range(len(train_df))]
# Same idea, grouped by family history
df_family = train_df[['口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']].groupby(train_df['糖尿病家族史']).mean()
for column in ['口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']:
    train_df[column + '_diff'] = [train_df.loc[i, column] - df_family.loc[train_df.loc[i, '糖尿病家族史'], column] for i in range(len(train_df))]
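The row-wise list comprehensions above can be vectorized with groupby().transform(), which also makes it trivial to apply the same shift to test_df; a sketch:
# Vectorized equivalent of the loops above
for col in ['体重指数', '舒张压']:
    train_df[col + '_diff'] = train_df[col] - train_df.groupby('性别')[col].transform('mean')
for col in ['口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']:
    train_df[col + '_diff'] = train_df[col] - train_df.groupby('糖尿病家族史')[col].transform('mean')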
# Build the model
model = make_pipeline(
    MinMaxScaler(),
    DecisionTreeClassifier()
)
# Fit
model.fit(x_train, y_train)
# Inspect feature importances together with their column names
for name, importance in zip(x_train.columns,
                            model.named_steps['decisiontreeclassifier'].feature_importances_):
    print(name, importance)
The diff features were derived from ['体重指数', '舒张压', '口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']. Importances of the 10 features: [0.0030843, 0.31595439, 0.00559351, 0.11861977, 0.16319968, 0.08296783, 0.25307058, 0.03212695, 0.01597175, 0.00941124]
No obvious improvement. Has the model hit a bottleneck, where it can no longer exploit more complex features and parameters?
# Build the model
clf = lgb.LGBMClassifier(
    max_depth=3,
    n_estimators=4000,
    n_jobs=-1,
    verbose=-1,  # 'verbosity' is just an alias of this, so pass only one
    learning_rate=0.1,
)
# Train the model
clf.fit(x_train,y_train)
print(accuracy_score(y_test,clf.predict(x_test)))
test_df['label'] = clf.predict(test_df[features])
test_df.rename({'编号': 'uuid'}, axis=1)[['uuid', 'label']].to_csv('submit.csv', index=None)
# Cross-validated model training
def run_model_cv(model, kf, X_tr, y, X_te, cate_col=None):
    train_pred = np.zeros((len(X_tr), len(np.unique(y))))
    test_pred = np.zeros((len(X_te), len(np.unique(y))))
    cv_clf = []
    for tr_idx, val_idx in kf.split(X_tr, y):
        x_tr = X_tr.iloc[tr_idx]; y_tr = y.iloc[tr_idx]
        x_val = X_tr.iloc[val_idx]; y_val = y.iloc[val_idx]
        call_back = [
            lgb.early_stopping(50),
        ]
        eval_set = [(x_val, y_val)]
        # Recent LightGBM removed fit(verbose=...); logging is handled by callbacks
        model.fit(x_tr, y_tr, eval_set=eval_set, callbacks=call_back)
        # Caveat: fit() retrains the same estimator in place, so every entry
        # of cv_clf ends up referencing the final fold's model
        cv_clf.append(model)
        train_pred[val_idx] = model.predict_proba(x_val)
        test_pred += model.predict_proba(X_te)
    test_pred /= kf.n_splits
    return train_pred, test_pred, cv_clf
# Define the model. Note n_estimators and num_iterations are aliases in
# LightGBM, so passing both is redundant (LightGBM warns and keeps one);
# early stopping in run_model_cv decides the effective tree count anyway.
clf = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metrics='binary_logloss',
    learning_rate=0.01, n_estimators=94, max_depth=3, num_leaves=10,
    max_bin=175, min_data_in_leaf=101, bagging_fraction=0.6, bagging_freq=0,
    feature_fraction=0.8, lambda_l1=0.7, lambda_l2=0.7,
    min_split_gain=0.1, num_iterations=10000,
)
# Train with KFold
train_pred, test_pred, cv_clf = run_model_cv(
clf, KFold(n_splits=5),
x_train,
y_train,
x_test)
test_pred_1 = [i.argmax() for i in test_pred]
accuracy_score(y_test, test_pred_1)
# Train with StratifiedKFold
train_pred, test_pred, cv_clf = run_model_cv(
clf, StratifiedKFold(n_splits=5),
x_train,
y_train,
x_test)
test_pred_2 = [i.argmax() for i in test_pred]
accuracy_score(y_test, test_pred_2)
Tried KFold and StratifiedKFold with 3 to 5 folds; 4-fold StratifiedKFold worked best at 95.85% on the validation split, but it does not seem to beat the single LightGBM model.
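For reference, the best variant described above would look like this (a sketch; the 95.85% figure came from a run like it):
# Sketch of the 4-fold StratifiedKFold variant mentioned above
train_pred, test_pred, cv_clf = run_model_cv(
    clf, StratifiedKFold(n_splits=4), x_train, y_train, x_test)
print(accuracy_score(y_test, test_pred.argmax(axis=1)))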
Let's try stacking:
# First-level classifiers: random forest, extra trees, AdaBoost, gradient boosting, SVM
# Second level: LightGBM
train_df = pd.read_csv('data/比赛训练集.csv', encoding='gbk')
test_df = pd.read_csv('data/比赛测试集.csv', encoding='gbk')
train_df, test_df = do_nan(train_df, test_df)
# do_feature is not shown here; presumably it wraps the feature engineering
# above (family-history mapping, age, and the *_diff columns)
train_df, test_df = do_feature(train_df, test_df, False, False)
# Split into training and validation sets
features = [f for f in train_df.columns if f not in ['编号','出生年份','患有糖尿病标识']]
x_train, x_test, y_train, y_test = train_test_split(train_df[features],train_df['患有糖尿病标识'],test_size=0.2, random_state=2022)
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)
# Out-of-fold (OOF) cross-training for the base models
def get_oof(model, kf, X_tr, y, X_te, cate_col=None):
    train_pred = np.zeros((len(X_tr),))
    test_pred = np.zeros((len(X_te),))
    cv_clf = []
    for tr_idx, val_idx in kf.split(X_tr, y):
        x_tr = X_tr.iloc[tr_idx]; y_tr = y.iloc[tr_idx]
        x_val = X_tr.iloc[val_idx]; y_val = y.iloc[val_idx]
        model.fit(x_tr, y_tr)
        cv_clf.append(model)
        # OOF predictions for the held-out fold; test predictions averaged over folds
        train_pred[val_idx] = model.predict(x_val)
        test_pred += model.predict(X_te)
    test_pred /= kf.n_splits
    return train_pred, test_pred
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
'n_jobs': -1,
'n_estimators': 150,
'warm_start': True,
#'max_features': 0.2,
'max_depth': 5,
'min_samples_leaf': 4,
'max_features' : 'sqrt',
'verbose': 0
}
# Extra Trees Parameters
et_params = {
'n_jobs': -1,
'n_estimators':150,
#'max_features': 0.5,
'max_depth': 6,
'min_samples_leaf': 4,
'verbose': 0
}
# AdaBoost parameters
ada_params = {
'n_estimators': 150,
'learning_rate' : 0.25
}
# Gradient Boosting parameters
gb_params = {
'n_estimators': 150,
#'max_features': 0.2,
'max_depth': 6,
'min_samples_leaf': 4,
'verbose': 0
}
# Support Vector Classifier parameters
svc_params = {
'kernel' : 'linear',
'C' : 0.02
}
# Create 5 objects that represent our 5 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=2022, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=2022, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=2022, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=2022, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=2022, params=svc_params)
# Create our OOF train and test predictions. These base results will be used as new features
kf = KFold(n_splits=5)
# x_train = train_df[features]
# y_train = train_df['患有糖尿病标识']
# x_test = test_df[features]
et_oof_train, et_oof_test = get_oof(et,kf, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,kf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada,kf, x_train, y_train, x_test) # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb,kf,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,kf,x_train, y_train, x_test) # Support Vector Classifier
print("Training is complete")
# Training and validation data for the second level
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
'ExtraTrees': et_oof_train.ravel(),
'AdaBoost': ada_oof_train.ravel(),
'GradientBoost': gb_oof_train.ravel(),
'Svc':svc_oof_train.ravel()
})
base_predictions_test = pd.DataFrame( {'RandomForest': rf_oof_test.ravel(),
'ExtraTrees': et_oof_test.ravel(),
'AdaBoost': ada_oof_test.ravel(),
'GradientBoost': gb_oof_test.ravel(),
'Svc':svc_oof_test.ravel()
})
# Second-level model (n_estimators / num_iterations are aliases; see the note above)
clf = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metrics='binary_logloss',
    learning_rate=0.1, n_estimators=94, max_depth=3, num_leaves=10,
    max_bin=175, min_data_in_leaf=101, bagging_fraction=0.6, bagging_freq=0,
    feature_fraction=0.8, lambda_l1=0.7, lambda_l2=0.7,
    min_split_gain=0.1, num_iterations=5000,
)
clf.fit(base_predictions_train, y_train)
# f1_score expects (y_true, y_pred); F1 is not symmetric in its arguments
f1_score(y_test, clf.predict(base_predictions_test))
# test_df['label'] = clf.predict(base_predictions_test)
# test_df.rename({'编号': 'uuid'}, axis=1)[['uuid', 'label']].to_csv('submit.csv', index=None)
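As an aside, sklearn's StackingClassifier packages this whole OOF pipeline in one estimator; a minimal sketch reusing the parameter dicts above (cv=5 mirrors the manual KFold):
# Hedged sketch: the same two-level stack via sklearn's StackingClassifier.
# The param dicts already carry random_state=2022 from the SklearnHelper calls.
from sklearn.ensemble import StackingClassifier

stack = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(**rf_params)),
        ('et', ExtraTreesClassifier(**et_params)),
        ('ada', AdaBoostClassifier(**ada_params)),
        ('gb', GradientBoostingClassifier(**gb_params)),
        ('svc', SVC(**svc_params)),
    ],
    final_estimator=lgb.LGBMClassifier(),
    cv=5, n_jobs=-1,
)
stack.fit(x_train, y_train)
print(f1_score(y_test, stack.predict(x_test)))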