【任务六-模型融合】用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分
按照自己的理解
第一层:
使用交叉验证的划分方法,将训练集划分成5份,
使用第一个基分类器对划分之后得到的test进行预测,得到5个predict文件,维数 $(\frac{n}{5}, 1)$,纵向拼接得到1个Predict文件,维数 $(n, 1)$
使用第一个基分类器对整个Test进行预测,得到5个预测文件p_t,维数 $(r, 1)$,横向拼接后求平均值,得到1个Pt文件,维数 $(r, 1)$
使用第二个基分类器,
。。。
得到5个Predict文件,维数 $(n, 1)$;5个Pt文件,维数 $(r, 1)$
第二层
将第一层得到的5个Predict文件(维数 $(n, 1)$)横向拼接,再和训练集拼接,得到新的训练集Train,维数 $(n, m+5)$,
将第一层得到的5个Pt文件(维数 $(r, 1)$)横向拼接,再和测试集拼接,得到新的测试集Test,维数 $(r, m+5)$
对第二层的训练集进行训练,得到新的模型,
第三层:
使用新的模型对测试集进行预测
#!/usr/bin/env python 3.6
# -*- coding:utf-8 -*-
# @File : CV1.py
# @Date : 2018-11-22
# @Author : 黑桃
# @Software: PyCharm
from pandas import Series, DataFrame
import pickle
import pandas as pd
from sklearn.externals import joblib
from pandas import Series, DataFrame
from sklearn import svm
from sklearn.model_selection import * # 划分数据 交叉验证
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings("ignore")
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 读取数据
"""
# --- Load the pickled feature set produced upstream: a 4-tuple
#     (train design matrix, test design matrix, train labels, test labels). ---
print("0 读取特征")
f = open(path + 'feature/feature_V3.pkl', 'rb')
train, test, y_train, y_test = pickle.load(f)
f.close()
"""=====================================================================================================================
2 进行K次训练;用K个模型分别对测试集进行预测,并得到K个结果,再进行结果的融合
"""
"""=====================================================================================================================
3 交叉验证方式
"""
## Candidate cross-validation strategies; only the one assigned to `cv` below is used.
# NOTE(review): random_state has no effect on KFold without shuffle=True
# (newer scikit-learn versions raise on this combination) — confirm sklearn version.
kf = KFold(n_splits=5, random_state=1)
loo = LeaveOneOut()  # each test split holds one sample, train holds n-1
lpo = LeavePOut(p=2000)  # each test split holds p samples, train holds n-p
ss = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
tss = TimeSeriesSplit(n_splits=5)
logo = LeaveOneGroupOut()
lpgo = LeavePGroupsOut(n_groups=3)
gss = GroupShuffleSplit(n_splits=4, test_size=.5, random_state=0)
gkf = GroupKFold(n_splits=2)
"""【配置交叉验证方式】"""
# Selected scheme: plain 5-fold KFold (no shuffle, so fold order follows row order —
# this keeps the vertically stacked out-of-fold predictions aligned with the labels).
cv = kf
"""=====================================================================================================================
2 读取模型
"""
# --- Load the pre-trained base models persisted by earlier training scripts. ---
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23+;
# switch to `import joblib` when upgrading — confirm against the project's sklearn version.
print("1 读取模型")
SVM_linear = joblib.load( path + "model/model_file/SVM_linear.pkl")
SVM_poly = joblib.load( path + "model/model_file/SVM_poly.pkl")
SVM_rbf = joblib.load( path + "model/model_file/SVM_rbf.pkl")
SVM_sigmoid = joblib.load( path + "model/model_file/SVM_sigmoid.pkl")
lg_120 = joblib.load( path + "model/model_file/lg_120.pkl")
DT = joblib.load( path + "model/model_file/DT.pkl")
xgb_sklearn = joblib.load( path + "model/model_file/xgb_sklearn.pkl")
lgb_sklearn = joblib.load( path + "model/model_file/lgb_sklearn.pkl")
xgb = joblib.load( path + "model/model_file/xgb.pkl")
lgb = joblib.load( path + "model/model_file/lgb.pkl")
# The original labels do not start at index 0; reset them so that positional
# fold indices produced by the CV splitter line up with the label index.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
"""=====================================================================================================================
3 【第一层】用预测结果构建特征
"""
def get_feature(clf, train, y_train, test, y_test, cv):
    """Build first-level stacking features with one base classifier.

    For every CV fold the classifier is refit on the fold's training part and
    then predicts (a) the held-out validation part — the out-of-fold training
    feature — and (b) the full test set — one prediction per fold, averaged
    over the folds at the end.

    Parameters
    ----------
    clf : estimator with fit/predict
        Base classifier; refit in place on each fold.
    train : array-like of shape (n, m), positionally indexable
        Training design matrix.
    y_train : pandas Series-like of shape (n,)
        Training labels (index assumed reset to 0..n-1 — see caller).
    test : array-like of shape (r, m)
        Test design matrix.
    y_test : unused
        Kept only for signature compatibility with existing callers.
    cv : cross-validator with a ``split(X, y)`` method
        Fold generator; expected to partition all n rows.

    Returns
    -------
    (Test_i, Train_i) : (Series of length r, Series of length n)
        The fold-averaged test prediction and the vertically stacked
        out-of-fold predictions.
    """
    preds_Train = []  # one array of out-of-fold predictions per fold
    preds_Test = []   # one array of full-test predictions per fold
    for train_idx, vali_idx in cv.split(train, y_train):
        # Split this fold's training and validation parts.
        f_train_x = DataFrame(train[train_idx])
        f_train_y = DataFrame(y_train[train_idx])
        f_vali_x = DataFrame(train[vali_idx])
        # Refit the base classifier on the fold's training part.
        clf.fit(f_train_x, f_train_y)
        # Out-of-fold prediction becomes part of the new training feature;
        # the full-test prediction is averaged over folds below.
        preds_Train.append(clf.predict(f_vali_x))
        preds_Test.append(clf.predict(test))
    # One column per fold.  NOTE(review): if n is not divisible by the fold
    # count the shorter folds are NaN-padded by DataFrame — confirm upstream
    # that n % n_folds == 0.
    preds_Train = DataFrame(preds_Train).T
    preds_Test = DataFrame(preds_Test).T
    # Generalized: derive the fold count instead of hard-coding 5 columns.
    n_folds = preds_Train.shape[1]
    # Stack the out-of-fold predictions vertically into one (n, 1) feature.
    Train_i = pd.concat(objs=[preds_Train[c] for c in range(n_folds)], axis=0, sort=True)
    # BUGFIX: the test predictions must be *averaged* over the folds (per the
    # stacking recipe); the old code summed 5 fold predictions and divided by 3.
    Test_i = preds_Test.mean(axis=1)
    return Test_i, Train_i
"""=====================================================================================================================
4 【第二层】特征组合
"""
# For each base model, generate one out-of-fold training column and one
# fold-averaged test column, then append all of them to the original features.
first_level_test = []
first_level_train = []
for base_clf in (SVM_linear, lg_120, DT, SVM_rbf, lgb_sklearn):
    test_col, train_col = get_feature(base_clf, train, y_train, test, y_test, cv)
    first_level_test.append(test_col)
    first_level_train.append(train_col)
Train = pd.concat(objs=first_level_train, axis=1).reset_index(drop=True)
Test = pd.concat(objs=first_level_test, axis=1).astype(int).reset_index(drop=True)
train = DataFrame(train).reset_index(drop=True)
test = DataFrame(test).reset_index(drop=True)
Train = pd.concat(objs=[Train, train], axis=1)
Test = pd.concat(objs=[Test, test], axis=1)
"""=====================================================================================================================
【LGB_sklearn接口训练】
"""
# --- Second-level (meta) learner: LightGBM trained on the stacked features. ---
import lightgbm as lgbm
lgb_sklearn = lgbm.LGBMClassifier(learning_rate=0.1,
                                  max_bin=150,
                                  num_leaves=32,
                                  max_depth=11,
                                  reg_alpha=0.1,
                                  reg_lambda=0.2,
                                  # objective='multiclass',
                                  n_estimators=300,)
lgb_sklearn.fit(Train,y_train)
# y_lgb_pre = lgb_sklearn.predict(Test)
y_lgb_pre = lgb_sklearn.predict(Test)
print( "lgb_sklearn_Train_Score :{}".format(lgb_sklearn.score(Train, y_train)))
print("lgb_sklearn_Test_Score :{}".format(lgb_sklearn.score(Test, y_test)))
# print("lgb_sklearn_Train_AUC Score :{:.4f}".format(roc_auc_score(y_train, y_lgb_pre)))
# NOTE(review): roc_auc_score is fed hard class labels from predict();
# predict_proba would give the conventional probability-based AUC — confirm intent.
print("lgb_sklearn_Test_AUC Score :{}".format(roc_auc_score(y_test, y_lgb_pre)))
#!/usr/bin/env python 3.6
#-*- coding:utf-8 -*-
# @File : Stacking2.py
# @Date : 2018-11-25
# @Author : 黑桃
# @Software: PyCharm
from sklearn import datasets
import warnings
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
from sklearn.externals import joblib
warnings.filterwarnings("ignore")
# NOTE(review): `iris` is loaded but never used below — a leftover from the
# mlxtend example this script was adapted from; safe to delete.
iris = datasets.load_iris()
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 读取特征
"""
# --- Load the same pickled 4-tuple of features/labels used by the first script. ---
print("0 读取特征")
f = open(path + 'feature/feature_V3.pkl', 'rb')
train, test, y_train,y_test= pickle.load(f)
f.close()
# X, y are what cross_val_score consumes below.
X, y = train, y_train
"""=====================================================================================================================
2 读取模型
"""
# --- Load the pre-trained base models (library-based StackingClassifier variant). ---
# NOTE(review): sklearn.externals.joblib is removed in scikit-learn 0.23+ — see first script.
print("1 读取模型")
SVM_linear = joblib.load( path + "model/model_file/SVM_linear.pkl")
SVM_poly = joblib.load( path + "model/model_file/SVM_poly.pkl")
SVM_rbf = joblib.load( path + "model/model_file/SVM_rbf.pkl")
SVM_sigmoid = joblib.load( path + "model/model_file/SVM_sigmoid.pkl")
lg_120 = joblib.load( path + "model/model_file/lg_120.pkl")
DT = joblib.load( path + "model/model_file/DT.pkl")
xgb_sklearn = joblib.load( path + "model/model_file/xgb_sklearn.pkl")
lgb_sklearn = joblib.load( path + "model/model_file/lgb_sklearn.pkl")
xgb = joblib.load( path + "model/model_file/xgb.pkl")
lgb = joblib.load( path + "model/model_file/lgb.pkl")
# Base classifiers for the stack.
clf1 =SVM_linear
clf2 = lg_120
clf3 = DT
clf4 = SVM_rbf
clf5 = lgb_sklearn
# NOTE(review): `lr` is created but never used — the meta classifier below is lgb_sklearn.
lr = LogisticRegression()
# NOTE(review): lgb_sklearn serves as both a base learner (clf5) and the meta
# classifier; that is unusual for stacking and likely explains the identical
# lgb_sklearn / StackingClassifier scores reported in the results — confirm.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3,clf4,clf5],meta_classifier=lgb_sklearn)
print('5-fold cross validation:\n')
# Report 5-fold CV accuracy for each base model and for the stacked ensemble.
for clf, label in zip([clf1, clf2, clf3,clf4,clf5 , sclf],
                      ['SVM_linear','lg_120','DT','SVM_rbf','lgb_sklearn','StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y,cv=5, scoring='accuracy')
    print("Accuracy: %s (+/- %0.9f) [%s]"
          % (scores.mean(), scores.std(), label))
自己的代码结果:
一级分类器 | 二级lgb | Stacking之前 | Stacking之后
---|---|---|---
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn | LGB_sklearn接口(predict) AUC Score | 0.7951391197086869 | 0.78980256597753 |
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn | LGB_sklearn接口(proba) AUC Score | 0.6481179876945349 | 0.6372122138757783 |
调包实现的结果:
Accuracy: 0.7845685143591137 (+/- 0.008116008) [SVM_linear]
Accuracy: 0.7946686730541058 (+/- 0.008620505) [lg_120]
Accuracy: 0.7671842760458581 (+/- 0.017846894) [DT]
Accuracy: 0.7514728483069482 (+/- 0.000409207) [SVM_rbf]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [lgb_sklearn]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [StackingClassifier]
推荐|Kaggle机器学习之模型融合(stacking)心得
StackingClassifier