模型融合整理--------投票、stacking、blending

 

 

from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
#from sklearn.cross_validation  import StratifiedKFold
'''
Voting即投票机制,分为软投票和硬投票两种,其原理采用少数服从多数的思想。

硬投票:对多个模型直接进行投票,最终投票数最多的类为最终被预测的类。

软投票:与硬投票不同,软投票对各模型输出的类别概率取(加权)平均,选平均概率最大的类为最终预测类;可以为不同模型设置不同权重,进而区别模型不同的重要度。

备注:此方法用于解决分类问题。
'''
# ---- Hard voting on the iris dataset ----
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# BUG FIX: iris has 3 classes, so XGBoost needs a multi-class objective;
# the original 'binary:logistic' is invalid for 3-class labels.
clf1 = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=1, min_child_weight=2, gamma=0, subsample=0.7,
                     colsample_bytree=0.6, objective='multi:softprob', nthread=4, scale_pos_weight=1)
clf2 = RandomForestClassifier(n_estimators=50, max_depth=1, min_samples_split=4,
                              min_samples_leaf=54, oob_score=True)
clf3 = SVC(C=0.1, probability=True)
# Hard voting: every base model casts one vote; the majority class wins.
eclf = VotingClassifier(estimators=[('xgb', clf1), ('rf', clf2), ('svc', clf3)], voting='hard')
# Report 5-fold CV accuracy of each base model and of the ensemble.
for clf, label in zip([clf1, clf2, clf3, eclf], ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# ---- Soft voting on the iris dataset ----
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# BUG FIX: 'multi:softprob' replaces the invalid 'binary:logistic' for 3 classes.
clf1 = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=1, min_child_weight=2, gamma=0, subsample=0.7,
                     colsample_bytree=0.6, objective='multi:softprob', nthread=4, scale_pos_weight=1)
clf2 = RandomForestClassifier(n_estimators=50, max_depth=1, min_samples_split=4,
                              min_samples_leaf=54, oob_score=True)
clf3 = SVC(C=0.1, probability=True)
# Soft voting: average the predicted class probabilities, weighting xgb twice
# as much as the other two models.
eclf = VotingClassifier(estimators=[('xgb', clf1), ('rf', clf2), ('svc', clf3)], voting='soft', weights=[2, 1, 1])
# BUG FIX: the soft-voting ensemble was built but never fitted or evaluated
# (only clf1 was trained) — fit it and report held-out accuracy.
eclf.fit(x_train, y_train)
print("Soft voting accuracy: %0.2f" % accuracy_score(y_test, eclf.predict(x_test)))
clf1.fit(x_train, y_train)


'''
stacking是一种分层模型集成框架。
以两层为例,第一层由多个基学习器组成,其输入为原始训练集,第二层的模型则是以第一层基学习器的输出作为训练集进行再训练,从而得到完整的stacking模型。
stacking两层模型都使用了全部的训练数据。
第一层模型:首先数据有训练集和测试集两部分
1.对训练集进行五折交叉验证,把训练集划分为A,B两部分
2.对A部分进行训练,对B部分进行预测,得到a1,五折后则为a1,a2,a3,a4,a5,对他们合并,形成n行一列的数据
3.对测试集进行预测,会得到b1,b2,b3,b4,b5,将各部分相加取平均得到m行一列的数据
4.以上是一个模型,如果有三个模型,则可以得到A1,A2,A3,B1,B2,B3
5.在此之后,我们把A1,A2,A3并列合并得到一个n行三列的矩阵作为training data,B1,B2,B3并列合并得到一个m行三列的矩阵作为testing data。让下一层的模型,基于他们进一步训练。
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
import pandas as pd
import os

# ---- Build the training data for stacking ----
# Synthetic fallback dataset (also used when the private CSV is unavailable).
data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)
print(data[:10], target[:10], type(data), type(target))

# BUG FIX: use a raw string for the Windows path — the original plain literal
# relied on invalid escape sequences like '\w' (SyntaxWarning on Python 3.12+).
csv_path = r'D:\workplace\GEO\K12\shaoeryingyu\edu_shaoer_TRANSWOE.csv'
target_name = 'edu_shaoeryingyu_model_0_index_all_1_month.call_result'  # 指定目标变量 / target column
selec0 = ['tag_102_days_w1', 'tag_117_days_w1', 'tag_107_days_w1', 'tag_11302_flag_m1', 'tag_105_days_w1','tag_105_cnt_w1', 'tag_1130401_flag_m1', 'tag_11304_days_m1', 'tag_1130406_days_m1', 'tag_1130401_days_w2','all_tag_level2_num_m1', 'all_tag_level2_num_w1', 'tag_108_days_w1', 'tag_104_max_lastdays_w1',
          'tag_level3_num_w2', 'tag_level3_num_m1', 'tag_1130200_flag_w2', 'tag_1130401_flag_w2','tag_level3_num_w1', 'tag_1130401_days_w1', 'tag_105_max_lastdays_w1', 'tag_level2_num_w1', 'tag_level2_num_m1', 'tag_1130200_cnt_m1', 'tag_11302_pct_w2', 'tag_102_max_lastdays_w1', 'all_tag_level2_num_w2', 'tag_11302_pct_m1', 'tag_11302_cnt_m1', 'tag_117_max_lastdays_w1']
selec1 = ['all_tag_level2_num_w1', 'tag_level3_num_w1', 'tag_level2_num_w2', 'all_tag_level2_num_w2', 'tag_1130406_pct_w2', 'tag_1130200_flag_w2', 'tag_11304_max_lastdays_w2', 'tag_level2_num_m1', 'all_tag_level2_num_m1', 'tag_11302_days_m1', 'tag_11304_days_m1', 'tag_level3_num_m1', 'tag_1130401_days_m1', 'tag_1130401_flag_m1', 'tag_11302_to_days_m1', 'tag_11302_max_lastdays_m1', 'tag_11304_max_lastdays_m1', 'tag_102_cnt_w1', 'tag_117_days_w1', 'tag_105_days_w1', 'tag_108_days_w1', 'tag_107_max_lastdays_w1']

# BUG FIX: the hard-coded private CSV crashed the script on any other machine;
# only override the synthetic data when the file actually exists.
if os.path.exists(csv_path):
    datas = pd.read_csv(csv_path, index_col=0)
    target = np.array(datas.loc[:, target_name])  # y
    datas = datas.iloc[:, 2:]  # x: drop the two leading id/label columns
    # Strip the verbose Hive-style table prefix from every column name.
    colum = [c.replace('edu_shaoeryingyu_model_0_index_all_1_month.', '') for c in datas.columns]
    datas.columns = colum
    datas.drop(selec1, axis=1, inplace=True)  # drop pre-selected redundant features
    data = np.array(datas.fillna(0))
# Base (level-1) models of the stacking ensemble.
clfs = [LogisticRegression(),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

# Hold out a test set.
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)

# Out-of-fold predictions (meta-features for training) and test-set meta-features,
# one column per base model.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))
print(len(clfs))

# 5-fold stacking.
# BUG FIX: the original `skf = skf.get_n_splits(X, y)` rebound skf to the
# integer 5, making the fold loop below raise TypeError. Materialize the
# folds as a list so they can be re-iterated once per base model.
n_folds = 5
skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))
print('skf', len(skf), 'folds')

for j, clf in enumerate(clfs):
    # Train this base model fold-by-fold.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds))
    for i, (train, test) in enumerate(skf):
        # Fit on the other folds, predict on fold i: its predictions become
        # the out-of-fold meta-feature for those training rows.
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # Test-set meta-feature: average of the k per-fold predictions.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

# Level-2 (meta) model trained on the out-of-fold meta-features.
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))


'''
Blending
Blending是一种模型融合方法,对于一般的Blending,主要思路是把原始的训练集先分成两部分,比如70%的数据作为新的训练集,剩下30%的数据作为测试集。第一层我们在这70%的数据上训练多个模型,然后去预测那30%数据的label。在第二层里,我们就直接用这30%数据在第一层预测的结果作为新特征继续训练即可。

Blending的优点在于:
1.比stacking简单(因为不用进行k次的交叉验证来获得stacker feature)
2.避开了一个信息泄露问题:generlizers和stacker使用了不一样的数据集
而缺点在于:
1.使用了很少的数据(第二阶段的blender只使用了训练集中预留出的那一小部分数据,如上例中的30%)
2.blender可能会过拟合
3.stacking使用多次的交叉验证会比较稳健
对于实践中的结果而言,stacking和blending的效果是差不多的,所以使用哪种方法都没什么所谓,完全取决于个人爱好。

'''

# ---- Blending ----
# (Reuses `data`/`target` built above; the make_blobs fallback is kept
# commented for reference.)
#data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)

# Base (level-1) models of the blending ensemble.
clfs = [LogisticRegression(),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

# Hold out a test set.
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)

# BUG FIX: the original `skf = list(StratifiedKFold(y, n_folds))` used the
# removed sklearn.cross_validation signature (TypeError on modern sklearn),
# and the folds were never used — blending needs no cross-validation, so the
# dead broken lines are dropped.

# Split the training data into two halves d1 (base-model training) and
# d2 (meta-feature generation).
X_d1, X_d2, y_d1, y_d2 = train_test_split(X, y, test_size=0.5, random_state=2017)
dataset_d1 = np.zeros((X_d2.shape[0], len(clfs)))
dataset_d2 = np.zeros((X_predict.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    # Train each base model on d1, then use its probability predictions on d2
    # and on the held-out test set as one meta-feature column each.
    clf.fit(X_d1, y_d1)
    y_submission = clf.predict_proba(X_d2)[:, 1]
    dataset_d1[:, j] = y_submission
    dataset_d2[:, j] = clf.predict_proba(X_predict)[:, 1]
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_d2[:, j]))

# Meta model (blender) trained on d2's meta-features.
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(dataset_d1, y_d2)
y_submission = clf.predict_proba(dataset_d2)[:, 1]

print("Linear stretch of predictions to [0,1]")
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))

 

 

 

你可能感兴趣的:(程序员,python)