数据挖掘-Datawhale学习笔记-04-模型融合

Task5模型融合:https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.15.1cd8593aDCDfxr&postId=95535

常见的模型融合方式有 blending、bagging、加权融合与 stacking。

stacking模型融合

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacking ensemble regressor.

    Each base model is cloned and trained on K-1 folds; its predictions on
    the held-out fold become "out-of-fold" meta-features. The meta-model is
    then fit on those meta-features, so it learns to combine the base models
    without seeing predictions made on data the base models trained on.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted base regressors; each is cloned once per fold.
    meta_model : estimator
        Unfitted regressor trained on the out-of-fold prediction matrix.
    n_folds : int, default=10
        Number of cross-validation folds used to build the meta-features.
    """

    def __init__(self, base_models, meta_model, n_folds=10):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit fold-wise clones of the base models, then fit the meta-model.

        Returns ``self`` to allow chaining. Note: indexes X/y with
        ``X[train_index]`` — assumes numpy-array-style inputs (TODO confirm
        callers pass ``.values``, as the plotting call below does).
        """
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # One column per base model, one row per sample: the meta-model's
        # training matrix, filled only with held-out-fold predictions.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict by averaging each base model's fold-clones, then applying
        the meta-model to the resulting meta-feature matrix.
        """
        # NOTE: the original source indented this method with tabs, which is
        # a SyntaxError under Python 3's tab/space rules — normalized here.
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

# Stack LightGBM, gradient boosting and ridge as base learners, with lasso
# as the meta-model (model_lgb / GBoost / ridge / lasso are defined elsewhere).
stacked_averaged_models = StackingAveragedModels(base_models = (model_lgb, GBoost, ridge),
                                                 meta_model = lasso)

绘制学习曲线判断是否过拟合与欠拟合:

from sklearn.model_selection import learning_curve #加载学习曲线
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=5,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot a learning curve (train vs. validation score against sample count)
    to diagnose over-/under-fitting.

    Parameters
    ----------
    estimator : estimator
        Unfitted model implementing fit/predict (scikit-learn API).
    title : str
        Figure title.
    X, y : array-like
        Training features and target.
    ylim : tuple of (float, float), optional
        y-axis limits, e.g. ``(0.7, 1.01)``.
    cv : int, default=5
        Number of KFold splits. (The original default of ``None`` crashed,
        since ``KFold`` requires an integer ``n_splits`` >= 2.)
    n_jobs : int, default=1
        Parallel jobs passed to ``learning_curve``.
    train_sizes : array-like, default=np.linspace(.1, 1.0, 10)
        Relative sizes of the training subsets.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("训练样本数")
    plt.ylabel("得分")
    # Shuffled KFold with a fixed seed so repeated plots are comparable.
    kf = KFold(cv, shuffle=True, random_state=42)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=kf, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded bands show +/- one standard deviation across folds.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Validation score")
    plt.legend(loc="best")
    return plt

# Draw the learning curve for the stacked model; train_input / train_output
# are the training DataFrames defined elsewhere (.values yields numpy arrays,
# which the fit method's integer indexing requires).
plot_learning_curve(estimator=stacked_averaged_models, title= "学习曲线 (stacked_averaged_models算法模型)", 
                    X=train_input.values, y=train_output.values, cv=10, n_jobs=1)                        

你可能感兴趣的:(Datawhale学习,数据挖掘)