I recently wanted to try stacking, searched around a lot, and am writing down what I learned.
A good reference is:
English: https://mlwave.com/kaggle-ensembling-guide/
Chinese translation: https://blog.csdn.net/a358463121/article/details/53054686
When I first searched blindly, I could not tell stacking and blending apart. It turns out many people use the two names interchangeably, so don't get hung up on the terminology. What matters is the mechanics. Stacking uses k-fold cross-validation to produce the meta-model's features: each base model yields one out-of-fold prediction column over the entire training set, and that column becomes one input feature for the second-level model. Blending uses a holdout split instead: train the base model on, say, 80% of the data, and feed only its predictions on the remaining 20% to the second-level model. Because blending keeps the data used to build the meta-features separate from the data the second level generalizes on, while stacking reuses the whole training set, stacking carries a risk of data leakage and overfitting. (See the sketch after the links below.)
Have a look at this link:
https://github.com/emanuele/kaggle_pbr/blob/master/blend.py
It says blend, but it looks more like stacking. Whatever; it works, and as long as you know what it actually does, the name doesn't matter. Worth bookmarking; it comes in handy.
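To make the distinction concrete, here is a minimal sketch using scikit-learn (the data, the models, and all names are my own illustration, not from the guide above):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, train_test_split

X, y = make_regression(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

base = RandomForestRegressor(n_estimators=50, random_state=0)

# Stacking: out-of-fold predictions over the WHOLE training set become one
# meta-feature column; every training row is predicted by a model that never
# saw it during that fold.
stack_feature = cross_val_predict(base, X_train, y_train, cv=5)  # shape (len(X_train),)

# Blending: hold out e.g. 20% of the training data; the base model is fit on
# the other 80%, and only the holdout predictions feed the meta-model.
X_fit, X_hold, y_fit, y_hold = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
blend_feature = base.fit(X_fit, y_fit).predict(X_hold)  # shape (len(X_hold),)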
Below I introduce a rather nice library, heamy.
Official demos: https://github.com/rushter/heamy/tree/master/examples
There is a fairly clear example on CSDN, though I find parts of it ambiguous: https://blog.csdn.net/data_scientist/article/details/79036382
Here is an overview of the main methods.
Methods in estimator.py (note that they all return Datasets):
def stack(self, k=5, stratify=False, shuffle=True, seed=100, full_test=True):
    """Stack a single model. You should rarely be using this method. Use `ModelsPipeline.stack` instead.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 100
    full_test : bool, default True
        If `True` then evaluate test dataset on the full data otherwise take the mean of every fold.

    Returns
    -------
    `Dataset` with out of fold predictions.
    """
    train = None
    test = []

    # Mainly to save memory: decide whether to use the cache. The source
    # implements its own caching rules, i.e. when to write and read files.
    if self.use_cache:
        pdict = {'k': k, 'stratify': stratify, 'shuffle': shuffle, 'seed': seed, 'full_test': full_test}
        dhash = self._dhash(pdict)
        c = Cache(dhash, prefix='s')
        if c.available:
            logger.info('Loading %s\'s stack results from cache.' % self._name)
            train = c.retrieve('train')
            test = c.retrieve('test')
            y_train = c.retrieve('y_train')
            return Dataset(X_train=train, y_train=y_train, X_test=test)
        elif not self.dataset.loaded:
            self.dataset.load()

    # This is where stack differs from blend: stack uses k-fold CV.
    for i, fold in enumerate(self.dataset.kfold(k, stratify=stratify, seed=seed, shuffle=shuffle)):
        X_train, y_train, X_test, y_test, train_index, test_index = fold
        logger.info('Calculating %s\'s fold #%s' % (self._name, i + 1))
        if full_test:
            prediction = reshape_1d(self._predict(X_train, y_train, X_test, y_test))
        else:
            xt_shape = X_test.shape[0]
            # Concatenate the out-of-fold data with the final test data so both
            # can be predicted in one call.
            x_t = concat(X_test, self.dataset.X_test)
            prediction_concat = reshape_1d(self._predict(X_train, y_train, x_t))
            prediction, prediction_test = tsplit(prediction_concat, xt_shape)
            test.append(prediction_test)

        if train is None:
            train = np.zeros((self.dataset.X_train.shape[0], prediction.shape[1]))
        train[test_index] = prediction

    if full_test:
        logger.info('Calculating %s\'s test data' % self._name)
        test = self._predict(self.dataset.X_train, self.dataset.y_train, self.dataset.X_test)
    else:
        test = np.mean(test, axis=0)

    test = reshape_1d(test)

    if self.use_cache:
        c.store('train', train)
        c.store('test', test)
        c.store('y_train', self.dataset.y_train)

    # The returned dataset holds one base model's predictions for the full
    # training set and the test data -- effectively a single column. One base
    # model yields one feature for the second level; the same logic applies at
    # level n. (See the usage sketch right after this method.)
    return Dataset(X_train=train, y_train=self.dataset.y_train, X_test=test)
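A quick usage sketch of the method above (the toy data and parameters are my own; Dataset, Regressor, and stack are heamy's):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from heamy.dataset import Dataset
from heamy.estimator import Regressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
dataset = Dataset(X_train=X[:150], y_train=y[:150], X_test=X[150:])

model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 30}, name='rf')
ds = model_rf.stack(k=5, seed=111)
print(ds.X_train.shape)  # (150, 1): one out-of-fold meta-feature column
print(ds.X_test.shape)   # (50, 1): test predictions (full_test=True refits on all data)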
def blend(self, proportion=0.2, stratify=False, seed=100, indices=None):
    """Blend a single model. You should rarely be using this method. Use `ModelsPipeline.blend` instead.

    Parameters
    ----------
    proportion : float, default 0.2
        Test size holdout.
    stratify : bool, default False
    seed : int, default 100
    indices : list(np.ndarray,np.ndarray), default None
        Two numpy arrays that contain indices for train/test slicing. (train_index, test_index)

    Returns
    -------
    `Dataset`
    """
    if self.use_cache:
        pdict = {'proportion': proportion, 'stratify': stratify, 'seed': seed, 'indices': indices}
        if indices is not None:
            pdict['train_index'] = np_hash(indices[0])
            pdict['test_index'] = np_hash(indices[1])
        dhash = self._dhash(pdict)
        c = Cache(dhash, prefix='b')
        if c.available:
            logger.info('Loading %s\'s blend results from cache.' % self._name)
            train = c.retrieve('train')
            test = c.retrieve('test')
            y_train = c.retrieve('y_train')
            return Dataset(X_train=train, y_train=y_train, X_test=test)
        elif not self.dataset.loaded:
            self.dataset.load()

    # Holdout: train on one part of the data, and use the predictions on the
    # other part as second-level input. Again, one base model contributes one
    # feature to the second level.
    X_train, y_train, X_test, y_test = self.dataset.split(test_size=proportion, stratify=stratify,
                                                          seed=seed, indices=indices)
    xt_shape = X_test.shape[0]
    x_t = concat(X_test, self.dataset.X_test)
    prediction_concat = reshape_1d(self._predict(X_train, y_train, x_t))
    new_train, new_test = tsplit(prediction_concat, xt_shape)

    if self.use_cache:
        c.store('train', new_train)
        c.store('test', new_test)
        c.store('y_train', y_test)

    return Dataset(new_train, y_test, new_test)
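Continuing the same toy setup as in the stack sketch, blend returns a much smaller training set, since only the holdout rows produce meta-features:

ds_blend = model_rf.blend(proportion=0.2, seed=111)
print(ds_blend.X_train.shape)  # (30, 1): predictions on the 20% holdout of 150 rows
print(ds_blend.X_test.shape)   # (50, 1): predictions for the real test rows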
Methods in pipeline.py (note that they also return Datasets):
def stack(self, k=5, stratify=False, shuffle=True, seed=100, full_test=True, add_diff=False):
    """Stacks sequence of models.

    Parameters
    ----------
    k : int, default 5
        Number of folds.
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 100
    full_test : bool, default True
        If True then evaluate test dataset on the full data otherwise take the mean of every fold.
    add_diff : bool, default False

    Returns
    -------
    `Dataset`

    Examples
    --------
    >>> pipeline = ModelsPipeline(model_rf, model_lr)
    >>> stack_ds = pipeline.stack(k=10, seed=111)
    """
    result_train = []
    result_test = []
    y = None

    for model in self.models:
        result = model.stack(k=k, stratify=stratify, shuffle=shuffle, seed=seed, full_test=full_test)
        # train_df is a single model's predictions -- one column used as a
        # feature to train the second-level model (see the sketch after this method).
        train_df = pd.DataFrame(result.X_train, columns=generate_columns(result.X_train, model.name))
        test_df = pd.DataFrame(result.X_test, columns=generate_columns(result.X_test, model.name))
        result_train.append(train_df)
        result_test.append(test_df)
        if y is None:
            y = result.y_train

    # result_train is the second-level training set (the target is unchanged);
    # result_test is the corresponding second-level input for the test data.
    result_train = pd.concat(result_train, axis=1)
    result_test = pd.concat(result_test, axis=1)

    if add_diff:
        result_train = feature_combiner(result_train)
        result_test = feature_combiner(result_test)

    ds = Dataset(X_train=result_train, y_train=y, X_test=result_test)
    return ds
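Reusing the toy dataset and model_rf from the earlier sketch (model_lr is my own addition), one meta-feature column per base model shows up directly:

from sklearn.linear_model import LinearRegression
from heamy.pipeline import ModelsPipeline

model_lr = Regressor(dataset=dataset, estimator=LinearRegression, name='lr')
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=5, seed=111)
print(stack_ds.X_train.shape)  # (150, 2): one column per base model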
def blend(self, proportion=0.2, stratify=False, seed=100, indices=None, add_diff=False):
    """Blends sequence of models.

    Parameters
    ----------
    proportion : float, default 0.2
    stratify : bool, default False
    seed : int, default 100
    indices : list(np.ndarray,np.ndarray), default None
        Two numpy arrays that contain indices for train/test slicing,
        i.e. whether to specify the split by index directly; if None,
        the split is random.
    add_diff : bool, default False

    Returns
    -------
    `Dataset`

    Examples
    --------
    >>> pipeline = ModelsPipeline(model_rf, model_lr)
    >>> pipeline.blend(seed=15)

    >>> # Custom indices
    >>> train_index = np.array(range(250))
    >>> test_index = np.array(range(250, 333))
    >>> res = model_rf.blend(indices=(train_index, test_index))
    """
    result_train = []
    result_test = []
    y = None

    for model in self.models:
        result = model.blend(proportion=proportion, stratify=stratify, seed=seed, indices=indices)
        train_df = pd.DataFrame(result.X_train, columns=generate_columns(result.X_train, model.name))
        test_df = pd.DataFrame(result.X_test, columns=generate_columns(result.X_test, model.name))
        result_train.append(train_df)
        result_test.append(test_df)
        if y is None:
            y = result.y_train

    result_train = pd.concat(result_train, axis=1, ignore_index=True)
    result_test = pd.concat(result_test, axis=1, ignore_index=True)

    if add_diff:
        # Take the len(models) first-level features, pair them up, and add the
        # pairwise differences as new features (see the sketch after this
        # method). I think this is meant to expose the differences between
        # models: as Zhou Zhihua's book notes, the more the base models differ,
        # the better the bagged ensemble usually performs and the lower the
        # variance -- just think of the variance formula.
        result_train = feature_combiner(result_train)
        result_test = feature_combiner(result_test)

    return Dataset(X_train=result_train, y_train=y, X_test=result_test)
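I did not find documentation for feature_combiner beyond the source, but going by the comment above, the idea is pairwise differences. A minimal re-implementation sketch of that idea (mine, not heamy's code):

from itertools import combinations
import pandas as pd

def pairwise_diffs(df):
    """For every pair of first-level prediction columns, add their difference
    as a new feature; disagreement between base models carries information."""
    out = df.copy()
    for a, b in combinations(df.columns, 2):
        out['%s-%s' % (a, b)] = df[a] - df[b]
    return out

preds = pd.DataFrame({'rf': [0.1, 0.5], 'lr': [0.2, 0.4], 'knn': [0.3, 0.3]})
print(pairwise_diffs(preds))  # adds rf-lr, rf-knn, lr-knn columns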
The methods in the two files above produce the second-level model's input. As for how to combine these base-model predictions: typically, blending and stacking use LR as the second-level model; the alternatives are a weighted average (finding the best weights is covered below), a plain mean, or the max. See https://blog.csdn.net/Gin077/article/details/84344398
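A sketch of these options in heamy terms, assuming pipeline and stack_ds as built in the example below; the weight values are placeholders, and mean()/weight() are heamy's pipeline API as I understand it:

# Second level via LR on the stacked meta-features:
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression, name='lr_meta')
stacker.validate(mean_absolute_error, 10)

# Or skip the second-level model and combine base predictions directly:
pipeline.mean().validate(mean_absolute_error, 10)                    # plain average
pipeline.weight([0.5, 0.3, 0.2]).validate(mean_absolute_error, 10)   # fixed weights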
--------------------------------- Weighted average -------------------------------------------------
Below is a three-level stacking model. The third level uses find_weights to obtain the optimal weight for each model in the second-level pipeline (the weights are applied to those models' predictions). Reading the source, you can also specify the optimization method used to search for the weights.
# Imports needed to run the example (`dataset` is assumed to be a heamy
# Dataset built from your own X_train / y_train / X_test):
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

# 1st level
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15}, name='knn')
pipeline = ModelsPipeline(model_rf, model_lr, model_knn)
stack_ds = pipeline.stack(k=5, seed=111)

# 2nd level
stack_rf = Regressor(dataset=stack_ds, estimator=RandomForestRegressor, parameters={'n_estimators': 15}, name='rf')
stack_lr = Regressor(dataset=stack_ds, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
stack_pipeline = ModelsPipeline(stack_rf, stack_lr)

# 3rd level
weights = stack_pipeline.find_weights(mean_absolute_error)
result = stack_pipeline.weight(weights).validate(mean_absolute_error, 10)
Below is the class used to find the optimal weights:
# coding:utf-8
from scipy.optimize import minimize


class Optimizer(object):
    def __init__(self, models, scorer, test_size=0.2):
        self.test_size = test_size
        self.scorer = scorer
        self.models = models
        self.predictions = []
        self.y = None
        self._predict()

    def _predict(self):
        # Validate each model once on the same holdout to collect its predictions.
        for model in self.models:
            y_true_list, y_pred_list = model.validate(k=1, test_size=self.test_size)
            if self.y is None:
                self.y = y_true_list[0]
            self.predictions.append(y_pred_list[0])

    def loss_func(self, weights):
        # Score the weighted sum of the models' holdout predictions.
        final_prediction = 0
        for weight, prediction in zip(weights, self.predictions):
            final_prediction += weight * prediction
        return self.scorer(self.y, final_prediction)

    def minimize(self, method):
        starting_values = [0.5] * len(self.predictions)
        # Weights are bounded to [0, 1] and constrained to sum to 1.
        cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
        bounds = [(0, 1)] * len(self.predictions)
        # Call scipy's minimize; just supply the loss function and the method.
        res = minimize(self.loss_func, starting_values, method=method, bounds=bounds, constraints=cons)
        print('Best Score (%s): %s' % (self.scorer.__name__, res['fun']))
        print('Best Weights: %s' % res['x'])
        return res['x']
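As I read the source, ModelsPipeline.find_weights simply builds an Optimizer over its models and calls minimize; SLSQP is the natural method choice because scipy's minimize only honors both the bounds and the sum-to-one equality constraint with solvers like SLSQP. A usage sketch, reusing stack_pipeline and mean_absolute_error from the example above:

opt = Optimizer(stack_pipeline.models, scorer=mean_absolute_error, test_size=0.2)
weights = opt.minimize('SLSQP')  # prints the best score and weights, then returns the weights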
Once the final weights are obtained, the prediction result follows:
result = stack_pipeline.weight(weights).validate(mean_absolute_error, 10)
--------------------------------- Weighted average -------------------------------------------------