Ensemble methods train many base learners and then combine them for classification or regression; the final prediction is the result that receives the largest share of the base learners' votes (majority voting).
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
# load the data
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# three base learners
log_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = SVC()
# 投票分类器
voting_clf = VotingClassifier( estimators=[("lr", log_clf), ("rf", rf_clf), ("svc", svm_clf)], voting="hard" )
# voting_clf.fit( X_train, y_train )
for clf in ( log_clf, rf_clf, svm_clf, voting_clf ):
    clf.fit( X_train, y_train )
    y_pred = clf.predict( X_test )
    print( clf.__class__.__name__, accuracy_score(y_test, y_pred) )
LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.896
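The voting classifier above uses hard voting (a majority vote over the predicted labels). Soft voting averages the predicted class probabilities instead and often does slightly better when the base learners are reasonably well calibrated. A minimal sketch, reusing the training split above; note that SVC needs probability=True before it exposes predict_proba:
# soft voting averages predict_proba across the base learners
soft_voting_clf = VotingClassifier(
    estimators=[("lr", LogisticRegression()),
                ("rf", RandomForestClassifier()),
                ("svc", SVC(probability=True))],   # probability=True enables predict_proba
    voting="soft" )
soft_voting_clf.fit( X_train, y_train )
print( accuracy_score(y_test, soft_voting_clf.predict(X_test)) )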
Both bagging and pasting allow training instances to be sampled several times across multiple predictors, but only bagging allows training instances to be sampled several times for the same predictor.
Feature sampling is configured through the max_features and bootstrap_features parameters. It is particularly useful for training data with high-dimensional feature inputs, since it reduces model complexity and the risk of overfitting. Sampling both the training instances and the features is called the Random Patches method; keeping all instances and sampling only the features is called the Random Subspaces method, as sketched below.
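A rough sketch of how the two schemes map onto BaggingClassifier; the fractions 0.75 and 0.5 are illustrative placeholders, and feature sampling mainly pays off on data with many more features than the two-feature moons set:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Random Subspaces: keep every training instance, sample only the features
subspaces_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500,
                                   bootstrap=False, max_samples=1.0,
                                   bootstrap_features=True, max_features=0.5, n_jobs=-1 )
# Random Patches: sample both the training instances and the features
patches_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500,
                                 bootstrap=True, max_samples=0.75,
                                 bootstrap_features=True, max_features=0.5, n_jobs=-1 )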
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1 )
bag_clf.fit( X_train, y_train )
y_pred = bag_clf.predict( X_test )
pred_score = accuracy_score( y_pred, y_test )
print( pred_score )
0.912
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.tree import DecisionTreeClassifier
import numpy as np
def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    return
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
plt.figure(figsize=(8,3))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.show()
bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True )
bag_clf.fit( X_train, y_train )
print( bag_clf.oob_score_ )
y_pred = bag_clf.predict( X_test )
print( accuracy_score(y_test, y_pred) )
# out-of-bag class-probability matrix for each training instance
df = bag_clf.oob_decision_function_
# print( df )
0.896
0.896
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
rf_clf = RandomForestClassifier( n_estimators=500, max_leaf_nodes=16, n_jobs=-1 )
rf_clf.fit( X_train, y_train )
y_pred_clf = rf_clf.predict( X_test )
print( accuracy_score( y_pred_clf, y_test ) )
extra_tree_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
extra_tree_clf.fit( X_train, y_train )
y_pred_clf = extra_tree_clf.predict( X_test )
print( accuracy_score( y_pred_clf, y_test ) )
0.928
0.912
from sklearn.datasets import load_iris
iris = load_iris()
rf_clf = RandomForestClassifier( n_estimators=500, n_jobs=-1 )
rf_clf.fit( iris.data, iris.target )
# rf_clf.feature_importances_ is ordered to match the order of the features in the dataset
for name, score in zip( iris.feature_names, rf_clf.feature_importances_ ):
    print( name, score )
sepal length (cm) 0.103714253389
sepal width (cm) 0.0229312213868
petal length (cm) 0.435857047862
petal width (cm) 0.437497477363
import matplotlib
from sklearn.datasets import fetch_openml
# fetch_mldata('MNIST original') has been removed from scikit-learn; fetch_openml is the current loader
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(mnist["data"], mnist["target"])
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.hot, interpolation="nearest")
    plt.axis("off")
    return
plot_digit(rf_clf.feature_importances_)
cbar = plt.colorbar(ticks=[rf_clf.feature_importances_.min(), rf_clf.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])
plt.show()
Reference: http://blog.csdn.net/GYQJN/article/details/45501185
For a binary classifier, the input is a training set $T=\{(x_{1},y_{1}),(x_{2},y_{2}),\dots,(x_{N},y_{N})\}$, where $x_{i}\in X \subseteq R^n$ and $y_{i}\in Y=\{-1,+1\}$.
The output is the final classifier $G(x)$.
Initialize the sample weights uniformly, $w_{1i}=1/N$. Then for $m=1,2,\dots,M$ ($M$ is the number of base learners in AdaBoost):
1. train a base classifier $G_{m}(x)$ on the data weighted by $w_{mi}$;
2. compute its weighted error rate $e_{m}=\sum_{i=1}^{N}w_{mi}\,I(G_{m}(x_{i})\neq y_{i})$;
3. compute the classifier weight $\alpha_{m}=\frac{1}{2}\ln\frac{1-e_{m}}{e_{m}}$;
4. update the sample weights $w_{m+1,i}=\frac{w_{mi}}{Z_{m}}\exp(-\alpha_{m}y_{i}G_{m}(x_{i}))$, where $Z_{m}$ is a normalization factor, so that misclassified samples receive larger weights.
Finally, build the linear combination of the base classifiers, $f(x)=\sum_{m=1}^{M}\alpha_{m}G_{m}(x)$, and take $G(x)=\mathrm{sign}(f(x))$ as the final classifier.
Because each round of AdaBoost depends on the base learners produced in the previous rounds, the base learners cannot be trained in parallel.
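To make the update rule concrete, here is a rough from-scratch sketch using decision stumps (the plain discrete AdaBoost described above, not the SAMME.R variant that scikit-learn uses by default); it assumes labels in {-1, +1}:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def adaboost_fit(X, y, M=50):
    # y is assumed to take values in {-1, +1}, e.g. 2*y01 - 1
    n = len(X)
    w = np.ones(n) / n                          # initial sample weights
    stumps, alphas = [], []
    for m in range(M):
        stump = DecisionTreeClassifier(max_depth=1)
        stump.fit(X, y, sample_weight=w)        # train G_m on the weighted data
        pred = stump.predict(X)
        err = np.clip(np.sum(w[pred != y]), 1e-10, 1 - 1e-10)   # weighted error e_m
        alpha = 0.5 * np.log((1 - err) / err)   # classifier weight alpha_m
        w = w * np.exp(-alpha * y * pred)       # up-weight the misclassified samples
        w = w / w.sum()                         # normalize (the Z_m factor)
        stumps.append(stump)
        alphas.append(alpha)
    return stumps, alphas

def adaboost_predict(stumps, alphas, X):
    # G(x) = sign( sum_m alpha_m * G_m(x) )
    f = sum(a * s.predict(X) for s, a in zip(stumps, alphas))
    return np.sign(f)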
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
plt.figure(figsize=(4,3))
plot_decision_boundary( ada_clf, X, y )
plt.show()
# an illustration of AdaBoost's basic idea: repeatedly up-weight the samples the previous learner misclassified
m = len(X_train)
plt.figure(figsize=(8, 3))
for subplot, learning_rate in ((121, 1), (122, 0.5)):
    sample_weights = np.ones(m)
    for i in range(5):
        plt.subplot(subplot)
        svm_clf = SVC(kernel="rbf", C=0.05, random_state=42)
        svm_clf.fit(X_train, y_train, sample_weight=sample_weights)
        y_pred = svm_clf.predict(X_train)
        sample_weights[y_pred != y_train] *= (1 + learning_rate)
        plot_decision_boundary(svm_clf, X, y, alpha=0.2)
        plt.title("learning_rate = {}".format(learning_rate), fontsize=16)
plt.subplot(121)
plt.text(-0.7, -0.65, "1", fontsize=14)
plt.text(-0.6, -0.10, "2", fontsize=14)
plt.text(-0.5, 0.10, "3", fontsize=14)
plt.text(-0.4, 0.55, "4", fontsize=14)
plt.text(-0.3, 0.90, "5", fontsize=14)
plt.show()
# a basic manual implementation of GBRT: each new tree is fit to the residuals of the previous ones
from sklearn.tree import DecisionTreeRegressor
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
    return
tree_reg1 = DecisionTreeRegressor( max_depth=2 )
tree_reg1.fit( X, y )
y2 = y - tree_reg1.predict( X )
tree_reg2 = DecisionTreeRegressor( max_depth=2 )
tree_reg2.fit( X, y2 )
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)
# t_pred = sum( tree.predict(X_new) for tree in ( tree_reg1, tree_reg2, tree_reg3 ) )
plt.figure(figsize=(9,9))
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.title("Residuals and tree predictions", fontsize=12)
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.title("Ensemble predictions", fontsize=12)
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=12)
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=12)
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=12)
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.show()
# sklearn's built-in GBRT implementation
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor( max_depth=2, n_estimators=3, learning_rate=1.0 )
gbrt.fit( X, y )
plt.figure( figsize=(8,3) )
plt.subplot(121)
plot_predictions( [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$n\_estimators=3$" )
gbrt = GradientBoostingRegressor( max_depth=2, n_estimators=30, learning_rate=1.0 )
plt.subplot( 122 )
gbrt.fit( X, y )
plot_predictions( [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$n\_estimators=30$" )
To choose the number of trees, an early stopping strategy can be used: first train a fairly large number of trees, then find the stage with the smallest validation MSE; the index of that stage is the number of trees we actually need. GradientBoostingRegressor also has a subsample hyperparameter: each tree is then trained on only that fraction of the training instances, similar in spirit to stochastic gradient descent (a warm_start-based variant that also sets subsample is sketched after the output below).
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split( X, y )
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit( X_train, y_train )
errors = [ mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict( X_val ) ]
best_n_estimators = np.argmin( errors ) + 1  # stages from staged_predict are 0-indexed, so add 1 to get the tree count
print( "best number of estimators is : ", best_n_estimators )
plt.figure( figsize=(5,3) )
plt.plot( errors )
plt.show()
gbrt_best = GradientBoostingRegressor( max_depth=2, n_estimators=best_n_estimators )
gbrt_best.fit( X_train, y_train )
best number of estimators is : 84
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=84, presort='auto', random_state=None,
subsample=1.0, verbose=0, warm_start=False)
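An alternative to training 120 trees up front is incremental early stopping with warm_start=True, adding trees one at a time and stopping once the validation error has not improved for several consecutive rounds. A minimal sketch reusing the X_train/X_val split above; the patience of 5 rounds and subsample=0.8 are illustrative choices, not values from the text:
gbrt = GradientBoostingRegressor( max_depth=2, warm_start=True, subsample=0.8 )
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit( X_train, y_train )             # warm_start=True keeps the trees already grown
    val_error = mean_squared_error( y_val, gbrt.predict(X_val) )
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:              # stop after 5 rounds without improvement
            break
print( "stopped at", gbrt.n_estimators, "trees" )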