from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
# 导入数据
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#———————投票分类器————————————————
# 三个基学习器
log_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = SVC()
# 投票分类器 硬投票(权值均为1) 软投票(有权值)
voting_clf = VotingClassifier( estimators=[("lr", log_clf), ("rf", rf_clf), ("svc", svm_clf)], voting="hard" )
# voting_clf.fit( X_train, y_train )
for clf in ( log_clf, rf_clf, svm_clf, voting_clf ):
clf.fit( X_train, y_train )
y_pred = clf.predict( X_test )
print( clf.__class__.__name__, accuracy_score(y_test, y_pred) )
#————————Bagging——————————————
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1 )
bag_clf.fit( X_train, y_train )
y_pred = bag_clf.predict( X_test )
pred_score = accuracy_score( y_pred, y_test )
print( pred_score )
#%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.tree import DecisionTreeClassifier
import numpy as np
def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
x1s = np.linspace(axes[0], axes[1], 100)
x2s = np.linspace(axes[2], axes[3], 100)
x1, x2 = np.meshgrid(x1s, x2s)
X_new = np.c_[x1.ravel(), x2.ravel()]
y_pred = clf.predict(X_new).reshape(x1.shape)
custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap, linewidth=10)
if contour:
custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
plt.axis(axes)
plt.xlabel(r"$x_1$", fontsize=18)
plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
return
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
plt.figure(figsize=(8,3))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.show()
bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True )
bag_clf.fit( X_train, y_train )
print( bag_clf.oob_score_ )
y_pred = bag_clf.predict( X_test )
print( accuracy_score(y_test, y_pred) )
# 输出bagging的概率矩阵
df = bag_clf.oob_decision_function_
# print( df )
#————————随机森林————————————
#随机森林一般是用bagging方法进行训练,每个决策树中使用的样本数都是全部的训练集,RandomForestClassifier中的参数与决策树基本一致,都是对决策树的形状等性质做一些规定
#相比于Bagging方法,随机森林引入了一些额外的随机性,因为它不是在所有的特征中选择最好的分类特征用于分离一个节点,而是在随机的一些特征中选择最优的节点分割方法。这会增大模型的偏差,但是减小了模型的方差。
#如果在bagging方法中设置模型随机选择特征进行基学习器的训练,那么它与随机森林等价(其他参数相同的情况下)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
rf_clf = RandomForestClassifier( n_estimators=500, max_leaf_nodes=16, n_jobs=-1 )
rf_clf.fit( X_train, y_train )
y_pred_clf = rf_clf.predict( X_test )
print( accuracy_score( y_pred_clf, y_test ) )
extra_tree_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
extra_tree_clf.fit( X_train, y_train )
y_pred_clf = extra_tree_clf.predict( X_test )
print( accuracy_score( y_pred_clf, y_test ) )
from sklearn.datasets import load_iris
iris = load_iris()
rf_clf = RandomForestClassifier( n_estimators=500, n_jobs=-1 )
rf_clf.fit( iris.data, iris.target )
# rf_clf.feature_importances_中已经按照样本中特征的顺序进行了排序,与特征一一顺序对应
for name, score in zip( iris.feature_names, rf_clf.feature_importances_ ):
print( name, score )
import matplotlib
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(mnist["data"], mnist["target"])
def plot_digit(data):
image = data.reshape(28, 28)
plt.imshow(image, cmap = matplotlib.cm.hot,
interpolation="nearest")
plt.axis("off")
return
plot_digit(rf_clf.feature_importances_)
cbar = plt.colorbar(ticks=[rf_clf.feature_importances_.min(), rf_clf.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])
plt.show()
#——————————Boosting———————————
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
plt.figure(figsize=(4,3))
plot_decision_boundary( ada_clf, X, y )
plt.show()
# AdaBoost的基本思想示例
m = len(X_train)
plt.figure(figsize=(8, 3))
for subplot, learning_rate in ((121, 1), (122, 0.5)):
sample_weights = np.ones(m)
for i in range(5):
plt.subplot(subplot)
svm_clf = SVC(kernel="rbf", C=0.05, random_state=42)
svm_clf.fit(X_train, y_train, sample_weight=sample_weights)
y_pred = svm_clf.predict(X_train)
sample_weights[y_pred != y_train] *= (1 + learning_rate)
plot_decision_boundary(svm_clf, X, y, alpha=0.2)
plt.title("learning_rate = {}".format(learning_rate), fontsize=16)
plt.subplot(121)
plt.text(-0.7, -0.65, "1", fontsize=14)
plt.text(-0.6, -0.10, "2", fontsize=14)
plt.text(-0.5, 0.10, "3", fontsize=14)
plt.text(-0.4, 0.55, "4", fontsize=14)
plt.text(-0.3, 0.90, "5", fontsize=14)
plt.show()
#————————Gradient Boosting————————————
#GB与AdaBoost类似,也是依次在集成方法增加学习器,但是不同的是,AdaBoost是在每次迭代过程中调节训练数据的权重,而GB是在每次迭代过程中,采用之前的基学习器的残差来训练得到新的基学习器
# Gradient Boosted Regression Trees也叫GBRT
# sklearn中集成了GBRT,里面有一个学习率的超参数。如果这个参数设置的比较小,则需要更多的树来拟合训练集,但是预测效果一般会更好。
# 对于相同的GBRT,树的个数太多,容易造成过拟合现象
# GBRT的基本实现
from sklearn.tree import DecisionTreeRegressor
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
x1 = np.linspace(axes[0], axes[1], 500)
y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
plt.plot(X[:, 0], y, data_style, label=data_label)
plt.plot(x1, y_pred, style, linewidth=2, label=label)
if label or data_label:
plt.legend(loc="upper center", fontsize=16)
plt.axis(axes)
return
tree_reg1 = DecisionTreeRegressor( max_depth=2 )
tree_reg1.fit( X, y )
y2 = y - tree_reg1.predict( X )
tree_reg2 = DecisionTreeRegressor( max_depth=2 )
tree_reg2.fit( X, y2 )
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)
# t_pred = sum( tree.predict(X_new) for tree in ( tree_reg1, tree_reg2, tree_reg3 ) )
plt.figure(figsize=(9,9))
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.title("Residuals and tree predictions", fontsize=12)
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.title("Ensemble predictions", fontsize=12)
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=12)
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=12)
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=12)
plt.ylabel("$y$", fontsize=12, rotation=0)
plt.show()
# sklearn中集成了GBRT
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor( max_depth=2, n_estimators=3, learning_rate=1.0 )
gbrt.fit( X, y )
plt.figure( figsize=(8,3) )
plt.subplot(121)
plot_predictions( [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$n\_estimators=3$" )
gbrt = GradientBoostingRegressor( max_depth=2, n_estimators=30, learning_rate=1.0 )
plt.subplot( 122 )
gbrt.fit( X, y )
plot_predictions( [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$n\_estimators=200$" )
#为了防止过拟合,可以采用early stopping的方法。先设置一个较大的树的个数,然后从这里面找出MSE最小的一棵树,这棵树所在的下标就是我们最终需要的树的个数
# GBRT中也包含subsample的超参数,相当于在训练一棵树的时候,只采用这么多的训练样本,类似于随机梯度下降的方法。
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split( X, y )
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit( X_train, y_train )
errors = [ mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict( X_val ) ]
best_n_estimators = np.argmin( errors )
print( "best number of estimators is : ", best_n_estimators )
plt.figure( figsize=(5,3) )
plt.plot( errors )
plt.show()
gbrt_best = GradientBoostingRegressor( max_depth=2, n_estimators=best_n_estimators )
gbrt_best.fit( X_train, y_train )
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=84, presort='auto', random_state=None,
subsample=1.0, verbose=0, warm_start=False)
#————————