Random forests are closely tied to bagging (random sampling with replacement): n samples are drawn from the original data with replacement, and this is repeated m times. To reduce the influence of outliers, only a certain proportion of those n samples is randomly kept, and b of the features are selected for building each model, yielding m decision-tree classifiers.
The final classifier can be expressed as the sign of the (weighted) sum of the individual tree classifiers $T_i$: $\mathrm{sign}\left[\sum_i T_i(x)\right]$.
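As a concrete illustration of this bagging-plus-voting scheme (a minimal sketch, not the sklearn implementation), assume a binary problem whose labels are encoded as -1/+1 so the sign-of-sum aggregation applies; the helper names bagging_fit and bagging_predict are made up for this example:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, n_trees=25, rng=None):
    # Train n_trees decision trees, each on its own bootstrap sample
    rng = np.random.default_rng(rng)
    n = X.shape[0]
    trees = []
    for _ in range(n_trees):
        idx = rng.integers(0, n, size=n)                     # n samples drawn with replacement
        tree = DecisionTreeClassifier(max_features='sqrt')   # random feature subset at each split
        trees.append(tree.fit(X[idx], y[idx]))
    return trees

def bagging_predict(trees, X):
    # Aggregate the trees: sign of the summed votes, assuming labels are -1/+1
    return np.sign(np.sum([t.predict(X) for t in trees], axis=0))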
Random forests therefore have two layers of randomness: random sampling of the training examples (bootstrap) and random selection of the features.
The full sample is not used to train every tree because that would reduce the diversity among the trees and weaken the model's ability to generalize.
Selecting the b features: for each individual classifier, add noise to a feature and compute $D = (\text{prediction error after adding noise}) - (\text{original prediction error})$. The larger D is, the more important the feature; the features are ranked by D and the top b are kept.
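This noise-injection measure is essentially permutation importance. A minimal sketch under that reading, assuming an already fitted classifier clf and held-out arrays X_val, y_val (all names here are illustrative); permuting a column plays the role of "adding noise", and the drop in accuracy plays the role of D:

import numpy as np

def rank_features_by_noise(clf, X_val, y_val, rng=None):
    # D for feature j = (error after permuting column j) - (original error),
    # which is exactly the accuracy drop computed below
    rng = np.random.default_rng(rng)
    base = clf.score(X_val, y_val)          # accuracy on the untouched data
    D = []
    for j in range(X_val.shape[1]):
        X_noisy = X_val.copy()
        rng.shuffle(X_noisy[:, j])          # "add noise" by permuting feature j
        D.append(base - clf.score(X_noisy, y_val))
    return np.argsort(D)[::-1]              # feature indices, most important first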
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
wine = load_wine()
# Instantiate the models
# Fit the instantiated models on the training set via the fit interface
# Evaluate the test set with the other interfaces of the trained models (score, predict, ...)
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
# clf = DecisionTreeClassifier(random_state=0)
# rfc = RandomForestClassifier(random_state=0)
# clf = clf.fit(Xtrain, Ytrain)
# rfc = rfc.fit(Xtrain, Ytrain)
#
# score_c = clf.score(Xtest, Ytest)
# score_r = rfc.score(Xtest, Ytest)
#
# print("Single Tree:{}".format(score_c),
# 'Random Forest:{}'.format(score_r))
# Cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# rfc = RandomForestClassifier(n_estimators=25)
# rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10)
#
# clf = DecisionTreeClassifier()
# clf_s = cross_val_score(clf, wine.data, wine.target, cv=10)
#
# plt.plot(range(1, 11), rfc_s, label='RandomForest')
# plt.plot(range(1, 11), clf_s, label='DecisionTree')
# plt.legend()
# plt.show()
# rfc_l = []
# clf_l = []
# for i in range(10):
#     rfc = RandomForestClassifier(n_estimators=25)
#     rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
#     rfc_l.append(rfc_s)
#     clf = DecisionTreeClassifier()
#     clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
#     clf_l.append(clf_s)
# plt.plot(range(1, 11), rfc_l, label='RandomForest')
# plt.plot(range(1, 11), clf_l, label='DecisionTree')
# plt.legend()
# plt.show()
# Learning curve for the parameter n_estimators
superpa = []
for i in range(200):
    rfc = RandomForestClassifier(n_estimators=i+1, n_jobs=1)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    superpa.append(rfc_s)
print(max(superpa), superpa.index(max(superpa)) + 1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201), superpa)
plt.show()
import numpy as np
from scipy.special import comb
# Probability that a 25-tree forest is wrong when each tree errs with probability 0.2:
# the majority vote fails only if 13 or more of the 25 trees are wrong
np.array([comb(25, i)*(0.2**i)*(1-0.2)**(25-i) for i in range(13, 26)]).sum()
rfc = RandomForestClassifier(n_estimators=25, oob_score=True)
rfc = rfc.fit(wine.data, wine.target)
# Inspect all the trees in the forest and their parameters
rfc.estimators_
# Important attribute oob_score_: the score on the out-of-bag data
rfc.oob_score_
# Feature importances
rfc.feature_importances_
# apply returns, for every tree, the index of the leaf each sample in Xtest falls into
rfc.apply(Xtest)
# Predicted probability of each class label
rfc.predict_proba(Xtest)
import sklearn
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
boston = load_boston()
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
cross_val_score(regressor, boston.data, boston.target, cv=10,
                scoring='neg_mean_squared_error')
# The full list of model-evaluation (scoring) metrics available in sklearn
sorted(sklearn.metrics.SCORERS.keys())
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
# The sklearn class for imputing missing values
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0] # 506
n_features = X_full.shape[1] # 13
# Turn the original data set into one that contains missing values
# First choose the proportion of missing entries; at 50%, 3289 of the 506*13 entries are missing
rng = np.random.RandomState(0)
missing_rate = 0.5
# np.floor rounds down and returns a float ending in .0
n_missing_samples = int(np.floor(n_samples*n_features*missing_rate))
missing_features = rng.randint(0, n_features, n_missing_samples)
missing_samples = rng.randint(0, n_samples, n_missing_samples)
# If the number of missing entries were small (less than the 506 samples),
# rng.choice could be used to draw non-repeating random indices instead:
# missing_samples = rng.choice(dataset.data.shape[0], n_missing_samples, replace=False)
# Build the data set with missing values from the missing-index arrays
X_missing = X_full.copy()
y_missing = y_full.copy()
X_missing[missing_samples, missing_features] = np.nan
X_missing = pd.DataFrame(X_missing)
# Impute with the mean (the median, the mode, etc. would also work)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_missing_mean = imp_mean.fit_transform(X_missing)
# Impute with the constant 0
imp_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
X_missing_0 = imp_0.fit_transform(X_missing)
# Impute missing values with a random forest
X_missing_reg = X_missing.copy()
# Count the missing values in each feature and process the features in ascending order of that count
# argsort returns the indices that sort the counts from smallest to largest
sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values
for i in sortindex:
    # Use a new variable for the intermediate steps instead of modifying the original data directly
    df = X_missing_reg
    # Take the current column as the target to be predicted
    fillc = df.iloc[:, i]
    # The new feature matrix: all columns except the current one, plus the original target y_full
    df = pd.concat([df.iloc[:, df.columns != i], pd.DataFrame(y_full)], axis=1)
    # Fill the missing values of this new feature matrix with 0
    df_0 = SimpleImputer(missing_values=np.nan,
                         strategy='constant', fill_value=0).fit_transform(df)
    # Training target: the samples whose current feature is not missing
    Ytrain = fillc[fillc.notnull()]
    # "Test" target: the samples whose current feature is missing and needs to be filled
    Ytest = fillc[fillc.isnull()]
    # Training features: rows of the new matrix indexed by the training samples
    Xtrain = df_0[Ytrain.index, :]
    # Test features: rows of the new matrix indexed by the samples to be filled
    Xtest = df_0[Ytest.index, :]
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(Xtrain, Ytrain)
    # Predict the missing values with the trained forest
    Ypredict = rfc.predict(Xtest)
    # Write the predictions back into the empty positions of the current column
    X_missing_reg.loc[X_missing_reg.iloc[:, i].isnull(), i] = Ypredict
# Build a regression model on each version of the data
X = [X_full, X_missing_mean, X_missing_0, X_missing_reg]
mse = []
std = []
for x in X:
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    scores = cross_val_score(estimator, x, y_full, scoring='neg_mean_squared_error',
                             cv=5).mean()
    mse.append(scores * -1)
# Pair each data set with its MSE
[*zip([X_full, X_missing_mean, X_missing_0, X_missing_reg], mse)]
x_labels = ['Full data',
'Mean Imputation',
'Zero Imputation',
'Regressor Imputation']
colors = ['r', 'g', 'b', 'orange']
# Create the figure
plt.figure(figsize=(12, 6))
# Add a subplot
ax = plt.subplot(111)
# Draw a horizontal bar for each imputation strategy
for i in np.arange(len(mse)):
    ax.barh(i, mse[i], color=colors[i], alpha=0.6, align='center')
ax.set_title('Imputation Techniques with Boston Data')
# Set the range of the x-axis
ax.set_xlim(left=np.mean(mse) * 0.9,
right=np.max(mse) * 1.1)
ax.set_yticks(np.arange(len(mse)))
ax.set_xlabel('MSE')
# Label the y-axis ticks
ax.invert_yaxis()
ax.set_yticklabels(x_labels)
plt.show()