为了进行决策树与随机森林的简单实现与对比,使用一组分类和回归数据集。
from sklearn import datasets
# Dataset loaders, each paired with the display name used in the printed report.
# Every loader is later called with return_X_y=True.

# Classification benchmarks.
loader_classification = [
    ("Iris", datasets.load_iris),
    ("Digits", datasets.load_digits),
    ("Wine", datasets.load_wine),
    ("Breast Cancer", datasets.load_breast_cancer),
]

# Regression benchmarks.
# NOTE(review): load_boston is not shipped by newer scikit-learn releases --
# confirm it exists in the installed version before running.
loader_regression = [
    ("Boston", datasets.load_boston),
    ("Diabetes", datasets.load_diabetes),
    ("Linnerud", datasets.load_linnerud),
]
交叉验证的折数为10;分类任务对比准确率(accuracy),回归任务以均方误差(mean squared error)打分,并以其平方根(RMSE)展示;随机森林的决策树(自助采样子数据集)个数为20(n_estimators=20)。
结果如下,代码附后:
-- Dataset: [Iris]
Accuracy (%)
Train Test
Decision Tree 100.00 96.00
Random Forest 99.78 96.00
-- Dataset: [Digits]
Accuracy (%)
Train Test
Decision Tree 100.00 82.86
Random Forest 99.99 93.99
-- Dataset: [Wine]
Accuracy (%)
Train Test
Decision Tree 100.00 87.06
Random Forest 100.00 96.08
-- Dataset: [Breast Cancer]
Accuracy (%)
Train Test
Decision Tree 100.00 91.22
Random Forest 99.94 95.44
-- Dataset: [Boston]
RMSE
Train Test
Decision Tree 0.00 5.97
Random Forest 1.33 4.75
-- Dataset: [Diabetes]
RMSE
Train Test
Decision Tree 0.00 83.39
Random Forest 23.49 59.26
-- Dataset: [Linnerud]
RMSE
Train Test
Decision Tree 0.00 19.98
Random Forest 6.48 18.42
结论:单个决策树在训练集上总能达到100%准确率或0均方根误差,而随机森林通常做不到这一点(仅在Wine数据集上训练准确率同为100%);但在测试集上,随机森林的表现不差于决策树(Iris上持平,其余数据集均更好)。这说明随机森林能够减轻过拟合,获得更好的泛化性能。
实现过程:
基于数据集对分类进行十折交叉验证,返回accuracy,随机森林子数据集个数选为20。
def cross_val_dt_rt(load_fun) -> (dict, dict):
    """Run 10-fold cross-validation for a decision tree and a random forest classifier.

    Args:
        load_fun: Dataset loader that accepts ``return_X_y=True`` and returns
            the feature matrix and the target labels.

    Returns:
        A pair ``(decision_tree_scores, random_forest_scores)``; each element
        is the dict produced by ``cross_validate`` with accuracy scoring and
        train scores included.
    """
    features, labels = load_fun(return_X_y=True)

    # Settings shared by both models so they are evaluated identically.
    shared_settings = {
        "X": features,
        "y": labels,
        "cv": 10,
        "scoring": "accuracy",
        "return_train_score": True,
    }

    # The tree is evaluated first, then the 20-tree forest.
    tree_result = cross_validate(estimator=DecisionTreeClassifier(), **shared_settings)
    forest_result = cross_validate(
        estimator=RandomForestClassifier(n_estimators=20), **shared_settings
    )
    return tree_result, forest_result
基于数据集对回归进行十折交叉验证,以均方误差(MSE)打分;输出结果时再开平方得到均方根误差(RMSE)。
def cross_val_dt_rf_continuous(load_fun) -> (dict, dict):
    """Run 10-fold cross-validation for a decision tree and a random forest regressor.

    Args:
        load_fun: Regression dataset loader that accepts ``return_X_y=True``
            and returns the feature matrix and the continuous targets.

    Returns:
        A pair ``(decision_tree_scores, random_forest_scores)``; each element
        is the dict produced by ``cross_validate``, scored with mean squared
        error and including train scores.
    """
    # Load Data
    X, y = load_fun(return_X_y=True)

    # Wrapping mean_squared_error directly keeps the per-fold scores as plain
    # MSE values, so callers can take their square root to report RMSE.
    mse_scorer = make_scorer(mean_squared_error)

    # Cross-validate Decision Tree and Random Forest
    dt_scores = cross_validate(
        estimator=DecisionTreeRegressor(),
        X=X,
        y=y,
        cv=10,
        scoring=mse_scorer,
        return_train_score=True,
    )
    rf_scores = cross_validate(
        estimator=RandomForestRegressor(n_estimators=20),
        X=X,
        y=y,
        cv=10,
        scoring=mse_scorer,
        return_train_score=True,
    )
    # Callers unpack the result as (decision tree, random forest); without
    # this return the function yields None and the unpacking fails.
    return dt_scores, rf_scores
分别将数据集进行对比和输出。
# Fix NumPy's global seed for reproducibility.
np.random.seed(0)

# Report mean train/test accuracy (in percent) for each classification dataset.
for name, load_fun in loader_classification:
    scores_dt, scores_rf = cross_val_dt_rt(load_fun)

    # Average over the folds and convert the fractions to percentages.
    dt_train = scores_dt["train_score"].mean() * 100
    dt_test = scores_dt["test_score"].mean() * 100
    rf_train = scores_rf["train_score"].mean() * 100
    rf_test = scores_rf["test_score"].mean() * 100

    # Table header, then one row per model.
    print(f"-- Dataset: [{name}]")
    print(f" \t\tAccuracy (%)")
    print(f" \tTrain \tTest")
    print(f"Decision Tree\t{dt_train:.2f}\t\t{dt_test:.2f}")
    print(f"Random Forest\t{rf_train:.2f}\t\t{rf_test:.2f}\n\n")
# Report train/test RMSE for each regression dataset.
for name, load_fun in loader_regression:
    scores_dt, scores_rf = cross_val_dt_rf_continuous(load_fun)

    # RMSE is taken as the square root of the fold-averaged MSE.
    dt_train = np.sqrt(scores_dt["train_score"].mean())
    dt_test = np.sqrt(scores_dt["test_score"].mean())
    rf_train = np.sqrt(scores_rf["train_score"].mean())
    rf_test = np.sqrt(scores_rf["test_score"].mean())

    # Table header, then one row per model.
    print(f"-- Dataset: [{name}]")
    print(f" \t\tRMSE")
    print(f" \tTrain \tTest")
    print(f"Decision Tree\t{dt_train:.2f}\t\t{dt_test:.2f}")
    print(f"Random Forest\t{rf_train:.2f}\t\t{rf_test:.2f}\n\n")
所需的导入如下:
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns