Rather than being a single machine-learning algorithm by itself, it builds multiple models on the data and combines all of their results, so as to obtain better regression or classification performance than any individual model.
from sklearn.ensemble import RandomForestClassifier
# X_train, X_test, y_train, y_test are assumed to have been prepared earlier (e.g. via train_test_split)
rfc=RandomForestClassifier()
rfc=rfc.fit(X_train,y_train)
result=rfc.score(X_test,y_test)  # accuracy on the held-out test set
The remaining parameters are the same as those of the decision tree, so they are not repeated here.
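As a concrete reminder of what that means, here is a minimal sketch (the specific values are illustrative, not tuned) showing the familiar decision-tree controls being passed straight to RandomForestClassifier, where they apply to every tree in the forest:

from sklearn.ensemble import RandomForestClassifier

# The same tree-level controls as DecisionTreeClassifier; values below are only examples
rfc = RandomForestClassifier(
    criterion="gini",       # or "entropy", exactly as in a single decision tree
    max_depth=5,            # limit the depth of each tree
    min_samples_leaf=2,     # minimum samples required at a leaf
    max_features="sqrt",    # number of features considered at each split
    random_state=0,
)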
n_estimators is the number of base estimators (trees). Its effect on the random forest's accuracy is essentially monotonic: the larger n_estimators, the better the model tends to perform, although the gains level off while the computational cost keeps growing once the forest is large enough.
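A quick way to see this claim (a sketch only; the exact numbers depend on the random state and the data split) is to compare the mean cross-validated accuracy of a few forest sizes on the wine data:

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score

wine = load_wine()
for n in (5, 25, 100):  # illustrative sizes, not tuned values
    rfc = RandomForestClassifier(n_estimators=n, random_state=0)
    score = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    print(n, score)  # accuracy generally rises with n, then plateaus

The comparison below then pits a random forest with default settings against a single decision tree on the wine dataset: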
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine=load_wine()
Xtrain,Xtest,Ytrain,Ytest=train_test_split(wine.data,wine.target,test_size=0.3)
clf=DecisionTreeClassifier(random_state=0)
rfc=RandomForestClassifier(random_state=0)
clf=clf.fit(Xtrain,Ytrain)
rfc=rfc.fit(Xtrain,Ytrain)
score_c=clf.score(Xtest,Ytest)
score_r=rfc.score(Xtest,Ytest)
print("Single Tree:",score_c)
print("Random Forest:",score_r)
Single Tree: 0.8888888888888888
Random Forest: 0.9629629629629629
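Even with all default settings, the random forest already outperforms the single decision tree on this train/test split.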
To show the difference between the two more intuitively, compare their scores across the folds of 10-fold cross-validation:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
wine=load_wine()
rfc=RandomForestClassifier(random_state=25)
rfc_s=cross_val_score(rfc,wine.data,wine.target,cv=10)
clf=DecisionTreeClassifier(random_state=0)
clf_s=cross_val_score(clf,wine.data,wine.target,cv=10)
plt.plot(range(1,11),rfc_s,label="RandomForest")
plt.plot(range(1,11),clf_s,label="Decision Tree")
plt.legend()
plt.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
wine=load_wine()
sup=[]
for i in range(200):
    # learning curve over n_estimators: one forest per value, scored by 10-fold CV
    rfc=RandomForestClassifier(n_estimators=i+1,random_state=25)
    rfc_s=cross_val_score(rfc,wine.data,wine.target,cv=10).mean()
    sup.append(rfc_s)
print(max(sup),sup.index(max(sup))+1)  # best mean score and the n_estimators that produced it
plt.figure(figsize=[20,5])
plt.plot(range(1,201),sup)
plt.show()  # this loop takes a long time to run; setting n_jobs=-1 would speed it up
In machine learning, the metric that describes how well a model performs on unseen data is called the generalization error: the lower it is, the better the model generalizes. The tuning below works through this in practice on the breast cancer dataset.
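For reference, a standard textbook decomposition (not derived in these notes) splits the expected generalization error into bias, variance, and irreducible noise:

E(f; D) = bias^2(x) + var(x) + ε^2

where the bias measures how far the model is from the true pattern, the variance measures its sensitivity to the particular training sample, and ε^2 is the noise floor of the data itself.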
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
data = load_breast_cancer()
#Quick baseline model to see how the classifier performs on this dataset as-is
rfc=RandomForestClassifier(n_estimators=100,random_state=90)
score_pre=cross_val_score(rfc,data.data,data.target,cv=10).mean()
print(score_pre)
0.9648809523809524
#Tune n_estimators: coarse sweep in steps of 10
scorel=[]
for i in range(0,200,10):
    rfc=RandomForestClassifier(n_estimators=i+1,random_state=90)
    score=cross_val_score(rfc,data.data,data.target,cv=10).mean()
    scorel.append(score)
print(max(scorel),(scorel.index(max(scorel))*10)+1)  # best mean score and the corresponding n_estimators
plt.figure(figsize=[20,5])
plt.plot(range(1,201,10),scorel)
plt.show()
0.9631265664160402 71
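GridSearchCV is imported above but not yet used; as one possible next step (a sketch only, with an illustrative grid centred on the value found above rather than a prescribed one), it could refine the search around n_estimators=71:

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

data = load_breast_cancer()
# Illustrative grid around the best coarse value; other parameters
# (e.g. max_depth, min_samples_leaf) could be searched the same way.
param_grid = {"n_estimators": list(range(65, 80))}
rfc = RandomForestClassifier(random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
print(GS.best_params_, GS.best_score_)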