A random forest trains a series of models on subsets of the data, drawn at random from the full training set. One way to draw a subset is to sample the data rows with replacement, just as in the bagging (bootstrap aggregation) method. The other is to give each decision tree only a randomly chosen subset of the feature attributes rather than all of them.
Ensemble methods are built on a base learner; here decision trees are used as the base learner to approximate a random forest by hand.
__author__ = 'mike-bowles'
import random
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plot
#read the wine data
file = 'winequality-red.csv'
xList = []
labels = []
names = []
firstLine = True
with open(file, 'r') as f:
    for line in f.readlines():
        if firstLine:
            names = line.strip().split(";")
            firstLine = False
        else:
            #split on semi-colon
            row = line.strip().split(";")
            #put labels in separate array
            labels.append(float(row[-1]))
            #remove label from row
            row.pop()
            #convert row to floats
            floatRow = [float(num) for num in row]
            xList.append(floatRow)
nrows = len(xList)
ncols = len(xList[0])

#take fixed test set: 30% of sample
random.seed(1)  #set seed so results are the same each run
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)  #random train/test split
idxTest.sort()
idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)]

#Define test and training attribute and label sets
xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]
#train a series of models on random subsets of the training data
#collect the models in a list and check error of composite as list grows
#maximum number of models to generate
numTreesMax = 30
#tree depth - typically at the high end
treeDepth = 12
#pick how many attributes will be used in each model.
# authors recommend 1/3 for regression problem
nAttr = 4
#initialize a list to hold models
modelList = []
indexList = []
predList = []
nTrainRows = len(yTrain)
for iTrees in range(numTreesMax):
    modelList.append(DecisionTreeRegressor(max_depth=treeDepth))
    #take random sample of attributes (without replacement)
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)
    #take a random (bootstrap) sample of training rows
    idxRows = []
    for i in range(int(0.5 * nTrainRows)):
        idxRows.append(random.choice(range(len(xTrain))))
    idxRows.sort()
    #build training set restricted to the sampled rows and attributes
    xRfTrain = []
    yRfTrain = []
    for i in range(len(idxRows)):
        temp = [xTrain[idxRows[i]][j] for j in idxAttr]
        xRfTrain.append(temp)
        yRfTrain.append(yTrain[idxRows[i]])
    modelList[-1].fit(xRfTrain, yRfTrain)
    #restrict xTest to the attributes selected for training
    xRfTest = []
    for xx in xTest:
        temp = [xx[i] for i in idxAttr]
        xRfTest.append(temp)
    latestOutSamplePrediction = modelList[-1].predict(xRfTest)
    predList.append(list(latestOutSamplePrediction))
#build cumulative prediction from first "n" models
mse = []
allPredictions = []
for iModels in range(len(modelList)):
    #average the predictions of the first "iModels + 1" trees
    prediction = []
    for iPred in range(len(xTest)):
        prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) / (iModels + 1))
    allPredictions.append(prediction)
    errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))]
    mse.append(sum([e * e for e in errors]) / len(yTest))
nModels = [i + 1 for i in range(len(modelList))]
plot.plot(nModels,mse)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Mean Squared Error')
plot.ylim((0.0, max(mse)))
plot.show()
print('Minimum MSE')
print(min(mse))
#printed output
#Depth 1
#Minimum MSE
#0.52666715461
#Depth 5
#Minimum MSE
#0.426116327584
#Depth 12
#Minimum MSE
#0.38508387863
Note: the single-attribute example used earlier cannot be run through a random forest, because with only one attribute there is nothing meaningful to sample attributes from. The random forest code is otherwise very similar to the Bagging code. The difference is that before the iTrees loop, a variable nAttr specifies how many attributes to draw at random (the recommendation is roughly 1/3 of the attributes for regression problems and the square root of the number of attributes for classification problems). Inside the iTrees loop, the rows of the attribute matrix are sampled with replacement (the same as in Bagging), and in addition the columns of the attribute matrix are sampled at random without replacement; a decision tree is then trained on that subset and scored on the test data.
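As a small illustration of the nAttr rule of thumb (the variable names below are just for this example; only the attribute count comes from the wine data):

from math import sqrt
ncols = 11                                      #the wine data has 11 attributes
nAttrRegression = max(1, int(ncols / 3))        #regression: about 1/3 of the attributes -> 3
nAttrClassification = max(1, int(sqrt(ncols)))  #classification: about sqrt of the attributes -> 3
#11 / 3 is roughly 3.7, which the listing above rounds up to nAttr = 4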
In short, a random forest combines two methods: Bagging and random attribute selection. Random attribute selection is really a modification of the binary decision tree base learner. These differences may not look fundamental, but they give random forests performance characteristics that differ from Bagging and from gradient boosting. Research suggests that random forests are better suited to wide, sparse attribute spaces, such as text mining problems. Compared with gradient boosting, random forests are also easier to parallelize, because each base learner can be trained independently, whereas in gradient boosting each base learner depends on the output of the previous one.
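To make the parallelization point concrete, here is a minimal sketch that trains the bootstrap trees on separate workers with joblib (joblib is an assumption here, not used elsewhere in this chapter; sklearn's own RandomForestRegressor exposes the same idea through its n_jobs parameter). Only the row sampling is shown; attribute sampling would be added the same way. It reuses xTrain and yTrain from the listing above:

from joblib import Parallel, delayed
from sklearn.tree import DecisionTreeRegressor
import random

def trainOneTree(xTrain, yTrain, treeDepth, seed):
    #each tree fits its own bootstrap sample, independent of every other tree
    rng = random.Random(seed)
    idxRows = [rng.randrange(len(xTrain)) for _ in range(int(0.5 * len(xTrain)))]
    model = DecisionTreeRegressor(max_depth=treeDepth)
    model.fit([xTrain[i] for i in idxRows], [yTrain[i] for i in idxRows])
    return model

#no tree depends on another's output, so all 30 can train at once
models = Parallel(n_jobs=4)(delayed(trainOneTree)(xTrain, yTrain, 12, s) for s in range(30))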
The Python sklearn.ensemble package provides a random forest implementation that can be called directly:
sklearn.ensemble.RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)
The fit method trains the model and predict produces predictions. Below, the same wine data is processed with this random forest implementation.
import numpy
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
import pylab as plot
#read the wine data
file = 'winequality-red.csv'
xList = []
labels = []
names = []
firstLine = True
with open(file, 'r') as f:
    for line in f.readlines():
        if firstLine:
            names = line.strip().split(";")
            firstLine = False
        else:
            #split on semi-colon
            row = line.strip().split(";")
            #put labels in separate array
            labels.append(float(row[-1]))
            #remove label from row
            row.pop()
            #convert row to floats
            floatRow = [float(num) for num in row]
            xList.append(floatRow)
nrows = len(xList)
ncols = len(xList[0])
X = numpy.array(xList)
y = numpy.array(labels)
wineNames = numpy.array(names)
#take fixed holdout set 30% of data rows
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531)
#train random forest at a range of ensemble sizes in order to see how the mse changes
mseOos = []
nTreeList = range(50, 500, 10)
for iTrees in nTreeList:
    depth = None
    maxFeat = 4  #try tweaking
    wineRFModel = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, oob_score=False, random_state=531)
    wineRFModel.fit(xTrain, yTrain)
    #Accumulate mse on test set
    prediction = wineRFModel.predict(xTest)
    mseOos.append(mean_squared_error(yTest, prediction))  #error on the held-out test data

print("MSE")
print(mseOos[-1])
#plot training and test errors vs number of trees in ensemble
plot.plot(nTreeList, mseOos)
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Mean Squared Error')
#plot.ylim([0.0, 1.1*max(mseOos)])
plot.show()
# Plot feature importance
featureImportance = wineRFModel.feature_importances_  #which attributes matter most
# normalize by max importance
featureImportance = featureImportance / featureImportance.max()
sorted_idx = numpy.argsort(featureImportance)
barPos = numpy.arange(sorted_idx.shape[0]) + .5
plot.barh(barPos, featureImportance[sorted_idx], align='center')
plot.yticks(barPos, wineNames[sorted_idx])
plot.xlabel('Variable Importance')
plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)
plot.show()
#printed output
#MSE
#0.314125711509
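One option not exercised above is oob_score=True, which scores each tree on the training rows left out of its bootstrap sample, giving a generalization estimate without a separate holdout set. A brief sketch reusing X, y, and the ensemble import from the listing above (the ensemble size here is arbitrary):

wineOobModel = ensemble.RandomForestRegressor(n_estimators=300, max_features=4, oob_score=True, random_state=531)
wineOobModel.fit(X, y)
print(wineOobModel.oob_score_)  #R-squared estimated from out-of-bag samples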
There is a corresponding random forest for classification problems as well: the ensemble package provides RandomForestClassifier for binary and multiclass problems, left here for further study.
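As a starting point, a minimal sketch of RandomForestClassifier on sklearn's built-in iris data (the parameter values are illustrative, not tuned):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
xTrain, xTest, yTrain, yTest = train_test_split(iris.data, iris.target, test_size=0.30, random_state=531)
#sqrt(number of attributes) per split is the usual recommendation for classification
irisRFModel = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=531)
irisRFModel.fit(xTrain, yTrain)
print(accuracy_score(yTest, irisRFModel.predict(xTest)))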