import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
boston = load_boston()
boston.data.shape
(506, 13)
boston.target.shape
(506,)
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
class myrf:
    # list holding the fitted trees
    trees = []
    # random seed
    random_state = 0
    # number of trees
    n_estimators = 10
    # maximum number of features considered per split
    max_features = 10
    # maximum tree depth
    max_depth = 10
    # minimum variance reduction required to split a node
    min_change = 0.001
    # index of the tree currently being built/predicted
    cur_tree = 0
    # minimum number of samples required to split a node
    min_samples_split = 0
    # minimum number of samples required in a leaf
    min_samples_leaf = 0
    # fraction of the samples used to build each tree
    sample_ratio = 0.9
    # number of parallel workers used when building/predicting
    n_jobs = 10

    # Total squared deviation of y (variance times sample count).
    # Dividing by the total sample count would not change which split wins,
    # since it is the same for every candidate, so it is omitted.
    def get_variance(self, dataSet):
        return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]

    # mean of y
    def get_mean(self, dataSet):
        return np.mean(dataSet[:, -1])
    # split the samples on a feature boundary value
    def SplitDataSet(self, dataSet, feature, value):
        dataSet = dataSet[dataSet[:, feature].argsort()]
        for i in range(np.shape(dataSet)[0] - 1):
            # after sorting, cut right after the last row whose feature equals value
            if dataSet[i][feature] == value and dataSet[i + 1][feature] != value:
                return dataSet[i + 1:, :], dataSet[0:i + 1, :]
    # choose the best feature and boundary value for a split
    def select_best_feature(self, dataSet):
        # number of candidate features (the last column is the target)
        feature_num = dataSet.shape[1] - 1
        features = np.random.choice(feature_num, self.max_features, replace=False)
        # best (lowest) total squared deviation found so far
        bestS = np.inf
        # best feature
        bestfeature = 0
        # boundary value of the best feature
        bestValue = 0
        S = self.get_variance(dataSet)
        # check whether there are enough samples to split
        if np.shape(dataSet)[0] < self.min_samples_split or np.shape(dataSet)[0] < self.min_samples_leaf:
            return None, self.get_mean(dataSet)
        for feature in features:
            dataSet = dataSet[dataSet[:, feature].argsort()]
            # try every split position along this feature
            for index in range(np.shape(dataSet)[0] - 1):
                # skip duplicate feature values
                if index != np.shape(dataSet)[0] - 1 and dataSet[index][feature] == dataSet[index + 1][feature]:
                    continue
                data0 = dataSet[0:index + 1, :]
                data1 = dataSet[index + 1:, :]
                # enforce the minimum leaf size
                if np.shape(data0)[0] < self.min_samples_leaf or np.shape(data1)[0] < self.min_samples_leaf:
                    continue
                newS = self.get_variance(data0) + self.get_variance(data1)
                if bestS > newS:
                    bestfeature = feature
                    bestValue = dataSet[index][feature]
                    bestS = newS
        # if the variance reduction is too small, give up: the node is not worth splitting
        if (S - bestS) < self.min_change:
            return None, self.get_mean(dataSet)
        return bestfeature, bestValue
    # build a single decision tree as a nested dict, recursively
    def createTree(self, dataSet, max_level, flag=0):
        if flag == 0:
            seqtree = self.cur_tree + 1
            self.cur_tree = seqtree
            print('Building tree ' + str(seqtree) + '...')
        bestfeature, bestValue = self.select_best_feature(dataSet)
        if bestfeature is None:
            if flag == 0:
                print('Tree ' + str(seqtree) + ' finished!')
            return bestValue
        retTree = {}
        max_level -= 1
        if max_level < 0:  # depth limit reached: return a leaf
            return self.get_mean(dataSet)
        retTree['bestFeature'] = bestfeature
        retTree['bestVal'] = bestValue
        # split into left and right subtrees
        lSet, rSet = self.SplitDataSet(dataSet, bestfeature, bestValue)
        # note: the depth counter is reset to self.max_depth for each child
        retTree['right'] = self.createTree(rSet, self.max_depth, 1)
        retTree['left'] = self.createTree(lSet, self.max_depth, 1)
        if flag == 0:
            print('Tree ' + str(seqtree) + ' finished!')
        return retTree
    # initialize the random forest
    def __init__(self, random_state, n_estimators, max_features, max_depth, min_change=0.001,
                 min_samples_split=0, min_samples_leaf=0, sample_ratio=0.9, n_jobs=10):
        self.trees = []
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_ratio = sample_ratio
        self.n_jobs = n_jobs
    # fit one tree on a subsample of the data and add it to the forest
    def get_one_tree(self, dataSet):
        X_train, X_test, y_train, y_test = train_test_split(dataSet[:, :-1], dataSet[:, -1],
                                                            train_size=self.sample_ratio,
                                                            random_state=self.random_state)
        X_train = np.concatenate((X_train, y_train.reshape((-1, 1))), axis=1)
        self.trees.append(self.createTree(X_train, self.max_depth))

    # build the forest, fitting the trees in parallel
    def fit(self, X, Y):
        dataSet = np.concatenate((X, Y.reshape(-1, 1)), axis=-1)
        Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.get_one_tree)(dataSet) for _ in range(self.n_estimators))
    # predict a single sample with one tree
    def treeForecast(self, tree, data):
        # a leaf is stored as a plain number, an internal node as a dict
        if not isinstance(tree, dict):
            return float(tree)
        if data[tree['bestFeature']] > tree['bestVal']:
            if not isinstance(tree['left'], dict):
                return float(tree['left'])
            else:
                return self.treeForecast(tree['left'], data)
        else:
            if not isinstance(tree['right'], dict):
                return float(tree['right'])
            else:
                return self.treeForecast(tree['right'], data)

    # predict the whole dataset with a single tree
    def createForeCast(self, tree, dataSet):
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('Tree ' + str(seqtree) + ' predicting...')
        l = len(dataSet)
        predict = np.mat(np.zeros((l, 1)))
        for i in range(l):
            predict[i, 0] = self.treeForecast(tree, dataSet[i, :])
        print('Tree ' + str(seqtree) + ' prediction finished!')
        return predict
    # accumulate one tree's predictions into the running total
    def update_predict(self, predict, tree, X):
        predict += self.createForeCast(tree, X)

    # random forest prediction: average over all trees
    def predict(self, X):
        self.cur_tree = 0
        l = len(X)
        predict = np.mat(np.zeros((l, 1)))
        Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.update_predict)(predict, tree, X) for tree in self.trees)
        # average the predictions of the individual trees
        predict /= self.n_estimators
        return predict

    # R^2 score of the model on the given data
    def get_score(self, target, X):
        return r2_score(target, self.predict(X))
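Each fitted tree is stored as a nested dict whose internal nodes hold 'bestFeature', 'bestVal', 'left' and 'right', and whose leaves are plain numbers. A minimal sketch to peek at that structure, using a made-up toy regression set (the names X_toy, y_toy, toy_rf are mine, not part of the original code):
# Hypothetical toy data, only to inspect the structure produced by createTree.
rng = np.random.RandomState(0)
X_toy = rng.rand(60, 3)                         # 60 samples, 3 features
y_toy = 2.0 * X_toy[:, 0] + 0.1 * rng.rand(60)  # target mostly driven by feature 0
toy_rf = myrf(random_state=0, n_estimators=1, max_features=2, max_depth=3)
toy_rf.fit(X_toy, y_toy)
root = toy_rf.trees[0]
print(type(root))                               # dict for an internal node, float for a pure leaf
if isinstance(root, dict):
    print(root['bestFeature'], root['bestVal'])  # splitting feature index and boundary value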
rf1 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=-1)
rf1.fit(boston.data, boston.target)
Building tree 1...
Building tree 2...
Building tree 3...
Building tree 4...
Tree 4 finished!
Building tree 5...
Tree 2 finished!Tree 3 finished!
Building tree 6...
Building tree 7...
Tree 1 finished!
Building tree 8...
Tree 6 finished!
Building tree 9...
Tree 5 finished!Tree 7 finished!
Tree 8 finished!
Building tree 10...
Tree 9 finished!
Tree 10 finished!
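After fit returns, the forest should hold one nested-dict tree per estimator; a quick sanity check (hedged, since it just inspects the trees list):
len(rf1.trees)   # expected to equal n_estimators, i.e. 10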
rf1.get_score(boston.target, boston.data)
Tree 1 predicting...
Tree 2 predicting...
Tree 3 predicting...
Tree 1 prediction finished!
Tree 4 predicting...
Tree 3 prediction finished!Tree 5 predicting...
Tree 6 predicting...
Tree 6 prediction finished!Tree 2 prediction finished!
Tree 7 predicting...
Tree 7 prediction finished!
Tree 8 predicting...
Tree 8 prediction finished!
Tree 5 prediction finished!
Tree 9 predicting...
Tree 4 prediction finished!Tree 9 prediction finished!
Tree 10 predicting...
Tree 10 prediction finished!
0.9302502640348399
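The 0.93 above is an R^2 computed on the same samples the forest was fitted on, so it mainly reflects training fit. A minimal sketch of a held-out evaluation using the utilities already imported (the split variable names are mine, and the resulting score will differ from the in-sample number above):
# Hypothetical held-out evaluation; variable names are illustrative only.
X_tr, X_te, y_tr, y_te = train_test_split(boston.data, boston.target,
                                          test_size=0.3, random_state=2)
rf_holdout = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10,
                  min_change=0.001, min_samples_split=20, n_jobs=-1)
rf_holdout.fit(X_tr, y_tr)
rf_holdout.get_score(y_te, X_te)   # test-set R^2 (value depends on the split)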