手动实现随机森林并做数据实验

获取波士顿房价数据集

import numpy as np
from numpy import *
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
# Load the Boston housing data via scikit-learn's `load_boston`.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this line
# needs an older scikit-learn release — confirm the pinned version.
boston = load_boston()
# Notebook-style echo of the feature matrix shape; the tuple on the next line
# is the recorded output (506 samples, 13 features).
boston.data.shape
(506, 13)
# Echo of the target vector shape, followed by its recorded output.
boston.target.shape
(506,)

搭建随机森林

建立随机森林类

import warnings 
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
class myrf:
    """Random-forest regressor assembled from hand-written regression trees.

    Every tree is a nested ``dict`` with the keys ``'bestFeature'``,
    ``'bestVal'``, ``'left'`` and ``'right'``; a leaf is a plain number (the
    mean target of the rows that reached it).  ``'left'`` holds the rows whose
    feature value is strictly greater than ``'bestVal'`` and ``'right'`` holds
    the rest.

    Data sets handled internally are 2-D NumPy arrays whose last column is the
    regression target and whose other columns are the features.
    """

    # Class-level defaults.  ``__init__`` rebinds every one of them on the
    # instance, so the list below is only a fallback and is never shared by
    # normally constructed forests.
    # Fitted trees.
    trees = []
    # Seed used for sub-sampling and feature selection.
    random_state = 0
    # Number of trees in the forest.
    n_estimators = 10
    # Number of candidate features examined at each split.
    max_features = 10
    # Maximum number of split levels per tree.
    max_depth = 10
    # Minimum impurity decrease required to accept a split.
    min_change = 0.001
    # Running counter used only for the progress messages.
    cur_tree = 0
    # Minimum number of rows a node needs before a split is attempted.
    min_samples_split = 0
    # Minimum number of rows each child of a split must keep.
    min_samples_leaf = 0
    # Fraction of the training rows sampled for each tree.
    sample_radio = 0.9
    # Number of parallel workers handed to joblib.
    n_jobs = 10

    def get_varience(self, dataSet):
        """Return the target variance times the row count.

        This equals the sum of squared deviations of the target column.  The
        result is not divided by the total sample count because only the
        relative ordering of candidate splits matters.
        """
        return np.var(dataSet[:, -1]) * dataSet.shape[0]

    def get_mean(self, dataSet):
        """Return the mean of the target (last) column."""
        return np.mean(dataSet[:, -1])

    def SplitDataSet(self, dataSet, feature, value):
        """Split ``dataSet`` on ``feature`` at threshold ``value``.

        Returns:
            ``(upper, lower)`` where ``upper`` holds the rows with
            ``row[feature] > value`` and ``lower`` the rows with
            ``row[feature] <= value``; both are sorted by ``feature``.
            Either part may be empty (for instance when ``value`` is the
            column maximum).
        """
        ordered = dataSet[dataSet[:, feature].argsort()]
        # Index just past the last row whose feature value is <= ``value``.
        cut = np.searchsorted(ordered[:, feature], value, side='right')
        return ordered[cut:, :], ordered[:cut, :]

    def select_best_feature(self, dataSet, rng=None):
        """Search a random subset of features for the best split.

        Args:
            dataSet: 2-D array, target in the last column.
            rng: optional ``np.random.RandomState`` used to draw the candidate
                features; the global NumPy generator is used when omitted.

        Returns:
            ``(feature, threshold)`` for the best split found, or
            ``(None, mean_target)`` when the node must become a leaf: too few
            rows, no admissible split, or an impurity gain below
            ``min_change``.
        """
        n_rows = dataSet.shape[0]
        feature_num = dataSet.shape[1] - 1
        # Sampling without replacement cannot draw more features than exist.
        n_candidates = min(self.max_features, feature_num)
        sampler = np.random if rng is None else rng
        features = sampler.choice(feature_num, n_candidates, replace=False)

        # Not enough rows to split at all.
        if n_rows < self.min_samples_split or n_rows < self.min_samples_leaf:
            return None, self.get_mean(dataSet)

        S = self.get_varience(dataSet)
        bestS = np.inf
        bestfeature = 0
        bestValue = 0
        for feature in features:
            dataSet = dataSet[dataSet[:, feature].argsort()]
            column = dataSet[:, feature]
            for index in range(n_rows - 1):
                # A threshold is only meaningful between two distinct values.
                if column[index] == column[index + 1]:
                    continue
                n_lower = index + 1
                # Both children must respect the leaf-size constraint.
                if (n_lower < self.min_samples_leaf
                        or n_rows - n_lower < self.min_samples_leaf):
                    continue
                newS = (self.get_varience(dataSet[:n_lower, :])
                        + self.get_varience(dataSet[n_lower:, :]))
                if newS < bestS:
                    bestfeature = feature
                    bestValue = column[index]
                    bestS = newS
        # Gain too small (or no admissible split, bestS == inf): make a leaf.
        if (S - bestS) < self.min_change:
            return None, self.get_mean(dataSet)
        return bestfeature, bestValue

    def _grow_node(self, dataSet, max_level, rng):
        """Recursively build the subtree for ``dataSet`` with ``max_level`` splits left."""
        # Depth budget exhausted: the node is a leaf whatever a split would gain.
        if max_level <= 0:
            return self.get_mean(dataSet)
        bestfeature, bestValue = self.select_best_feature(dataSet, rng)
        if bestfeature is None:
            return bestValue
        retTree = {}
        retTree['bestFeature'] = bestfeature
        retTree['bestVal'] = bestValue
        # lSet: rows above the threshold, rSet: rows at or below it.
        lSet, rSet = self.SplitDataSet(dataSet, bestfeature, bestValue)
        retTree['right'] = self._grow_node(rSet, max_level - 1, rng)
        retTree['left'] = self._grow_node(lSet, max_level - 1, rng)
        return retTree

    def createTree(self, dataSet, max_level, flag=0, rng=None):
        """Build one regression tree.

        Args:
            dataSet: 2-D array, target in the last column.
            max_level: maximum number of split levels below this node.
            flag: ``0`` for a top-level call (progress messages are printed
                and ``cur_tree`` is advanced); any other value builds silently.
            rng: optional ``np.random.RandomState`` for feature sampling.

        Returns:
            A nested ``dict`` describing the tree, or a number when the root
            is itself a leaf.
        """
        top_level = flag == 0
        if top_level:
            seqtree = self.cur_tree + 1
            self.cur_tree = seqtree
            print('正在搭建第'+str(seqtree)+'棵树...\n')
        retTree = self._grow_node(dataSet, max_level, rng)
        if top_level:
            print('第'+str(seqtree)+'棵树搭建完成!')
        return retTree

    def __init__(self, random_state, n_estimators, max_features, max_depth, min_change = 0.001,
                 min_samples_split = 0, min_samples_leaf = 0, sample_radio = 0.9, n_jobs = 10):
        """Store the hyper-parameters and seed the global NumPy generator.

        Args:
            random_state: seed for sub-sampling and feature selection.
            n_estimators: number of trees to build.
            max_features: number of features examined per split.
            max_depth: maximum number of split levels per tree.
            min_change: minimum impurity decrease required to split.
            min_samples_split: minimum rows a node needs to be split.
            min_samples_leaf: minimum rows each child must keep.
            sample_radio: fraction of rows sampled for each tree.
            n_jobs: number of joblib workers.
        """
        self.trees = []
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_radio = sample_radio
        self.n_jobs = n_jobs

    def _build_tree(self, dataSet, seed=None):
        """Sub-sample ``dataSet`` and return one tree grown on the sample.

        With ``seed=None`` the sub-sample is drawn with ``self.random_state``
        and features are drawn from the global NumPy generator.  With an
        explicit seed both use that seed, so the result does not depend on
        what other threads are doing.
        """
        split_seed = self.random_state if seed is None else seed
        rng = None if seed is None else np.random.RandomState(seed)
        X_train, _, y_train, _ = train_test_split(
            dataSet[:, :-1], dataSet[:, -1],
            train_size=self.sample_radio, random_state=split_seed)
        sample = np.concatenate((X_train, y_train.reshape((-1, 1))), axis=1)
        return self.createTree(sample, self.max_depth, rng=rng)

    def get_one_tree(self, dataSet, seed=None):
        """Grow one tree on a random sub-sample and append it to the forest.

        Args:
            dataSet: 2-D array, target in the last column.
            seed: optional per-tree seed; see ``_build_tree``.
        """
        self.trees.append(self._build_tree(dataSet, seed))

    def fit(self, X, Y):
        """Train the forest, replacing any previously fitted trees.

        Tree ``i`` is seeded with ``random_state + i`` so that each tree gets
        its own sub-sample and feature draws.  When ``random_state`` is
        ``None`` the trees are left unseeded.
        """
        dataSet = np.concatenate(
            (np.asarray(X), np.asarray(Y).reshape(-1, 1)), axis=-1)
        self.cur_tree = 0
        base = self.random_state
        seeds = [None if base is None else base + i
                 for i in range(self.n_estimators)]
        # Collect the workers' return values instead of appending from the
        # threads, so the tree order matches the seed order.
        self.trees = list(
            Parallel(n_jobs=self.n_jobs, backend="threading")(
                delayed(self._build_tree)(dataSet, seed) for seed in seeds))

    def treeForecast(self, tree, data):
        """Return the prediction of ``tree`` for one feature vector ``data``."""
        node = tree
        while isinstance(node, dict):
            # Values above the threshold follow 'left', the rest 'right'.
            if data[node['bestFeature']] > node['bestVal']:
                node = node['left']
            else:
                node = node['right']
        return float(node)

    def createForeCast(self, tree, dataSet):
        """Predict every row of ``dataSet`` with a single tree.

        Returns:
            An ``np.matrix`` of shape ``(len(dataSet), 1)``.
        """
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('第'+str(seqtree)+'棵树正在预测...\n')
        rows = np.asarray(dataSet)
        values = [self.treeForecast(tree, row) for row in rows]
        predict = np.asmatrix(np.array(values, dtype=float).reshape(-1, 1))
        print('第'+str(seqtree)+'棵树预测完成!')
        return predict

    def unpdate_predict(self, predict, tree, X):
        """Add the prediction of ``tree`` for ``X`` to ``predict`` in place.

        Kept for backward compatibility; ``predict`` no longer calls it.
        NOTE(review): in-place accumulation into one shared buffer from
        several threads is presumably not safe — avoid calling it concurrently.
        """
        predict += self.createForeCast(tree, X)

    def predict(self, X):
        """Return the forest prediction: the mean over all fitted trees.

        Returns:
            An ``np.matrix`` of shape ``(len(X), 1)``.

        Raises:
            ValueError: if the forest holds no trees.
        """
        if not self.trees:
            raise ValueError("predict() called before fit(): the forest has no trees")
        self.cur_tree = 0
        per_tree = Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.createForeCast)(tree, X) for tree in self.trees)
        total = np.asmatrix(np.zeros((len(X), 1)))
        # Sum in the calling thread so no two workers write the same buffer.
        for part in per_tree:
            total += part
        # Average over the trees actually present.
        total /= len(self.trees)
        return total

    def get_score(self, target, X):
        """Return the R^2 score of the forest prediction for ``X`` against ``target``."""
        # Flatten to a plain 1-D array before handing the prediction to r2_score.
        return r2_score(target, np.asarray(self.predict(X)).ravel())

模型预测与评估

预测模型


# Disabled alternative built from a `mycache` class, which is not defined in
# the visible part of this file.
# rf2 = mycache(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=10)
# Build a 10-tree forest: 3 candidate features per split, depth limit 10, and
# nodes with fewer than 20 rows are not split. n_jobs=-1 is passed straight to
# joblib.Parallel inside fit/predict.
rf1 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=-1)
# Train on the complete dataset; the progress lines that follow are the
# recorded console output of this call.
rf1.fit(boston.data, boston.target)
正在搭建第1棵树...

正在搭建第2棵树...
正在搭建第3棵树...
正在搭建第4棵树...

第4棵树搭建完成!
正在搭建第5棵树...

第2棵树搭建完成!第3棵树搭建完成!

正在搭建第6棵树...

正在搭建第7棵树...

第1棵树搭建完成!
正在搭建第8棵树...

第6棵树搭建完成!
正在搭建第9棵树...

第5棵树搭建完成!第7棵树搭建完成!

第8棵树搭建完成!
正在搭建第10棵树...

第9棵树搭建完成!
第10棵树搭建完成!
# R^2 of the forest on the same rows it was fitted on, i.e. a training score
# rather than a held-out estimate. The lines that follow are recorded output.
rf1.get_score(boston.target, boston.data)
第1棵树正在预测...
第2棵树正在预测...
第3棵树正在预测...

第1棵树预测完成!
第4棵树正在预测...
第3棵树预测完成!第5棵树正在预测...
第6棵树正在预测...
第6棵树预测完成!第2棵树预测完成!
第7棵树正在预测...
第7棵树预测完成!
第8棵树正在预测...
第8棵树预测完成!
第5棵树预测完成!
第9棵树正在预测...
第4棵树预测完成!第9棵树预测完成!
第10棵树正在预测...
第10棵树预测完成!

0.9302502640348399

你可能感兴趣的:(机器学习)