For the basic principles of random forests, along with a worked numerical example, see the earlier post: 【机器学习】【随机森林-1】Random Forest算法讲解 + 示例展示数学求解过程
A random forest can also be implemented from scratch. Below is an implementation of the random forest algorithm found on GitHub; it is worth reading through.
/************************************************
*Random Forest Program
*Function: train & test Random Forest Model
*Author: [email protected]
*CreateTime: 2014.7.10
*Version: V0.1
*************************************************/
#ifndef RANDOM_FOREST_H
#define RANDOM_FOREST_H
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <string.h>
#include"Tree.h"
#include"Sample.h"
class RandomForest
{
public:
/*************************************************************
*treeNum: the number of trees in this forest
*maxDepth: the max Depth of one single tree
*minLeafSample:terminate criterion,the min samples in a leaf
*minInfoGain:terminate criterion,the min information
* gain required to split a node
**************************************************************/
RandomForest(int treeNum,int maxDepth,int minLeafSample,float minInfoGain);
RandomForest(const char*modelPath);
~RandomForest();
/*************************************************************
*trainset: the trainset,every row is a sample,every column is
*a feature,the total size is SampleNum*featureNum
*labels:the labels or regression values of the trainset,
*the total size is SampleNum
*SampleNum:the total number of samples in the trainset
*featureNum:the number of features
*classNum:the class number, 1 for regression
*isRegression:if the problem is regression(true) or classification(false)
*trainFeatureNumPerNode:the feature number used in every node while training
*************************************************/
void train(float**trainset,float*labels,int SampleNum,int featureNum,
int classNum,bool isRegression,int trainFeatureNumPerNode);
void train(float**trainset,float*labels,int SampleNum,int featureNum,
int classNum,bool isRegression);
/************************************************
*sample: a single sample
*response: the predict result
*************************************************/
void predict(float*sample,float&response);
/************************************************
*testset: the test set
*SampleNum:the sample number in the testset
*responses: the predict results
*************************************************/
void predict(float**testset,int SampleNum,float*responses);
/************************************************
*path: the path to save the model
*************************************************/
void saveModel(const char*path);
/************************************************
*path: the path to read the model
*************************************************/
void readModel(const char*path);
private:
int _trainSampleNum; //the total training sample number
int _testSampleNum; //the total testing sample number
int _featureNum; //the feature dimension
int _trainFeatureNumPerNode; //the feature number used in a node while training
int _treeNum; //the number of trees
int _maxDepth; //the max depth which a tree can reach
int _classNum; //the number of classes(if regression,set it to 1)
bool _isRegression; //if it is a regression problem
int _minLeafSample; //terminate condition:the min samples in a node
float _minInfoGain; //terminate condition:the min information gain in a node
Tree**_forest;//to store every tree(classification tree or regression tree)
Sample*_trainSample; //hold the whole trainset and some other information
};
#endif//RANDOM_FOREST_H
#include"RandomForest.h"
RandomForest::RandomForest(int treeNum,int maxDepth,int minLeafSample,float minInfoGain)
{
_treeNum=treeNum;
_maxDepth=maxDepth;
_minLeafSample=minLeafSample;
_minInfoGain=minInfoGain;
_trainSample=NULL;
printf("total tree number:%d\n",_treeNum);
printf("max depth of a single tree:%d\n",_maxDepth);
printf("the minimum samples in a leaf:%d\n",_minLeafSample);
printf("the minimum information gain:%f\n",_minInfoGain);
_forest=new Tree*[_treeNum];
for(int i=0;i<_treeNum;++i)
{_forest[i]=NULL;}
}
RandomForest::RandomForest(const char*modelPath)
{
readModel(modelPath);
}
RandomForest::~RandomForest()
{
//printf("destroy RandomForest...\n");
if(_forest!=NULL)
{
for(int i=0;i<_treeNum;++i)
{
if(_forest[i]!=NULL)
{
delete _forest[i];
_forest[i]=NULL;
}
}
delete[] _forest;
_forest=NULL;
}
if(_trainSample!=NULL)
{
delete _trainSample;
_trainSample=NULL;
}
}
void RandomForest::train(float**trainset,float*labels,int SampleNum,int featureNum,
int classNum,bool isRegression)
{
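//when the caller does not specify it, default to the common heuristic of sqrt(featureNum) features per node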
int trainFeatureNumPerNode=static_cast<int>(sqrt(static_cast<float>(featureNum)));
train(trainset,labels,SampleNum,featureNum,classNum,isRegression,trainFeatureNumPerNode);
}
void RandomForest::train(float**trainset,float*labels,int SampleNum,int featureNum,
int classNum,bool isRegression,int trainFeatureNumPerNode)
{
if(_treeNum<1)
{
printf("total tree number must bigger than 0!\n");
printf("training failed\n");
return;
}
if(_maxDepth<1)
{
printf("the max depth must bigger than 0!\n");
printf("training failed\n");
return;
}
if(_minLeafSample<2)
{
printf("the minimum samples in a leaf must bigger than 1!\n");
printf("training failed\n");
return;
}
_trainSampleNum=SampleNum;
_featureNum=featureNum;
_classNum=classNum;
_trainFeatureNumPerNode=trainFeatureNumPerNode;
_isRegression=isRegression;
//initialize every tree
if(_isRegression)
{
_classNum=1;
for(int i=0;i<_treeNum;++i)
{
_forest[i]=new RegrTree(_maxDepth,_trainFeatureNumPerNode,
_minLeafSample,_minInfoGain,_isRegression);
}
}
else
{
for(int i=0;i<_treeNum;++i)
{
_forest[i]=new ClasTree(_maxDepth,_trainFeatureNumPerNode,
_minLeafSample,_minInfoGain,_isRegression);
}
}
//this object hold the whole trainset&labels
_trainSample=new Sample(trainset,labels,_classNum,_trainSampleNum,_featureNum);
srand(static_cast<unsigned int>(time(NULL)));
int*_sampleIndex=new int[_trainSampleNum];
//start to train every tree in the forest
for(int i=0;i<_treeNum;++i)
{
printf("train the %d th tree...\n",i);
//random sampling from trainset
Sample*sample=new Sample(_trainSample);
sample->randomSelectSample(_sampleIndex,_trainSampleNum,_trainSampleNum);
_forest[i]->train(sample);
delete sample;
}
delete[] _sampleIndex;
_sampleIndex=NULL;
}
void RandomForest::predict(float*data,float&response)
{
//get the predict from every tree
//if regression,_classNum=1
float*result=new float[_classNum];
int i=0;
for(i=0;i<_classNum;++i)
{result[i]=0;}
for(i=0;i<_treeNum;++i)//_treeNum
{
Result r;
r.label=0;
r.prob=0;//Result
r=_forest[i]->predict(data);
result[static_cast<int>(r.label)]+=r.prob;
}
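//aggregate the per-tree results: average the value for regression, pick the class with the largest accumulated probability for classification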
if(_isRegression)
{response=result[0]/_treeNum;}
else
{
float maxProbLabel=0;
float maxProb=result[0];
for(i=1;i<_classNum;++i)
{
if(result[i]>maxProb)
{
maxProbLabel=i;
maxProb=result[i];
}
}
response=maxProbLabel;
}
delete[] result;
}
void RandomForest::predict(float**testset,int SampleNum,float*responses)
{
	//get the predict from every tree: run the single-sample predict on each row
	for(int i=0;i<SampleNum;++i)
	{predict(testset[i],responses[i]);}
}
void RandomForest::saveModel(const char*path)
{
	FILE* saveFile=fopen(path,"wb");
	//write the forest meta data first (mirrored by readModel)
	fwrite(&_treeNum,sizeof(int),1,saveFile);
	fwrite(&_maxDepth,sizeof(int),1,saveFile);
	fwrite(&_classNum,sizeof(int),1,saveFile);
	fwrite(&_isRegression,sizeof(bool),1,saveFile);
	int nodeNum=static_cast<int>(pow(2.0,_maxDepth)-1);
	int isLeaf=0;
	for(int i=0;i<_treeNum;++i)
	{
		Node**arr=_forest[i]->getTreeArray();
		isLeaf=0;
		for(int j=0;j<nodeNum;++j)
		{
			if(arr[j]!=NULL)
			{
				if(arr[j]->isLeaf())
{
isLeaf=1;
fwrite(&isLeaf,sizeof(int),1,saveFile);
if(_isRegression)
{
float value=((RegrNode*)arr[j])->getValue();
fwrite(&value,sizeof(float),1,saveFile);
}
else
{
float clas=((ClasNode*)arr[j])->getClass();
float prob=((ClasNode*)arr[j])->getProb();
fwrite(&clas,sizeof(float),1,saveFile);
fwrite(&prob,sizeof(float),1,saveFile);
}
}
else
{
isLeaf=0;
fwrite(&isLeaf,sizeof(int),1,saveFile);
int featureIndex=arr[j]->getFeatureIndex();
float threshold=arr[j]->getThreshold();
fwrite(&featureIndex,sizeof(int),1,saveFile);
fwrite(&threshold,sizeof(float),1,saveFile);
}
}
}
//write a dummy node to denote the tree end
//isLeaf=-1;
//fwrite(&isLeaf,sizeof(int),1,saveFile);
}
fclose(saveFile);
}
void RandomForest::readModel(const char*path)
{
_minLeafSample=0;
_minInfoGain=0;
_trainFeatureNumPerNode=0;
FILE* modelFile=fopen(path,"rb");
fread(&_treeNum,sizeof(int),1,modelFile);
fread(&_maxDepth,sizeof(int),1,modelFile);
fread(&_classNum,sizeof(int),1,modelFile);
fread(&_isRegression,sizeof(bool),1,modelFile);
int nodeNum=static_cast<int>(pow(2.0,_maxDepth)-1);
_trainSample=NULL;
printf("total tree number:%d\n",_treeNum);
printf("max depth of a single tree:%d\n",_maxDepth);
printf("_classNum:%d\n",_classNum);
printf("_isRegression:%d\n",_isRegression);
_forest=new Tree*[_treeNum];
//initialize every tree
if(_isRegression)
{
for(int i=0;i<_treeNum;++i)
{
_forest[i]=new RegrTree(_maxDepth,_trainFeatureNumPerNode,
_minLeafSample,_minInfoGain,_isRegression);
}
}
else
{
for(int i=0;i<_treeNum;++i)
{
_forest[i]=new ClasTree(_maxDepth,_trainFeatureNumPerNode,
_minLeafSample,_minInfoGain,_isRegression);
}
}
int*nodeTable=new int[nodeNum];
int isLeaf=-1;
int featureIndex=0;
float threshold=0;
float value=0;
float clas=0;
float prob=0;
for(int i=0;i<_treeNum;++i)
{
memset(nodeTable,0,sizeof(int)*nodeNum);
nodeTable[0]=1;
for(int j=0;j<nodeNum;++j)
{
	//slots whose parent was a leaf hold no node and were never written
	if(nodeTable[j]==0)
	{continue;}
	fread(&isLeaf,sizeof(int),1,modelFile);
	if(isLeaf==0) //split node
	{
		fread(&featureIndex,sizeof(int),1,modelFile);
		fread(&threshold,sizeof(float),1,modelFile);
		//a split node has two children, at 2*j+1 and 2*j+2 in the array layout
		nodeTable[2*j+1]=1;
		nodeTable[2*j+2]=1;
		_forest[i]->createNode(j,featureIndex,threshold);
	}
}
else if(isLeaf==1) //leaf
{
if(_isRegression)
{
fread(&value,sizeof(float),1,modelFile);
((RegrTree*)_forest[i])->createLeaf(j,value);
}
else
{
fread(&clas,sizeof(float),1,modelFile);
fread(&prob,sizeof(float),1,modelFile);
((ClasTree*)_forest[i])->createLeaf(j,clas,prob);
}
}
}
//fread(&isLeaf,sizeof(int),1,modelFile);
}
fclose(modelFile);
delete[] nodeTable;
}
The complete project code is available on GitHub:
https://github.com/handspeaker/RandomForests
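For reference, here is a minimal usage sketch of the class above. The toy data, the forest parameters, and the model file name are all illustrative only, and it assumes the rest of the repository (Tree.h, Sample.h and their implementations) is compiled alongside it.
#include <stdio.h>
#include "RandomForest.h"
int main()
{
	//toy training data: 6 samples, 2 features, 2 classes (values are illustrative)
	const int sampleNum=6,featureNum=2,classNum=2;
	float raw[sampleNum][featureNum]={{1,1},{1,2},{2,1},{7,8},{8,7},{8,8}};
	float labels[sampleNum]={0,0,0,1,1,1};
	float*trainset[sampleNum];
	for(int i=0;i<sampleNum;++i)
	{trainset[i]=raw[i];}
	//5 trees, max depth 4, at least 2 samples per leaf, min info gain 0.01
	RandomForest rf(5,4,2,0.01f);
	rf.train(trainset,labels,sampleNum,featureNum,classNum,false);
	//predict a single sample and save the model (file name is illustrative)
	float sample[featureNum]={7.5f,7.5f};
	float response=0;
	rf.predict(sample,response);
	printf("predicted class:%f\n",response);
	rf.saveModel("rf.model");
	//reload the model through the model-path constructor and predict again
	RandomForest rf2("rf.model");
	rf2.predict(sample,response);
	printf("predicted class after reload:%f\n",response);
	return 0;
}
Note that predict() returns its result as a float through the response argument, so for classification the value is the class index and should be cast back to an integer label.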
(end)