[Machine Learning] [Random Forest - 2] A C++ Implementation of the Random Forest Algorithm

1. Basic Principles of Random Forests

For the basic principles of random forests, together with a worked numerical example, see the earlier post: [Machine Learning] [Random Forest - 1] Explanation of the Random Forest Algorithm + a Worked Example Showing the Math Step by Step.

2. C++ Implementation Code

You can implement a random forest yourself; below is a random forest implementation (written in C++) found on GitHub that is worth a look.

2.1 Code

2.1.1 RandomForest.h

/************************************************
*Random Forest Program
*Function:		train & test Random Forest model
*Author:		[email protected]
*CreateTime:	2014.7.10
*Version:		V0.1
*************************************************/
#ifndef RANDOM_FOREST_H
#define RANDOM_FOREST_H

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<cmath>
#include<ctime>
#include"Tree.h"
#include"Sample.h"

class RandomForest
{
public:
	/*************************************************************
	*treeNum:	the number of trees in this forest
	*maxDepth:	the max Depth of one single tree
	*minLeafSample:terminate criterion,the min samples in a leaf              
	*minInfoGain:terminate criterion,the min information
	*            gain in a node if it can be splitted
	**************************************************************/
	RandomForest(int treeNum,int maxDepth,int minLeafSample,float minInfoGain);
	RandomForest(const char*modelPath);
	~RandomForest();
	/*************************************************************
	*trainset:	the trainset,every row is a sample,every column is 
	*a feature,the total size is SampleNum*featureNum
	*labels:the labels or regression values of the trainset,
	*the total size is SampleNum
	*SampleNum:the total number of trainset
	*featureNum:the number of features
	*classNum:the class number, 1 for regression
	*isRegression:if the problem is regression(true) or classification(false)
	*trainFeatureNumPerNode:the feature number used in every node while training
	*************************************************/
	void train(float**trainset,float*labels,int SampleNum,int featureNum,
			   int classNum,bool isRegression,int trainFeatureNumPerNode);
	void train(float**trainset,float*labels,int SampleNum,int featureNum,
			   int classNum,bool isRegression);
	/************************************************
	*sample: a single sample
	*response: the predict result
	*************************************************/
	void predict(float*sample,float&response);
	/************************************************
	*testset: the test set
	*SampleNum:the sample number in the testset
	*responses: the predict results
	*************************************************/
	void predict(float**testset,int SampleNum,float*responses);
	/************************************************
	*path: the path to save the model
	*************************************************/
	void saveModel(const char*path);
	/************************************************
	*path: the path to read the model
	*************************************************/
	void readModel(const char*path);
private:
	int _trainSampleNum;  //the total training sample number
	int _testSampleNum;  //the total testing sample number
	int _featureNum;  //the feature dimension 
	int _trainFeatureNumPerNode;  //the feature number used in a node while training
	int _treeNum;  //the number of trees
	int _maxDepth;  //the max depth which a tree can reach
	int _classNum;  //the number of classes(if regression,set it to 1)
	bool _isRegression;  //if it is a regression problem
	int _minLeafSample;  //terminate condition:the min samples in a node
	float _minInfoGain;  //terminate condition:the min information gain in a node
	Tree**_forest;//to store every tree(classification tree or regression tree)
	Sample*_trainSample;  //hold the whole trainset and some other information
};

#endif//RANDOM_FOREST_H
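
Before looking at the implementation, here is a minimal usage sketch of the interface declared above. It is not part of the repository: it assumes the header's dependencies (Tree.h, Sample.h and their source files) are compiled together with it, and the training data, labels and hyper-parameters below are made-up toy values that only illustrate the call order.

#include"RandomForest.h"
#include<cstdio>

int main()
{
	//toy data: 4 samples, 2 features, 2 classes (all values made up for illustration)
	float row0[2]={1.0f,2.0f},row1[2]={1.5f,1.8f},row2[2]={5.0f,8.0f},row3[2]={6.0f,9.0f};
	float*trainset[4]={row0,row1,row2,row3};
	float labels[4]={0.0f,0.0f,1.0f,1.0f};

	//10 trees, max depth 5, at least 2 samples per leaf, minimum information gain 0.01
	RandomForest rf(10,5,2,0.01f);
	//classification: 4 samples, 2 features, 2 classes, isRegression=false
	rf.train(trainset,labels,4,2,2,false);

	//predict one new sample and persist the trained model
	float sample[2]={5.5f,8.5f};
	float response=0;
	rf.predict(sample,response);
	printf("predicted class:%f\n",response);
	rf.saveModel("rf.model");
	return 0;
}

A saved model can later be restored with the RandomForest(const char*modelPath) constructor (or readModel) and used for prediction without retraining.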

2.1.2 RandomForest.cpp

#include"RandomForest.h"

RandomForest::RandomForest(int treeNum,int maxDepth,int minLeafSample,float minInfoGain)
{
	_treeNum=treeNum;
	_maxDepth=maxDepth;
	_minLeafSample=minLeafSample;
	_minInfoGain=minInfoGain;
	_trainSample=NULL;
	printf("total tree number:%d\n",_treeNum);
	printf("max depth of a single tree:%d\n",_maxDepth);
	printf("the minimum samples in a leaf:%d\n",_minLeafSample);
	printf("the minimum information gain:%f\n",_minInfoGain);
    
	_forest=new Tree*[_treeNum];
	for(int i=0;i<_treeNum;++i)
	{_forest[i]=NULL;}
}

RandomForest::RandomForest(const char*modelPath)
{
	readModel(modelPath);
}

RandomForest::~RandomForest()
{
	//printf("destroy RandomForest...\n");
	if(_forest!=NULL)
	{
		for(int i=0;i<_treeNum;++i)
		{
			if(_forest[i]!=NULL)
			{
				delete _forest[i];
				_forest[i]=NULL;
			}
		}
		delete[] _forest;
		_forest=NULL;
	}
	if(_trainSample!=NULL)
	{
		delete _trainSample;
		_trainSample=NULL;
	}
}

void RandomForest::train(float**trainset,float*labels,int SampleNum,int featureNum,
			   int classNum,bool isRegression)
{
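	//the default number of candidate features per node is sqrt(featureNum), the usual random-forest heuristic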
	int trainFeatureNumPerNode=static_cast<int>(sqrt(static_cast<float>(featureNum)));
	train(trainset,labels,SampleNum,featureNum,classNum,isRegression,trainFeatureNumPerNode);
}

void RandomForest::train(float**trainset,float*labels,int SampleNum,int featureNum,
			   int classNum,bool isRegression,int trainFeatureNumPerNode)
{
	if(_treeNum<1)
	{
		printf("total tree number must bigger than 0!\n");
		printf("training failed\n");
		return;
	}
	if(_maxDepth<1)
	{
		printf("the max depth must bigger than 0!\n");
		printf("training failed\n");
		return;
	}
	if(_minLeafSample<2)
	{
		printf("the minimum samples in a leaf must bigger than 1!\n");
		printf("training failed\n");
		return;
	}
	_trainSampleNum=SampleNum;
	_featureNum=featureNum;
	_classNum=classNum;
	_trainFeatureNumPerNode=trainFeatureNumPerNode;
	_isRegression=isRegression;
	//initialize every tree
	if(_isRegression)
	{
		_classNum=1;
		for(int i=0;i<_treeNum;++i)
		{
			_forest[i]=new RegrTree(_maxDepth,_trainFeatureNumPerNode,
				_minLeafSample,_minInfoGain,_isRegression);
		}
	}
	else
	{
		for(int i=0;i<_treeNum;++i)
		{
			_forest[i]=new ClasTree(_maxDepth,_trainFeatureNumPerNode,
				_minLeafSample,_minInfoGain,_isRegression);
		}
	}
	//this object holds the whole trainset & labels
	_trainSample=new Sample(trainset,labels,_classNum,_trainSampleNum,_featureNum);
	srand(static_cast<unsigned int>(time(NULL)));
	int*_sampleIndex=new int[_trainSampleNum];
	//start to train every tree in the forest
	for(int i=0;i<_treeNum;++i)
	{
		printf("train the %d th tree...\n",i);
		//random sampling from trainset
		Sample*sample=new Sample(_trainSample);
		sample->randomSelectSample(_sampleIndex,_trainSampleNum,_trainSampleNum);
		_forest[i]->train(sample);
        delete sample;
	}
	delete[] _sampleIndex;
	_sampleIndex=NULL;
}

void RandomForest::predict(float*data,float&response)
{
	//get the prediction from every tree
	//if regression,_classNum=1
	float*result=new float[_classNum];
	int i=0;
	for(i=0;i<_classNum;++i)
	{result[i]=0;}
	for(i=0;i<_treeNum;++i)//_treeNum
	{
		Result r;
		r.label=0;
		r.prob=0;//Result 
		r=_forest[i]->predict(data);
		result[static_cast<int>(r.label)]+=r.prob;
	}
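	//aggregate: for regression average the accumulated value over all trees, for classification take the class with the largest accumulated probability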
	if(_isRegression)
	{response=result[0]/_treeNum;}
	else
	{
		float maxProbLabel=0;
		float maxProb=result[0];
		for(i=1;i<_classNum;++i)
		{
			if(result[i]>maxProb)
			{
				maxProbLabel=i;
				maxProb=result[i];
			}
		}
		response=maxProbLabel;
	}
	delete[] result;
}

void RandomForest::predict(float**testset,int SampleNum,float*responses)
{
	//get the prediction for every sample in the testset
	for(int i=0;i<SampleNum;++i)
	{
		predict(testset[i],responses[i]);
	}
}

void RandomForest::saveModel(const char*path)
{
	FILE* saveFile=fopen(path,"wb");
	fwrite(&_treeNum,sizeof(int),1,saveFile);
	fwrite(&_maxDepth,sizeof(int),1,saveFile);
	fwrite(&_classNum,sizeof(int),1,saveFile);
	fwrite(&_isRegression,sizeof(bool),1,saveFile);
	int nodeNum=static_cast<int>(pow(2.0,_maxDepth)-1);
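	//each stored node is written as an isLeaf flag followed by (featureIndex,threshold) for a split node, or by the leaf payload (value, or class and probability)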
	int isLeaf=0;
	for(int i=0;i<_treeNum;++i)
	{
		Node**arr=_forest[i]->getTreeArray();
		isLeaf=0;
		for(int j=0;j<nodeNum;++j)
		{
			if(arr[j]!=NULL)
			{
				if(arr[j]->isLeaf())
				{
					isLeaf=1;
					fwrite(&isLeaf,sizeof(int),1,saveFile);
					if(_isRegression)
					{
						float value=((RegrNode*)arr[j])->getValue();
						fwrite(&value,sizeof(float),1,saveFile);
					}
					else
					{
						float clas=((ClasNode*)arr[j])->getClass();
						float prob=((ClasNode*)arr[j])->getProb();
						fwrite(&clas,sizeof(float),1,saveFile);
						fwrite(&prob,sizeof(float),1,saveFile);
					}
				}
				else
				{
					isLeaf=0;
					fwrite(&isLeaf,sizeof(int),1,saveFile);
					int featureIndex=arr[j]->getFeatureIndex();
					float threshold=arr[j]->getThreshold();
					fwrite(&featureIndex,sizeof(int),1,saveFile);
					fwrite(&threshold,sizeof(float),1,saveFile);
				}
			}
		}
		//write a dummy node to denote the tree end
		//isLeaf=-1;
		//fwrite(&isLeaf,sizeof(int),1,saveFile);
	}
	fclose(saveFile);
}

void RandomForest::readModel(const char*path)
{
	_minLeafSample=0;
	_minInfoGain=0;
	_trainFeatureNumPerNode=0;
	FILE* modelFile=fopen(path,"rb");
	fread(&_treeNum,sizeof(int),1,modelFile);
	fread(&_maxDepth,sizeof(int),1,modelFile);
	fread(&_classNum,sizeof(int),1,modelFile);
	fread(&_isRegression,sizeof(bool),1,modelFile);
	int nodeNum=static_cast<int>(pow(2.0,_maxDepth)-1);
	_trainSample=NULL;
	printf("total tree number:%d\n",_treeNum);
	printf("max depth of a single tree:%d\n",_maxDepth);
	printf("_classNum:%d\n",_classNum);
	printf("_isRegression:%d\n",_isRegression);
	_forest=new Tree*[_treeNum];
	//initialize every tree
	if(_isRegression)
	{
		for(int i=0;i<_treeNum;++i)
		{
			_forest[i]=new RegrTree(_maxDepth,_trainFeatureNumPerNode,
			_minLeafSample,_minInfoGain,_isRegression);
		}
	}
	else
	{
		for(int i=0;i<_treeNum;++i)
		{
			_forest[i]=new ClasTree(_maxDepth,_trainFeatureNumPerNode,
				_minLeafSample,_minInfoGain,_isRegression);
		}
	}
	int*nodeTable=new int[nodeNum];
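	//nodeTable marks which slots of the complete-binary-tree array hold a node that was stored in the model file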
	int isLeaf=-1;
	int featureIndex=0;
	float threshold=0;
	float value=0;
	float clas=0;
	float prob=0;
	for(int i=0;i<_treeNum;++i)
	{
		memset(nodeTable,0,sizeof(int)*nodeNum);
		nodeTable[0]=1;
		for(int j=0;j<nodeNum;++j)
		{
			if(nodeTable[j]==0)
			{continue;}
			fread(&isLeaf,sizeof(int),1,modelFile);
			if(isLeaf==0)  //split node
			{
				fread(&featureIndex,sizeof(int),1,modelFile);
				fread(&threshold,sizeof(float),1,modelFile);
				//a split node's two children are also stored in the model file
				nodeTable[2*j+1]=1;
				nodeTable[2*j+2]=1;
				_forest[i]->createNode(j,featureIndex,threshold);
			}
			else if(isLeaf==1)  //leaf
			{
				if(_isRegression)
				{
					fread(&value,sizeof(float),1,modelFile);
					((RegrTree*)_forest[i])->createLeaf(j,value);
				}
				else
				{
					fread(&clas,sizeof(float),1,modelFile);
					fread(&prob,sizeof(float),1,modelFile);
					((ClasTree*)_forest[i])->createLeaf(j,clas,prob);
				}
			}
		}
		//fread(&isLeaf,sizeof(int),1,modelFile);
	}
	fclose(modelFile);
	delete[] nodeTable;
}
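
The essence of what train() and predict() above implement is bagging plus vote aggregation. The following standalone sketch is not code from the repository; it is only an illustration, with made-up numbers, of the two ingredients in isolation: drawing a bootstrap sample of indices for one tree, and combining the per-tree (label, probability) votes into the forest's final answer, mirroring the accumulation loop in predict().

#include<cstdio>
#include<cstdlib>
#include<ctime>

//draw sampleNum indices with replacement: the bootstrap sample one tree is trained on
void bootstrapIndices(int*indices,int sampleNum)
{
	for(int i=0;i<sampleNum;++i)
	{indices[i]=rand()%sampleNum;}
}

//accumulate one (label,weight) vote per tree and return the class with the highest score
int majorityVote(const int*labels,const float*weights,int treeNum,int classNum)
{
	float*score=new float[classNum];
	for(int c=0;c<classNum;++c){score[c]=0;}
	for(int t=0;t<treeNum;++t){score[labels[t]]+=weights[t];}
	int best=0;
	for(int c=1;c<classNum;++c){if(score[c]>score[best]){best=c;}}
	delete[] score;
	return best;
}

int main()
{
	srand(static_cast<unsigned int>(time(NULL)));
	int indices[8];
	bootstrapIndices(indices,8);      //duplicates are expected, e.g. {3,3,0,7,2,5,5,1}
	int votes[3]={1,1,0};             //made-up class labels predicted by 3 trees
	float probs[3]={0.9f,0.6f,0.8f};  //made-up leaf probabilities attached to those votes
	printf("forest prediction:%d\n",majorityVote(votes,probs,3,2));
	return 0;
}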

The full project code is available on GitHub:

https://github.com/handspeaker/RandomForests

(end)
