

#include "ml.h"
#include "cv.h"    
#include "highgui.h"    
#include <opencv2/opencv.hpp>
#include <stdio.h>

// The sample demonstrates how to train a Random Trees classifier
// (or a Boosting classifier, or an MLP - see main()) using the provided dataset.
//
// We use the sample database
// from the UCI Repository; the citation is:
//
// Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998).
// UCI Repository of machine learning databases.
// Irvine, CA: University of California, Department of Information and Computer Science.
//
// The dataset consists of 20000 feature vectors along with the
// responses - capital Latin letters A..Z.
// The first 16000 (10000 for boosting) samples are used for training
// and the remaining 4000 (10000 for boosting) are used to test the classifier.

// This function reads data and responses from the file <filename>
static int
read_num_class_data( const char* filename, int var_count,
					CvMat** data, CvMat** responses )
	const int M = 1024;
	FILE* f = fopen( filename, "rt" );
	CvMemStorage* storage;     //分配的存储空间
	CvSeq* seq;        //cv的序列,相当于双向链表的数据结构
	char buf[M+2];
	float* el_ptr;    
	CvSeqReader reader;
	int i, j;

	if( !f )
		return 0;

	el_ptr = new float[var_count+1];     
	storage = cvCreateMemStorage();

	seq = cvCreateSeq( 0, sizeof(*seq), (var_count+1)*sizeof(float), storage );  

		char* ptr;
		if( !fgets( buf, M, f ) || !strchr( buf, ',' ) )
		el_ptr[0] = buf[0];			//第一个值赋值,比较特殊,是和数据相关的。
		ptr = buf+2;
		for( i = 1; i <= var_count; i++ )
			int n = 0;
			sscanf( ptr, "%f%n", el_ptr + i, &n );
			ptr += n + 1;
		if( i <= var_count )
		cvSeqPush( seq, el_ptr );

	*data = cvCreateMat( seq->total, var_count, CV_32F ); 
	*responses = cvCreateMat( seq->total, 1, CV_32F );

	cvStartReadSeq( seq, &reader );

	for( i = 0; i < seq->total; i++ )
		const float* sdata = (float*)reader.ptr + 1;
		float* ddata = data[0]->data.fl + var_count*i;
		float* dr = responses[0]->data.fl + i;

		for( j = 0; j < var_count; j++ )
			ddata[j] = sdata[j];
		*dr = sdata[-1];
		CV_NEXT_SEQ_ELEM( seq->elem_size, reader );

	cvReleaseMemStorage( &storage );
	delete el_ptr;
	return 1;

int build_boost_classifier( char* data_filename,
						   char* filename_to_save, char* filename_to_load )
	const int class_count = 26;   //分为26个类别
	CvMat* data = 0;
	CvMat* responses = 0;

	//--var_type  定义responses的类型
	CvMat* var_type = 0;   
	CvMat* temp_sample = 0;   //临时的样本
	CvMat* weak_responses = 0;  //弱分类器的输出

	int ok = read_num_class_data( data_filename, 16, &data, &responses );   //16个特征
	int nsamples_all = 0, ntrain_samples = 0;      //样本总数(/训练样本)的个数
	int var_count;   //特征的个数
	double train_hr = 0, test_hr = 0;  
	CvBoost boost;

	if( !ok )
		printf( "Could not read the database %s\n", data_filename );
		return -1;

	printf( "The database %s is loaded.\n", data_filename );
	nsamples_all = data->rows;
	ntrain_samples = (int)(nsamples_all*0.5);		//样本总数的一半用来训练?
	var_count = data->cols;    //特征的个数

	// 生成或者加载已生成的boost分类器
	if( filename_to_load )
		// load classifier from the specified file
		boost.load( filename_to_load );
		ntrain_samples = 0;
		if( !boost.get_weak_predictors() )
			printf( "Could not read the classifier %s\n", filename_to_load );
			return -1;
		printf( "The classifier %s is loaded.\n", data_filename );
		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
		// As currently boosted tree classifier in MLL can only be trained
		// for 2-class problems, we transform the training database by
		// "unrolling" each training sample as many times as the number of
		// classes (26) that we have.
		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

		/*它是把一个multiple-class classifcation的问题,巧妙的变成了binary classification的问题。
		   每一个原来的sample被重复26次,构建出26个sample data; 其中,对于每一个sample data:
		a. 在feature vector中新增了一项feature,这个feature就是 0 - 26 +‘A’ 的 ASCII值;
		    因为字母种类是categorical的value,所以var_type的最后一列是 CV_VAR_CATEGORICAL;
		b. response vector 变成了true/false, true or false取决于当前sample data的字母值
		     是不是对应于raw sample data的response.*/

		CvMat* new_data = cvCreateMat( ntrain_samples*class_count, var_count + 1, CV_32F );
		CvMat* new_responses = cvCreateMat( ntrain_samples*class_count, 1, CV_32S );

		// 1. unroll the database type mask
		printf( "Unrolling the database...\n");
		for( int  i = 0; i < ntrain_samples; i++ )
			float* data_row = (float*)(data->data.ptr + data->step*i);		//每次指向原始数据的一行
			for( int j = 0; j < class_count; j++ )   //小于类型的个数,也就是复制的次数
				float* new_data_row = (float*)(new_data->data.ptr +
					new_data->step*(i*class_count+j));     //找到新分配的数据的行
				for( int k = 0; k < var_count; k++ )     //将每行的特征复制过来
					new_data_row[k] = data_row[k];
				new_data_row[var_count] = (float)j;   //最后一个放了复制的序号
				new_responses->data.i[i*class_count + j] = responses->data.fl[i] == j+'A';  //新的response取决于是否与原来的值相同

		// 2. create type mask
		var_type = cvCreateMat( var_count + 2, 1, CV_8U );  //为啥加2,下面有分析
		cvSet( var_type, cvScalarAll(CV_VAR_ORDERED) );      //这些都是数值型的
		// the last indicator variable, as well
		// as the new (binary) response are categorical
		cvSetReal1D( var_type, var_count, CV_VAR_CATEGORICAL );   //这是新加的一位,如前面所说,也应该是离散类型标签
		cvSetReal1D( var_type, var_count+1, CV_VAR_CATEGORICAL );   //最后一位作为返回的类型

		// 3. train classifier
		printf( "Training the classifier (may take a few minutes)...\n");
		boost.train( new_data, CV_ROW_SAMPLE, new_responses, 0, 0, var_type, 0,
			CvBoostParams(CvBoost::REAL, 100, 0.95, 5, false, 0 ));    
		//CvBoostParams的参数含义分别是:(1)使用REAL adaboost,
		//(3)样本总权值小于1.0-0.95 = 0.05的点将不参加下一次的迭代
		cvReleaseMat( &new_data );
		cvReleaseMat( &new_responses );

	temp_sample = cvCreateMat( 1, var_count + 1, CV_32F );
	weak_responses = cvCreateMat( 1, boost.get_weak_predictors()->total, CV_32F ); 

	// compute prediction error on train and test data
	for( i = 0; i < nsamples_all; i++ )
		int best_class = 0;
		double max_sum = -DBL_MAX;
		double r;
		CvMat sample;
		cvGetRow( data, &sample, i );
		for( k = 0; k < var_count; k++ )
			temp_sample->data.fl[k] =[k];

		for( j = 0; j < class_count; j++ )
			temp_sample->data.fl[var_count] = (float)j;
			boost.predict( temp_sample, 0, weak_responses );
			double sum = cvSum( weak_responses ).val[0];
			if( max_sum < sum )
				max_sum = sum;
				best_class = j + 'A';

		r = fabs(best_class - responses->data.fl[i]) < FLT_EPSILON ? 1 : 0;

		if( i < ntrain_samples )
			train_hr += r;
			test_hr += r;

	test_hr /= (double)(nsamples_all-ntrain_samples);
	train_hr /= (double)ntrain_samples;
	printf( "Recognition rate: train = %.1f%%, test = %.1f%%\n",
		train_hr*100., test_hr*100. );

	printf( "Number of trees: %d\n", boost.get_weak_predictors()->total );

	// Save classifier to file if needed
	if( filename_to_save ) filename_to_save );

	cvReleaseMat( &temp_sample );
	cvReleaseMat( &weak_responses );
	cvReleaseMat( &var_type );
	cvReleaseMat( &data );
	cvReleaseMat( &responses );

	return 0;
