根据老师的要求,在[-1,1]上按均匀分布随机生成20个样本 x,目标函数为 f(x)=sign(x),每个样本的输出 y 以20%的概率被翻转(即带有20%的噪声)。把20个样本排序后,它们将数轴分为21段,其中位于相邻样本之间的有19段;θ 取这19段的中点之一,s 取 1 或 -1,假设函数为 h(x)=s·sign(x-θ)。遍历所有的 θ 和 s,E-in 最小的 (θ,s) 组合所对应的假设就是最终的 g。
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
using namespace std;

#define SAMPLE_SIZE 20  // number of training samples drawn per experiment

// A one-dimensional decision stump h(x) = coef * sign(x - threshold).
struct Hypothesis {
    int coef;          // direction s, either +1 or -1
    double threshold;  // cut point theta
};

// Sign of x. Zero is mapped to -1, so the result is never 0.
int sign(double x)
{
    return x > 0 ? 1 : -1;
}

// Returns num with its sign flipped.
int flipSign(int num)
{
    return -num;
}

// In-sample error rate of hypo: the fraction of samples whose label differs
// from hypo.coef * sign(x - hypo.threshold).
// Only the first min(inputVec.size(), outputVec.size()) pairs are examined;
// an empty sample yields 0.0 instead of dividing by zero.
double calErrInSample(const vector<double>& inputVec,
                      const vector<int>& outputVec,
                      const Hypothesis& hypo)
{
    const size_t count = min(inputVec.size(), outputVec.size());
    if (count == 0) {
        return 0.0;
    }

    int errCount = 0;
    for (size_t i = 0; i < count; ++i) {
        if (outputVec[i] != hypo.coef * sign(inputVec[i] - hypo.threshold)) {
            ++errCount;
        }
    }
    return static_cast<double>(errCount) / static_cast<double>(count);
}

// Out-of-sample error computed in closed form:
//   0.5 + 0.3 * s * (|theta| - 1).
// fabs from <cmath> is used on purpose: without <cmath> a plain abs() call may
// resolve to the integer overload and truncate theta to 0.
double calErrOutSample(const Hypothesis& hypo)
{
    return 0.5 + 0.3 * static_cast<double>(hypo.coef) * (fabs(hypo.threshold) - 1.0);
}

// Random number in [-1, 1], derived from rand().
double getRand()
{
    return 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
}

// Appends SAMPLE_SIZE random inputs to inputVec and sorts the whole vector in
// ascending order, so that neighbouring elements delimit the candidate intervals.
void getTrainingData(vector<double>& inputVec)
{
    inputVec.reserve(inputVec.size() + SAMPLE_SIZE);
    for (int i = 0; i < SAMPLE_SIZE; ++i) {
        inputVec.push_back(getRand());
    }
    sort(inputVec.begin(), inputVec.end());
}

// Appends one label per input to outputVec: sign(x), flipped whenever a fresh
// random number in [0, 1] is <= 0.2 (the 20% noise).
void calOutput(const vector<double>& inputVec, vector<int>& outputVec)
{
    outputVec.reserve(outputVec.size() + inputVec.size());
    for (size_t i = 0; i < inputVec.size(); ++i) {
        // Draw the noise first so the order of rand() calls stays the same.
        const double randNum = static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
        int output = sign(inputVec[i]);
        if (randNum <= 0.2) {
            output = flipSign(output);
        }
        outputVec.push_back(output);
    }
}

// Tries every midpoint between neighbouring (sorted) inputs as threshold, using
// the direction hypo.coef, and returns the smallest in-sample error found.
// bestThres receives the first threshold that achieves this minimum; it is left
// untouched (and 1.0 is returned) when no candidate has an error below 1.0,
// e.g. when there are fewer than two samples.
double getMinErrIn(const vector<double>& inputVec,
                   const vector<int>& outputVec,
                   const Hypothesis& hypo,
                   double& bestThres)
{
    double minErrIn = 1.0;
    Hypothesis candidate = hypo;  // work on a copy; the caller sets the final threshold
    for (size_t i = 0; i + 1 < inputVec.size(); ++i) {
        candidate.threshold = (inputVec[i] + inputVec[i + 1]) / 2.0;
        const double curErrIn = calErrInSample(inputVec, outputVec, candidate);
        if (curErrIn < minErrIn) {
            minErrIn = curErrIn;
            bestThres = candidate.threshold;
        }
    }
    return minErrIn;
}

// Decision stump learning: searches both directions s = +1 and s = -1 over all
// candidate thresholds, stores the best (s, theta) in hypo and returns its
// in-sample error. On a tie the s = -1 hypothesis is chosen.
double decisionStump(vector<double>& inputVec, vector<int>& outputVec, Hypothesis& hypo)
{
    // Initialised so that a degenerate sample never reads an indeterminate value.
    double bestThresPositive = 0.0;
    double bestThresNegtive = 0.0;

    hypo.coef = 1;
    const double minErrInPositive = getMinErrIn(inputVec, outputVec, hypo, bestThresPositive);
    hypo.coef = -1;
    const double minErrInNegtive = getMinErrIn(inputVec, outputVec, hypo, bestThresNegtive);

    if (minErrInPositive < minErrInNegtive) {
        hypo.coef = 1;
        hypo.threshold = bestThresPositive;
        return minErrInPositive;
    }
    hypo.coef = -1;
    hypo.threshold = bestThresNegtive;
    return minErrInNegtive;
}

// Runs the experiment 5000 times on fresh random data, prints the (s, theta)
// learned in each run and finally the average E-in and E-out.
int main()
{
    const int experimentCount = 5000;

    srand(static_cast<unsigned>(time(NULL)));

    double errInTotal = 0.0;
    double errOutTotal = 0.0;

    for (int i = 0; i < experimentCount; ++i) {
        vector<double> inputVec;
        vector<int> outputVec;
        Hypothesis hypo;

        getTrainingData(inputVec);
        calOutput(inputVec, outputVec);

        errInTotal += decisionStump(inputVec, outputVec, hypo);
        errOutTotal += calErrOutSample(hypo);

        cout << "-----------------第" << i + 1 << "次计算结束-------------------\n";
        cout << "s = " << hypo.coef << endl;
        cout << "θ= " << hypo.threshold << endl;
    }

    cout << "Average E-in = " << errInTotal / experimentCount << "\n";
    cout << "Average E-out = " << errOutTotal / experimentCount << "\n";
    return 0;
}
// Output:
这一题把16题中的 decision stump 拓展到多维,要求找出E-in最小的那一维并在测试数据上计算对应维度的E-out:
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;

#define DEMENSION 9  // number of input dimensions per record

// String literals are const, so the paths are held through const pointers.
const char *file = "training.txt";
const char *file_test = "testing.txt";

// One example: DEMENSION input values followed by its label.
struct record {
    double input[DEMENSION];
    int output;
};

// One example projected onto a single dimension.
struct singleDemensionRecord {
    double input;
    int output;
};

// A one-dimensional decision stump h(x) = coef * sign(x - threshold).
struct Hypothesis {
    int coef;          // direction s, either +1 or -1
    double threshold;  // cut point theta
};

// Sign of x. Zero is mapped to -1, so the result is never 0.
int sign(double x)
{
    return x > 0 ? 1 : -1;
}

// Reads whitespace-separated records (DEMENSION inputs, then one integer label)
// from dataFile, appends them to data and closes the stream.
// A record is stored only if every field was extracted successfully; testing
// eof() before reading would append a bogus, partially-filled record at the end
// of the file.
void getData(ifstream& dataFile, vector<record>& data)
{
    while (true) {
        record curRecord;
        bool complete = true;
        for (int i = 0; i < DEMENSION && complete; ++i) {
            complete = static_cast<bool>(dataFile >> curRecord.input[i]);
        }
        if (!complete || !(dataFile >> curRecord.output)) {
            break;
        }
        data.push_back(curRecord);
    }
    dataFile.close();
}

// Error rate of the hypothesis for the given 1-based dimension on the given
// single-dimension records. An empty record set yields 0.0 instead of 0/0.
double calErr(const vector<singleDemensionRecord>& singleDemensionVec,
              const vector<Hypothesis>& hypo,
              int demension)
{
    if (singleDemensionVec.empty()) {
        return 0.0;
    }

    const Hypothesis& h = hypo[demension - 1];
    int errCount = 0;
    for (size_t i = 0; i < singleDemensionVec.size(); ++i) {
        const singleDemensionRecord& rec = singleDemensionVec[i];
        if (rec.output != h.coef * sign(rec.input - h.threshold)) {
            ++errCount;
        }
    }
    return static_cast<double>(errCount) / static_cast<double>(singleDemensionVec.size());
}

// Ordering used for sorting: ascending by input value.
// Takes const references, as std::sort may pass const objects to the comparator.
bool recCompare(const singleDemensionRecord& a, const singleDemensionRecord& b)
{
    return a.input < b.input;
}

// Appends the (input, output) pairs of the given 1-based dimension to
// singleDemensionVec and sorts the whole vector in ascending input order.
void getInputByDemension(const vector<record>& dataSet,
                         vector<singleDemensionRecord>& singleDemensionVec,
                         int demension)
{
    singleDemensionVec.reserve(singleDemensionVec.size() + dataSet.size());
    for (size_t i = 0; i < dataSet.size(); ++i) {
        singleDemensionRecord curRec;
        curRec.input = dataSet[i].input[demension - 1];
        curRec.output = dataSet[i].output;
        singleDemensionVec.push_back(curRec);
    }
    sort(singleDemensionVec.begin(), singleDemensionVec.end(), recCompare);
}

// Tries every midpoint between neighbouring (sorted) inputs as the threshold of
// hypo[demension-1], keeping its current coef, and returns the smallest
// in-sample error found. bestThres receives the first threshold achieving that
// minimum; it is left untouched (and 1.0 is returned) if no candidate has an
// error below 1.0, e.g. with fewer than two records.
// Side effect: hypo[demension-1].threshold ends up at the last candidate tried.
double getMinErrIn(vector<singleDemensionRecord>& singleDemensionVec,
                   vector<Hypothesis>& hypo,
                   int demension,
                   double& bestThres)
{
    double minErrIn = 1.0;
    Hypothesis& h = hypo[demension - 1];
    for (size_t i = 0; i + 1 < singleDemensionVec.size(); ++i) {
        h.threshold = (singleDemensionVec[i].input + singleDemensionVec[i + 1].input) / 2.0;
        const double curErrIn = calErr(singleDemensionVec, hypo, demension);
        if (curErrIn < minErrIn) {
            minErrIn = curErrIn;
            bestThres = h.threshold;
        }
    }
    return minErrIn;
}

// Multi-dimensional decision stump: learns the best (s, theta) for every
// dimension on trainingSet (stored in hypo), picks the first dimension with the
// smallest E-in, and prints that dimension, its E-in and the error of its
// hypothesis on testSet. Within a dimension, a tie between s = +1 and s = -1 is
// resolved in favour of s = -1.
void decisionStump(vector<record>& trainingSet, vector<record>& testSet, vector<Hypothesis>& hypo)
{
    int minErrInDem = 1;    // 1-based index of the best dimension so far
    double minErrIn = 1.1;  // above any possible error rate, so dimension 1 always qualifies

    for (int dem = 0; dem < DEMENSION; ++dem) {
        vector<singleDemensionRecord> singleDemensionVec;
        getInputByDemension(trainingSet, singleDemensionVec, dem + 1);

        // Initialised so that a degenerate training set never reads an
        // indeterminate value.
        double bestThresPositive = 0.0;
        double bestThresNegtive = 0.0;

        hypo[dem].coef = 1;
        const double minErrInPositive =
            getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresPositive);
        hypo[dem].coef = -1;
        const double minErrInNegtive =
            getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresNegtive);

        double curMinErrIn;
        if (minErrInPositive < minErrInNegtive) {
            hypo[dem].coef = 1;
            hypo[dem].threshold = bestThresPositive;
            curMinErrIn = minErrInPositive;
        } else {
            hypo[dem].coef = -1;
            hypo[dem].threshold = bestThresNegtive;
            curMinErrIn = minErrInNegtive;
        }

        if (minErrIn > curMinErrIn) {
            minErrIn = curMinErrIn;
            minErrInDem = dem + 1;
        }
    }

    cout << "The demension with min error is : " << minErrInDem << endl;
    cout << "min E-in = " << minErrIn << endl;

    vector<singleDemensionRecord> singleDemensionTestVec;
    getInputByDemension(testSet, singleDemensionTestVec, minErrInDem);
    cout << "min E-out = " << calErr(singleDemensionTestVec, hypo, minErrInDem) << endl << endl;
}

// Loads the training and test files, then runs the multi-dimensional decision
// stump. Exits with status 1 if either file cannot be opened.
int main()
{
    vector<record> trainingSet;              // training data
    vector<record> testSet;                  // test data
    vector<Hypothesis> hypoVec(DEMENSION);   // one hypothesis per dimension

    ifstream dataFile(file);
    ifstream testDataFile(file_test);

    if (!dataFile.is_open() || !testDataFile.is_open()) {
        cerr << "ERROR ---> 文件打开失败" << endl;
        return 1;
    }

    getData(dataFile, trainingSet);
    getData(testDataFile, testSet);

    decisionStump(trainingSet, testSet, hypoVec);
    return 0;
}
// Output:
关于Machine Learning更多讨论与交流,敬请关注本博客和新浪微博songzi_tea.