Weka中的默认基分类器使用的是REPTree,也就是Fast decision tree learner,至于这个具体是个什么,后面我再写文章进行分析。
/**
 * Default constructor: installs a fresh REPTree as the base classifier.
 */
public Bagging() {
  this.m_Classifier = new weka.classifiers.trees.REPTree();
}
if (m_CalcOutOfBag && (m_BagSizePercent != 100)) { throw new IllegalArgumentException("Bag size needs to be 100% if " + "out-of-bag error is to be calculated!"); } int bagSize = data.numInstances() * m_BagSizePercent / 100; Random random = new Random(m_Seed); boolean[][] inBag = null; if (m_CalcOutOfBag) inBag = new boolean[m_Classifiers.length][]; for (int j = 0; j < m_Classifiers.length; j++) { Instances bagData = null; // create the in-bag dataset if (m_CalcOutOfBag) { inBag[j] = new boolean[data.numInstances()]; // bagData = resampleWithWeights(data, random, inBag[j]); bagData = data.resampleWithWeights(random, inBag[j]); } else { bagData = data.resampleWithWeights(random); if (bagSize < data.numInstances()) { bagData.randomize(random); Instances newBagData = new Instances(bagData, 0, bagSize); bagData = newBagData; } }这一部分是抽样,首先如果有m_CalcOutOfBag标,则必须要求抽样比例是100%。
if (m_Classifier instanceof Randomizable) { ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt()); } // build the classifier m_Classifiers[j].buildClassifier(bagData);接着是构建分类树的过程,调用具体classifier的buildClassifier方法。
if (getCalcOutOfBag()) { //如果有这个标就计算 double outOfBagCount = 0.0; //错误的权重和 double errorSum = 0.0;//错误的偏差值的和 boolean numeric = data.classAttribute().isNumeric();//是否是连续数值 for (int i = 0; i < data.numInstances(); i++) { double vote;//代表投票结果 double[] votes;//代表投票 if (numeric) votes = new double[1];//如果是数值,则取平均数,计算平均数的过程一个数组单元就够了 else votes = new double[data.numClasses()];//否则则要进行投票 // determine predictions for instance int voteCount = 0; for (int j = 0; j < m_Classifiers.length; j++) { if (inBag[j][i]) continue;//如果已经被采样,就忽略,因为要计算的是OutOfBag voteCount++;//记录有多少样本被计算 if (numeric) { votes[0] = m_Classifiers[j].classifyInstance(data.instance(i));//数值型则直接把预测结果累加 } else {
double[] newProbs = m_Classifiers[j].distributionForInstance(data .instance(i)); for (int k = 0; k < newProbs.length; k++) { votes[k] += newProbs[k]; //枚举型则要把所有枚举概率进行累加 } } } // "vote" if (numeric) { vote = votes[0]; if (voteCount > 0) { vote /= voteCount; // 数值型取均值 } } else { if (Utils.eq(Utils.sum(votes), 0)) { } else { Utils.normalize(votes);//归一化 } vote = Utils.maxIndex(votes); // 选出最大的index } outOfBagCount += data.instance(i).weight();//累加权重 if (numeric) { errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();//累加错误偏差 } else { if (vote != data.instance(i).classValue()) errorSum += data.instance(i).weight();//如果是枚举就对出错进行计数 } } m_OutOfBagError = errorSum / outOfBagCount;//最后取个平均值 } else { m_OutOfBagError = 0;//如果没有那个标就不计算了 }
上面抽样时调用的是 data.resampleWithWeights(random, inBag[j]); 这个方法,看了一下感觉还挺有意思的,就放上来剖析一下。
/**
 * Draws a weighted resample of this dataset without recording which
 * instances were picked.
 *
 * <p>Delegates to the three-argument overload, passing {@code null} as the
 * {@code sampled} array.
 *
 * @param random source of random numbers
 * @param weights the weights handed on to the three-argument overload
 * @return the dataset produced by the three-argument overload
 */
public Instances resampleWithWeights(Random random, double[] weights) {
  final boolean[] noTracking = null;
  return resampleWithWeights(random, weights, noTracking);
}
/**
 * Draws a weighted resample of this dataset using each instance's own weight.
 *
 * <p>Reads {@code instance(i).weight()} for every instance into an array and
 * delegates to the three-argument overload.
 *
 * @param random source of random numbers
 * @param sampled array handed on unchanged to the three-argument overload
 * @return the dataset produced by the three-argument overload
 */
public Instances resampleWithWeights(Random random, boolean[] sampled) {
  final int count = numInstances();
  final double[] weights = new double[count];
  int idx = 0;
  while (idx < count) {
    weights[idx] = instance(idx).weight();
    idx++;
  }
  return resampleWithWeights(random, weights, sampled);
}
public Instances resampleWithWeights(Random random, double[] weights, boolean[] sampled) { if (weights.length != numInstances()) { throw new IllegalArgumentException("weights.length != numInstances."); } Instances newData = new Instances(this, numInstances()); if (numInstances() == 0) { return newData; } // Walker's method, see pp. 232 of "Stochastic Simulation" by B.D. Ripley double[] P = new double[weights.length]; System.arraycopy(weights, 0, P, 0, weights.length); Utils.normalize(P); double[] Q = new double[weights.length]; int[] A = new int[weights.length]; int[] W = new int[weights.length]; int M = weights.length; int NN = -1; int NP = M; for (int I = 0; I < M; I++) { if (P[I] < 0) { throw new IllegalArgumentException("Weights have to be positive."); } Q[I] = M * P[I]; if (Q[I] < 1.0) { W[++NN] = I; } else { W[--NP] = I; } } if (NN > -1 && NP < M) { for (int S = 0; S < M - 1; S++) { int I = W[S]; int J = W[NP]; A[I] = J; Q[J] += Q[I] - 1.0; if (Q[J] < 1.0) { NP++; } if (NP >= M) { break; } } // A[W[M]] = W[M]; } for (int I = 0; I < M; I++) { Q[I] += I; } for (int i = 0; i < numInstances(); i++) { int ALRV; double U = M * random.nextDouble(); int I = (int) U; if (U < Q[I]) { ALRV = I; } else { ALRV = A[I]; } newData.add(instance(ALRV)); if (sampled != null) { sampled[ALRV] = true; } newData.instance(newData.numInstances() - 1).setWeight(1); } return newData; }
代码注释里的 Walker's method, see pp. 232 of "Stochastic Simulation" by B.D. Ripley,指的是 Walker 别名法(alias method):先用一次预处理建立阈值表 Q 和别名表 A,之后每次抽样只需要一个随机数就能按权重抽出一个样本。我找了半天也没弄清这个算法的细节,代码也没啥注释,大体一看没看懂,等下次有机会再把这个函数的算法补上吧。