使用的数据是最简单的weather.nominal.arff,使用的分类模型是NaiveBayes
m_ClassPriors[numClasses]统计构造函数传入的data的各个类的先验概率,有一个平滑处理,在统计之前每个m_ClassPriors[i]都赋值为1,防止出现某个类先验概率为0的情况出现。
m_MarginCounts = new double[k_MarginResolution + 1]，其中 k_MarginResolution = 500，用于统计 margin 的直方图。
首先是随机打乱数据,并且如果类是标称型的就要进行分层操作(类似分层抽样):
data.randomize(random);
if (data.classAttribute().isNominal()) {
data.stratify(numFolds);
}
下面这个函数的主体:
for (int i = 0; i < numFolds; i++) {
Instances train = data.trainCV(numFolds, i, random);
setPriors(train);//这里的设置的先验概率会覆盖前面构造函数中的设置
Classifier copiedClassifier = AbstractClassifier.makeCopy(classifier);
copiedClassifier.buildClassifier(train);
Instances test = data.testCV(numFolds, i);
evaluateModel(copiedClassifier, test, forPredictionsPrinting);
}
m_NumFolds = numFolds;
double predictions[] = new double[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
predictions[i] =
evaluateModelOnceAndRecordPrediction(classifier, data.instance(i));
}
return predictions;//返回所有测试实例的预测的最大概率的类标记
Instance classMissing = (Instance) instance.copy();
classMissing.setDataset(instance.dataset());
classMissing.setClassMissing();//把类标记抹去
double pred =
evaluationForSingleInstance(
classifier.distributionForInstance(classMissing), instance,
storePredictions);//见5.
return pred;
updateStatsForClassifier(dist, instance);//见6.
if (storePredictions && !m_DiscardPredictions) {
if (m_Predictions == null) {
m_Predictions = new ArrayList();
}
m_Predictions.add(new NominalPrediction(instance.classValue(), dist,
instance.weight()));
}//m_Predictions中每一项放置一个测试实例的预测类的概率分布
int actualClass = (int) instance.classValue();
if (!instance.classIsMissing()) {
updateMargins(predictedDistribution, actualClass, instance.weight());//见7.
int predictedClass = -1;
double bestProb = 0.0;
for (int i = 0; i < m_NumClasses; i++) {
if (predictedDistribution[i] > bestProb) {
predictedClass = i;
bestProb = predictedDistribution[i];
}
}
double probActual = predictedDistribution[actualClass];
double probNext = 0;
for (int i = 0; i < m_NumClasses; i++) {
if ((i != actualClass) && (predictedDistribution[i] > probNext)) {
probNext = predictedDistribution[i];
}
}//proNext就是除了真实类别外概率最大的类的概率
double margin = probActual - probNext;
int bin = (int) ((margin + 1.0) / 2.0 * k_MarginResolution);
m_MarginCounts[bin] += weight;
m_Predictions 的类型是 ArrayList，它是在 evaluationForSingleInstance 函数中更新的。