日撸代码300行:第66-68天(主动学习之 ALEC)

代码来自闵老师的“日撸 Java 三百行(61-70天)”。

日撸 Java 三百行(61-70天,决策树与集成学习)_闵帆的博客-CSDN博客


Step 1. 将对象按代表性递减排序;
Step 2. 假设当前数据块有 $N$ 个对象, 选择最具代表性的前 $\sqrt{N}$ 个查询其标签 (类别).
Step 3. 如果这 $\sqrt{N}$ 个标签具有相同类别, 就认为该块为纯的, 其它对象均分类为同一类别. 结束.
Step 4. 如果不纯, 将当前块划分为两个子块, 对每个子块分别 Goto Step 2 (子块需要先按自身大小重新查询标签, 再判断是否为纯).

package machinelearning.activelearning;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.Instances;

public class Alec {
	 * The whole data set.
	Instances dataset;
	 * The maximal number of queries that can be provided.
	int maxNumQuery;
	 * The actual number of queries.
	int numQuery;
	 * The radius, also dc in the paper. It is employed for density computation.
	double radius;
	 * The densities of instances, also rho in the paper.
	double[] densities;
	 * Distance to master
	double[] distanceToMaster;
	 * Sorted indices, where the first element indicates the instance with the biggest density.
	int[] descendantDensities;
	 * Priority
	double[] priority;
	 * The maximal distance between any pair of points.
	double maximalDistance;
	 * Who is my master?
	int[] masters;
	 * Predicted labels.
	int[] predictedLabels;
	 * Instance status. 0 for unprocessed, 1 for queried, 2 for classified.
	int[] instanceStatusArray;
	 * The descendant indices to show the representativeness of instances in a descendant order.
	int[] descendantRepresentatives;
	 * Indicate the cluster of each instance. It is only used in clusterInTwo(int[]);
	int[] clusterIndices;
	 * Blocks with size no more than this threshold should not be split further.
	int smallBlockThreshold = 3;
	 * *********************************************************
	 * The constructor.
	 * @param paraFilename
	 * *********************************************************
	public Alec(String paraFilename) {
		// TODO Auto-generated constructor stub
		try {
			FileReader tempReader = new FileReader(paraFilename);
			dataset = new Instances(tempReader);
			dataset.setClassIndex(dataset.numAttributes() - 1);
		} catch (Exception e) {
			// TODO: handle exception
		}//of try
		clusterIndices = new int[dataset.numInstances()];
	}//of the constructor
	 * ***********************************************************
	 * * Merge sort in descendant order to obtain an index array. The original
	 * array is unchanged. The method should be tested further. 
* Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2].
* input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3]. * * @param paraArray The original array * @return The sorted indices. * *********************************************************** */ public static int[] mergeSortToIndices(double[] paraArray) { int tempLength = paraArray.length; int[][] resultMatrix = new int[2][tempLength]; //Initialize(这里初始化第一组就够了,第二组的数据在排序的时候是从第一组复制的) int tempIndex = 0; for (int i = 0; i < tempLength; i++) { resultMatrix[tempIndex][i] = i; }//of for i // Merge int tempCurrentLength = 1; // The indices for current merged groups. int tempFirstStart, tempSecondStart, tempSecondEnd; while (tempCurrentLength < tempLength) { // Divide into a number of groups. // Here the boundary is adaptive to array length not equal to 2^k. //Math.ceil()的作用是上取整,返回的是总共分了多少个“两组”数据。每进行一次for循环,完成一轮归并,当while条件不满足时,已经完成排序。 //每一次是进行两组数的归并,所以每一轮的组数是(tempLength + 0.0)/(tempCurrentLength * 2);每一组数据的长度是tempCurrentLength for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength /2); i++) { // Boundaries of the group tempFirstStart = i * tempCurrentLength * 2; tempSecondStart = tempFirstStart + tempCurrentLength; tempSecondEnd = tempSecondStart + tempCurrentLength - 1; if (tempSecondEnd >= tempLength) { tempSecondEnd = tempLength - 1; }//of if // Merge this group int tempFirstIndex = tempFirstStart; int tempSecondIndex = tempSecondStart; int tempCurrentIndex = tempFirstStart; if (tempSecondStart >= tempLength) { for (int j = tempFirstIndex; j < tempLength; j++) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; tempFirstIndex ++; tempCurrentIndex ++; }//of for j break; }//of if while ((tempFirstIndex <= tempSecondStart - 1) && (tempSecondIndex <= tempSecondEnd)) { if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] <= paraArray[resultMatrix[tempIndex % 2][tempSecondIndex]]) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempSecondIndex]; tempSecondIndex ++; }else { 
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempFirstIndex]; tempFirstIndex ++; }//of if tempCurrentIndex ++; }//of while // Remaining part for (int j = tempFirstIndex; j < tempSecondStart; j++) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; tempCurrentIndex++; }//of for j for (int j = tempSecondIndex; j <= tempSecondEnd; j++) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; tempCurrentIndex++; }//of for j }//of for i tempCurrentLength *= 2; tempIndex ++; //交替使用两个数组为当前组,另一个数组用于新一轮排序时复制索引号。 }//of while return resultMatrix[tempIndex % 2]; }//of mergeSortToIndices /** * ********************************************************************** * The Euclidean distance between two instances. Other distance measures * unsupported for simplicity. * * @param paraI The index of the first instance. * @param paraJ The index of the second instance. * @return The distance. * ********************************************************************** */ public double distance(int paraI, int paraJ) { double resultDistance = 0; double tempDifference; for (int i = 0; i < dataset.numAttributes() - 1; i++) { tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i); resultDistance += tempDifference * tempDifference; }//of for i resultDistance = Math.sqrt(resultDistance); return resultDistance; }//of distance /** * ***************************************************************** * Compute the maximal distance. The result is stored in a member variable. 
* ***************************************************************** */ public void computeMaximalDistance() { maximalDistance = 0; double tempDistance; for (int i = 0; i < dataset.numInstances(); i++) { for (int j = 0; j < dataset.numInstances(); j++) { tempDistance = distance(i, j); if (maximalDistance < tempDistance) { maximalDistance = tempDistance; }//of if }//of for j }//of for i System.out.println("maximalDistance = " + maximalDistance); }//of computeMaximalDistance /** * **************************************************************** * Compute the densities using Gaussian kernel. * 这里不同于原文的密度峰值聚类,这里用的高斯核函数计算实例的密度。密度峰值是设定一个距离阈值,在距离范围内的实例个数就是当前实例的密度。 * * @param paraBlock The given block. * **************************************************************** */ public void computeDensitiesGaussian() { System.out.println("radius = " + radius); densities = new double[dataset.numInstances()]; double tempDistance; for (int i = 0; i < dataset.numInstances(); i++) { for (int j = 0; j < dataset.numInstances(); j++) { tempDistance = distance(i, j); densities[i] += Math.exp(-tempDistance * tempDistance /(radius * radius)); }//of for j }//of for i System.out.println("The densities are " + Arrays.toString(densities) + "\r\n"); }//of computeDensitiesGaussian /** * *************************************************************** * Compute distanceToMaster, the distance to its master. * *************************************************************** */ public void computeDistanceToMaster() { distanceToMaster = new double[dataset.numInstances()]; masters = new int[dataset.numInstances()]; descendantDensities = new int[dataset.numInstances()]; instanceStatusArray = new int[dataset.numInstances()]; descendantDensities = mergeSortToIndices(densities); distanceToMaster[descendantDensities[0]] = maximalDistance; double tempDistance; for (int i = 1; i < dataset.numInstances(); i++) { // Initialize. 
distanceToMaster[descendantDensities[i]] = maximalDistance; //只有密度比自己大的才可能是自己的Master,所以排序在i以后实例不用计算。 for (int j = 0; j <= i - 1; j++) { tempDistance = distance(descendantDensities[i], descendantDensities[j]); if (distanceToMaster[descendantDensities[i]] > tempDistance) { distanceToMaster[descendantDensities[i]] = tempDistance; masters[descendantDensities[i]] = descendantDensities[j]; }//of if }//of for j }//of for i System.out.println("First compute, masters = " + Arrays.toString(masters)); System.out.println("descendantDensities = " + Arrays.toString(descendantDensities)); }//of computeDistanceToMaster /** * **************************************************************** * Compute priority. Element with higher priority is more likely to be * selected as a cluster center. Now it is rho * distanceToMaster. It can * also be rho^alpha * distanceToMaster. * **************************************************************** */ public void computePriority() { priority = new double[dataset.numInstances()]; for (int i = 0; i < dataset.numInstances(); i++) { priority[i] = densities[i] * distanceToMaster[i]; }//of for i }//of computePriority /** * ******************************************************************* * The block of a node should be same as its master. This recursive method is efficient. * @param paraIndex The index of the given node. * @return The cluster index of the current node. * ******************************************************************* */ public int coincideWithMaster(int paraIndex) { if (clusterIndices[paraIndex] == -1) { int tempMaster = masters[paraIndex]; clusterIndices[paraIndex] = coincideWithMaster(tempMaster); }//of if return clusterIndices[paraIndex]; }//of coincideWithMaster /** * ********************************************************************* * Cluster a block in two. According to the master tree. * * @param paraBlock The given block. * @return The new blocks where the two most represent instances serve as the root. 
* ********************************************************************* */ public int[][] clusterInTwo(int[] paraBlock) { // Reinitialize. In fact, only instances in the given block is considered. Arrays.fill(clusterIndices, -1); // Initialize the cluster number of the two roots. //这里把数组paraBlock的前两个元素存储的序号,所对应的簇标签分别设置为0和1. //paraBlock的前两个序号对应的原始数据集的实例标签如果相同,这里有没有影响??? for (int i = 0; i < 2; i++) { clusterIndices[paraBlock[i]] = i; }//of for i for (int i = 0; i < paraBlock.length; i++) { if (clusterIndices[paraBlock[i]] != -1) { continue; }//of if //i实例和自己的Master具有相同的类标签。 clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]); }//of for i //The sub blocks. int[][] resultBlock = new int[2][]; int tempFirstBlockCount = 0; //长度是clusterIndices.length,此时没在paraBlock中的实例,上面代码已经填充了-1. //只有paraBlock[i]才有数据。如果这里长度设置为paraBlock.length,下面判断条件应该是clusterIndices[paraBlock[i]] == 0。 //否则i无法将所有实例全部遍历。 for (int i = 0; i = tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) { System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock)); vote(paraBlock); return; }//of if // Step 3. Query enough labels. for (int i = 0; i < tempExpectedQueries; i++) { if (numQuery >= maxNumQuery) { System.out.println("No more queries are provided, numQuery = " + numQuery + "."); vote(paraBlock); return; }//of if if (instanceStatusArray[paraBlock[i]] == 0) { instanceStatusArray[paraBlock[i]] = 1; predictedLabels[paraBlock[i]] = (int)dataset.instance(paraBlock[i]).classValue(); numQuery ++; }//of if }//of for i //Step 4. Pure? 
int tempFirstLabel = predictedLabels[paraBlock[0]]; boolean tempPure = true; for (int i = 1; i < tempExpectedQueries; i++) { if (predictedLabels[paraBlock[i]] != tempFirstLabel) { tempPure = false; break; }//of if }//of for i if (tempPure) { System.out.println("Classify for pure block: " + Arrays.toString(paraBlock)); for (int i = tempExpectedQueries; i < paraBlock.length; i++) { if (instanceStatusArray[paraBlock[i]] == 0) { instanceStatusArray[paraBlock[i]] = 2; predictedLabels[paraBlock[i]] = tempFirstLabel; }//of if }//of for i return; }//of if // Step 5. Split in two and process them independently. int[][] tempBlocks = clusterInTwo(paraBlock); for (int i = 0; i < 2; i++) { clusterBasedActiveLearning(tempBlocks[i]); }//of for i }//of clusterBasedActiveLearning /** ****************************************************** * Show the statistics information. ****************************************************** */ public String toString() { int[] tempStatusCounts = new int[3]; double tempCorrect = 0; for (int i = 0; i < dataset.numInstances(); i++) { tempStatusCounts[instanceStatusArray[i]]++; if (predictedLabels[i] == (int) dataset.instance(i).classValue()) { tempCorrect ++; }//of if }//of for i String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts); resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances()); return resultString; }//ofr toString /** * ******************************************************************** * The entrance of the program. 
* * @param args * ******************************************************************** */ public static void main(String args[]) { long tempStart = System.currentTimeMillis(); System.out.println("Starting ALEC."); String arffFileName = "E:/Datasets/UCIdatasets/其他数据集/iris.arff"; Alec tempAlec = new Alec(arffFileName); // The settings for iris tempAlec.clusterBasedActiveLearning(0.15, 30, 3); System.out.println(tempAlec); long tempEnd = System.currentTimeMillis(); System.out.println("Runtime: " + (tempEnd - tempStart) + "ms."); }//of main }//of Alec

