ALEC Code (with Comments)

Overview

  ALEC is a cluster-based active learning algorithm. If you are already familiar with the concepts, this annotated code is well suited for studying the algorithm in depth. A companion article explains the underlying ideas.

Code

package machinelearning.activelearning;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.Instances;
public class Alec {
	/**
	 * The whole dataset.
	 */
	Instances dataset;

	/**
	 * The maximal number of queries that can be provided.
	 */
	int maxNumQuery;

	/**
	 * The actual number of queries.
	 */
	int numQuery;

	/**
	 * The radius, i.e., dc in the paper. It is used for density computation.
	 */
	double radius;

	/**
	 * The density of each instance, i.e., rho in the paper.
	 */
	double[] densities;

	/**
	 * The distance from each instance to its master, i.e., its nearest neighbor with a
	 * higher density (delta in the paper).
	 */
	double[] distanceToMaster;

	/**
	 * Instance indices sorted by density in descending order; the first element is the
	 * instance with the highest density.
	 */
	int[] descendantDensities;

	/**
	 * The priority of each instance. Instances with higher priority are more likely to
	 * be selected as cluster centers or queried.
	 */
	double[] priority;

	/**
	 * The maximal distance between any pair of instances.
	 */
	double maximalDistance;

	/**
	 * The master of each instance. Master links organize the instances into cluster trees.
	 */
	int[] masters;

	/**
	 * The predicted labels.
	 */
	int[] predictedLabels;

	/**
	 * Instance status: 0 for unprocessed, 1 for queried, 2 for classified.
	 */
	int[] instanceStatusArray;

	/**
	 * Instance indices sorted by representativeness (priority) in descending order.
	 */
	int[] descendantRepresentatives;

	/**
	 * The cluster index of each instance. It is only used in clusterInTwo(int[]).
	 */
	int[] clusterIndices;

	/**
	 * Blocks whose size does not exceed this threshold should not be split further.
	 */
	int smallBlockThreshold = 3;

	/**
	 ********************************** 
	 * The constructor. Load the dataset from the given file.
	 * 
	 * @param paraFilename
	 *            The data filename.
	 ********************************** 
	 */
	public Alec(String paraFilename) {
		try {
			FileReader tempReader = new FileReader(paraFilename);
			dataset = new Instances(tempReader);
			dataset.setClassIndex(dataset.numAttributes() - 1);
			tempReader.close();
		} catch (Exception ee) {
			System.out.println(ee);
			System.exit(0);
		} // Of try
		computeMaximalDistance();
		clusterIndices = new int[dataset.numInstances()]; // One cluster index per instance.
	}// Of the constructor

	/**
	 ********************************** 
	 * Merge sort to obtain sorted indices. The values are sorted in descending order;
	 * the original array is not modified.
	 * 
	 * @param paraArray
	 *            The original array.
	 * @return The indices of the array elements sorted in descending order.
	 ********************************** 
	 */
	public static int[] mergeSortToIndices(double[] paraArray) {
		// int paraArray[] = {6,5,7,2,4,3,1,0,8};
		int tempLength = paraArray.length;
		int[][] resultMatrix = new int[2][tempLength];// Two rows are used alternately during merging.

		// Initialize.
		int tempIndex = 0;
		for (int i = 0; i < tempLength; i++) {
			resultMatrix[tempIndex][i] = i;
		} // Of for i

		// Merge.
		int tempCurrentLength = 1;
		// The boundary indices of the current merging groups.
		int tempFirstStart, tempSecondStart, tempSecondEnd;

		while (tempCurrentLength < tempLength) {
			// Divide into groups of two blocks.
			// The boundaries here adapt to array lengths that are not powers of 2.
			// Math.ceil(x) returns the smallest integer no less than x, i.e., it rounds up.
			for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength / 2); i++) {
				// Boundaries of the two blocks.
				tempFirstStart = i * tempCurrentLength * 2; // Starts from zero.

				tempSecondStart = tempFirstStart + tempCurrentLength; // The second block starts one block length after the first.

				tempSecondEnd = tempSecondStart + tempCurrentLength - 1; // End of the second block: its start + length - 1.
				if (tempSecondEnd >= tempLength) { // If the end of the second block exceeds the array length, pull it back.
					tempSecondEnd = tempLength - 1;
				} // Of if

				// Merge this group. Copy the two block starts into the running indices;
				// tempCurrentIndex also starts at the first block's start.
				int tempFirstIndex = tempFirstStart;
				int tempSecondIndex = tempSecondStart;
				int tempCurrentIndex = tempFirstStart;

				if (tempSecondStart >= tempLength) { // Only one block is left; copy it directly.
					for (int j = tempFirstIndex; j < tempLength; j++) { // Traverse from the start of this block.
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
								% 2][j]; // Fill up the target row.
						tempFirstIndex++;
						tempCurrentIndex++;
					} // Of for j
					break;
				} // Of if

				while ((tempFirstIndex <= tempSecondStart - 1) // While the first block still has elements
						&& (tempSecondIndex <= tempSecondEnd)) { // and the second block still has elements, keep merging.
					if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex % 2][tempSecondIndex]]) {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempFirstIndex];
						tempFirstIndex++;
					} else {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempSecondIndex];
						tempSecondIndex++; // Advance the pointer of the second block.
					} // Of if
					tempCurrentIndex++; // Advance the pointer of the target row.
				} // Of while

				// Copy the remaining elements of either block.
				for (int j = tempFirstIndex; j < tempSecondStart; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
					tempCurrentIndex++;
				} // Of for j
				for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
					tempCurrentIndex++;
				} // Of for j
			} // Of for i

			tempCurrentLength *= 2;
			tempIndex++;
		} // Of while
		// System.out.println(Arrays.deepToString(resultMatrix)); // Debug output.
		return resultMatrix[tempIndex % 2];
	}// Of mergeSortToIndices
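	// A small worked example (added): for paraArray = {0.3, 0.8, 0.5} the method returns
	// {1, 2, 0}, since paraArray[1] = 0.8 >= paraArray[2] = 0.5 >= paraArray[0] = 0.3.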

	/**
	 *********************
	 * The Euclidean distance between two instances. Other distance measures are not
	 * supported for simplicity.
	 * 
	 * @param paraI
	 *            The index of the first instance.
	 * @param paraJ
	 *            The index of the second instance.
	 * @return The distance.
	 *********************
	 */
	public double distance(int paraI, int paraJ) {
		double resultDistance = 0;
		double tempDifference;
		for (int i = 0; i < dataset.numAttributes() - 1; i++) { // The -1 excludes the class (decision) attribute.
			tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i); // Componentwise difference.
			resultDistance += tempDifference * tempDifference; // Euclidean distance: sqrt((x_1 - y_1)^2 + ... + (x_n - y_n)^2).
		} // Of for i
		resultDistance = Math.sqrt(resultDistance);

		return resultDistance;
	}// Of distance

	/**
	 ********************************** 
	 * Compute the maximal distance between any pair of instances. The result is stored in a member variable.
	 ********************************** 
	 */
	public void computeMaximalDistance() {
		maximalDistance = 0;
		double tempDistance;
		for (int i = 0; i < dataset.numInstances(); i++) { // Enumerate all pairs and keep the maximum.
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j); // The distance between the two instances.
				if (maximalDistance < tempDistance) {
					maximalDistance = tempDistance;
				} // Of if
			} // Of for j
		} // Of for i

		System.out.println("maximalDistance = " + maximalDistance);
	}// Of computeMaximalDistance

	/**
	 ****************** 
	 * Compute the density of each instance using a Gaussian kernel. The radius (dc) must
	 * have been set before calling this method.
	 ****************** 
	 */
	public void computeDensitiesGaussian() {
		System.out.println("radius = " + radius);
		densities = new double[dataset.numInstances()]; // One density value per instance.
		double tempDistance;

		for (int i = 0; i < dataset.numInstances(); i++) {
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j); // The distance between the two instances.
				densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius);
			} // Of for j
		} // Of for i

		System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
	}// Of computeDensitiesGaussian
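	// Note (added): with this Gaussian kernel, the density of instance i is
	// rho_i = sum over j of exp(-(d(i, j) / radius)^2), where radius plays the role of dc.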

	/**
	 ********************************** 
	 * Compute the distance from each instance to its master, i.e., its nearest neighbor
	 * with a higher density.
	 ********************************** 
	 */
	public void computeDistanceToMaster() {
		distanceToMaster = new double[dataset.numInstances()]; // Distance from each instance to its master.
		masters = new int[dataset.numInstances()]; // The master of each instance.
		descendantDensities = new int[dataset.numInstances()]; // Indices sorted by density in descending order.
		instanceStatusArray = new int[dataset.numInstances()]; // The status of each instance.

		descendantDensities = mergeSortToIndices(densities); // Sort the densities in descending order and keep the indices.
		distanceToMaster[descendantDensities[0]] = maximalDistance; // The instance with the highest density has no master; it is the root.

		double tempDistance;
		for (int i = 1; i < dataset.numInstances(); i++) { // Starting from the second highest density, look for a master.
			// Initialize the distance to the master as the farthest possible.
			distanceToMaster[descendantDensities[i]] = maximalDistance;
			for (int j = 0; j <= i - 1; j++) { // A master must have a higher density, so only earlier instances qualify.
				tempDistance = distance(descendantDensities[i], descendantDensities[j]); // Compute the distance.
				if (distanceToMaster[descendantDensities[i]] > tempDistance) {
					distanceToMaster[descendantDensities[i]] = tempDistance;
					masters[descendantDensities[i]] = descendantDensities[j]; // The nearest such instance becomes the master.
				} // Of if
			} // Of for j
		} // Of for i
		System.out.println("First compute, masters = " + Arrays.toString(masters));
		System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
	}// Of computeDistanceToMaster

	/**
	 ********************************** 
	 * Compute priority. Element with higher priority is more likely to be
	 * selected as a cluster center. Now it is rho * distanceToMaster. It can
	 * also be rho^alpha * distanceToMaster.
	 ********************************** 
	 */
	public void computePriority() {
		priority = new double[dataset.numInstances()];
		for (int i = 0; i < dataset.numInstances(); i++) {
			priority[i] = densities[i] * distanceToMaster[i];
		} // Of for i
	}// Of computePriority
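	// Example (added): an instance with density 5.0 and distanceToMaster 0.4 has priority
	// 5.0 * 0.4 = 2.0; instances that are both dense and far from any denser instance come first.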

	/**
	 ************************* 
	 * A node should belong to the same cluster as its master. This recursive method is
	 * efficient because results are cached in clusterIndices.
	 * 
	 * @param paraIndex
	 *            The index of the node.
	 * @return The cluster index of the current node.
	 ************************* 
	 */
	public int coincideWithMaster(int paraIndex) {
		if (clusterIndices[paraIndex] == -1) {
			int tempMaster = masters[paraIndex]; // The index of this node's master.
			clusterIndices[paraIndex] = coincideWithMaster(tempMaster); // Follow the master chain up to a labeled instance.
		} // Of if

		return clusterIndices[paraIndex]; // The cluster index of this instance.
	}// Of coincideWithMaster
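	// Note (added): coincideWithMaster relies on clusterInTwo having already labeled the two
	// block roots with cluster indices 0 and 1; every other instance inherits the label of the
	// labeled instance that its master chain reaches first, and the result is cached in
	// clusterIndices so each instance is resolved only once.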

	/**
	 ************************* 
	 * Split the given block in two according to the masters of its instances.
	 * 
	 * @param paraBlock
	 *            The given block.
	 * @return The new blocks, where the two most representative instances serve as
	 *         the roots.
	 ************************* 
	 */
	public int[][] clusterInTwo(int[] paraBlock) {
		// Initialize the clustering: no instance belongs to any cluster yet.
		Arrays.fill(clusterIndices, -1);

		// The first two instances of the block have the highest priority and are the most
		// representative, so they become the roots of the two sub-blocks, labeled 0 and 1.
		for (int i = 0; i < 2; i++) {
			clusterIndices[paraBlock[i]] = i;
		} // Of for i

		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] != -1) {
				// This instance already belongs to a cluster.
				continue;
			} // Of if

			clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
		} // Of for i

		// Build the two sub-blocks.
		int[][] resultBlocks = new int[2][];
		int tempFistBlockCount = 0;
		for (int i = 0; i < clusterIndices.length; i++) {
			if (clusterIndices[i] == 0) {
				tempFistBlockCount++;
			} // Of if
		} // Of for i
		resultBlocks[0] = new int[tempFistBlockCount]; // The first sub-block.
		resultBlocks[1] = new int[paraBlock.length - tempFistBlockCount]; // The second sub-block.

		// Distribute the instances of this block into the two sub-blocks.
		int tempFirstIndex = 0;
		int tempSecondIndex = 0;
		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] == 0) {
				resultBlocks[0][tempFirstIndex] = paraBlock[i];
				tempFirstIndex++;
			} else {
				resultBlocks[1][tempSecondIndex] = paraBlock[i];
				tempSecondIndex++;
			} // Of if
		} // Of for i

		System.out.println("Split (" + paraBlock.length + ") instances "
				+ Arrays.toString(paraBlock) + "\r\nto (" + resultBlocks[0].length + ") instances "
				+ Arrays.toString(resultBlocks[0]) + "\r\nand (" + resultBlocks[1].length
				+ ") instances " + Arrays.toString(resultBlocks[1]));
		return resultBlocks;
	}// Of clusterInTwo
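	// Note (added): the split reuses the master links computed once in computeDistanceToMaster;
	// no distances are recomputed when a block is divided.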

	/**
	 ********************************** 
	 * Classify instances in the block by simple voting.
	 * 
	 * @param paraBlock
	 *            The given block.
	 ********************************** 
	 */
	public void vote(int[] paraBlock) {
		int[] tempClassCounts = new int[dataset.numClasses()];
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 1) {
				tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
			} // Of if
		} // Of for i

		int tempMaxClass = -1;
		int tempMaxCount = -1;
		for (int i = 0; i < tempClassCounts.length; i++) {
			if (tempMaxCount < tempClassCounts[i]) {
				tempMaxClass = i;
				tempMaxCount = tempClassCounts[i];
			} // Of if
		} // Of for i

		// Classify unprocessed instances.
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 0) {
				predictedLabels[paraBlock[i]] = tempMaxClass;
				instanceStatusArray[paraBlock[i]] = 2;
			} // Of if
		} // Of for i
	}// Of vote
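	// Note (added): only queried instances (status 1) vote; unprocessed instances (status 0)
	// receive the majority label and become classified (status 2), while already classified
	// instances keep their previous labels.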

	/**
	 ********************************** 
	 * Cluster-based active learning. This method prepares the required structures and then
	 * starts the recursive learning on the whole dataset.
	 * 
	 * @param paraRatio
	 *            The ratio of the maximal distance as the dc.
	 * @param paraMaxNumQuery
	 *            The maximal number of queries for the whole dataset.
	 * @param paraSmallBlockThreshold
	 *            The small block threshold.
	 ********************************** 
	 */
	public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery,
			int paraSmallBlockThreshold) {
		radius = maximalDistance * paraRatio; // dc is the given ratio of the maximal distance.
		smallBlockThreshold = paraSmallBlockThreshold; // The small block threshold used in Step 2 of the recursive method.

		maxNumQuery = paraMaxNumQuery; // The maximal number of queries.
		predictedLabels = new int[dataset.numInstances()]; // The predicted label of each instance.

		for (int i = 0; i < dataset.numInstances(); i++) {
			predictedLabels[i] = -1; // -1 means not predicted yet.
		} // Of for i

		computeDensitiesGaussian(); // Compute the densities (rho).
		computeDistanceToMaster(); // Compute the distance from each instance to its master (delta).
		computePriority(); // Compute the priorities (rho * delta).
		descendantRepresentatives = mergeSortToIndices(priority); // Sort by priority in descending order.
		System.out.println(
				"descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));

		numQuery = 0;
		clusterBasedActiveLearning(descendantRepresentatives);
	}// Of clusterBasedActiveLearning
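	// Note (added): with the iris settings in main (0.15, 30, 3), dc = 0.15 * maximalDistance,
	// at most 30 labels may be queried in total, and 3 is the small block threshold used in
	// Step 2 of the recursive method below.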

	/**
	 ********************************** 
	 * Cluster-based active learning on the given block. This method is recursive.
	 * 
	 * @param paraBlock
	 *            The given block. This block must be sorted according to the
	 *            priority in descending order.
	 ********************************** 
	 */
	public void clusterBasedActiveLearning(int[] paraBlock) {
		System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));

		// Step 1. The number of labels expected to be queried within this block.
		int tempExpectedQueries = (int) Math.sqrt(paraBlock.length); // Query about sqrt(|block|) labels.
		int tempNumQuery = 0;
		for (int i = 0; i < paraBlock.length; i++) { // Count the instances already queried.
			if (instanceStatusArray[paraBlock[i]] == 1) {
				tempNumQuery++;
			} // Of if
		} // Of for i

		// Step 2. If enough labels have been queried and the block is small enough, classify the rest by voting.
		if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
			System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n"
					+ Arrays.toString(paraBlock));
			vote(paraBlock);

			return;
		} // Of if

		// Step 3. Query enough labels within this block.
		for (int i = 0; i < tempExpectedQueries; i++) {
			if (numQuery >= maxNumQuery) {
				System.out.println("No more queries are provided, numQuery = " + numQuery + ".");
				vote(paraBlock);
				return;
			} // Of if

			if (instanceStatusArray[paraBlock[i]] == 0) {
				instanceStatusArray[paraBlock[i]] = 1;
				predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
				// System.out.println("Query #" + paraBlock[i] + ", numQuery = "
				// + numQuery);
				numQuery++;
			} // Of if
		} // Of for i

		// Step 4. Pure?
		int tempFirstLabel = predictedLabels[paraBlock[0]];
		boolean tempPure = true;
		for (int i = 1; i < tempExpectedQueries; i++) {
			if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
				tempPure = false;
				break;
			} // Of if
		} // Of for i
		if (tempPure) {
			System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
			for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
				if (instanceStatusArray[paraBlock[i]] == 0) {
					predictedLabels[paraBlock[i]] = tempFirstLabel;
					instanceStatusArray[paraBlock[i]] = 2;
				} // Of if
			} // Of for i
			return;
		} // Of if

		// Step 5. Split in two and process them independently.
		int[][] tempBlocks = clusterInTwo(paraBlock);
		for (int i = 0; i < 2; i++) {
			// Attention: recursive invoking here.
			clusterBasedActiveLearning(tempBlocks[i]);
		} // Of for i
	}// Of clusterBasedActiveLearning

	/**
	 ******************* 
	 * Show the statistics information.
	 ******************* 
	 */
	public String toString() {
		int[] tempStatusCounts = new int[3];
		double tempCorrect = 0;
		for (int i = 0; i < dataset.numInstances(); i++) {
			tempStatusCounts[instanceStatusArray[i]]++;
			if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		String resultString = "(unhandled, queried, classified) = "
				+ Arrays.toString(tempStatusCounts);
		resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = "
				+ (tempCorrect / dataset.numInstances());

		return resultString;
	}// Of toString
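	/**
	 ******************* 
	 * An added sketch, not part of the original code: accuracy over the instances that the
	 * algorithm classified by itself (status 2). The accuracy in toString() also counts the
	 * queried instances, whose true labels are known and therefore always correct.
	 ******************* 
	 */
	public double classifiedAccuracy() {
		int tempClassified = 0;
		int tempCorrect = 0;
		for (int i = 0; i < dataset.numInstances(); i++) {
			if (instanceStatusArray[i] == 2) {
				tempClassified++;
				if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
					tempCorrect++;
				} // Of if
			} // Of if
		} // Of for i

		return tempClassified == 0 ? 0.0 : (tempCorrect + 0.0) / tempClassified;
	}// Of classifiedAccuracy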

	/**
	 ********************************** 
	 * The entrance of the program.
	 * 
	 * @param args
	 *            Not used now.
	 ********************************** 
	 */
	public static void main(String[] args) {
		long tempStart = System.currentTimeMillis();

		System.out.println("Starting ALEC.");
		String arffFilename = "D:/data/iris.arff";
		// String arffFilename = "D:/data/mushroom.arff";

		Alec tempAlec = new Alec(arffFilename);
		// The settings for iris
		tempAlec.clusterBasedActiveLearning(0.15, 30, 3);
		// The settings for mushroom
		// tempAlec.clusterBasedActiveLearning(0.1, 800, 3);
		System.out.println(tempAlec);

		long tempEnd = System.currentTimeMillis();
		System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
	}// Of main
}// Of class Alec
