The ALEC active-learning algorithm. If you already understand the concepts, this code walkthrough is well suited for studying it in depth; the accompanying explanation article is linked here. In brief, ALEC clusters the data in the style of density peaks (density rho from a Gaussian kernel, distance delta to the nearest higher-density "master"), ranks instances by priority = rho * delta, queries about sqrt(n) labels per block, and then either classifies the block directly (when the queried labels are pure, or by voting when the block is small) or splits it in two and recurses.
package machinelearning.activelearning;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.Instances;
public class Alec {
/**
* The whole dataset.
*/
Instances dataset;
/**
* The maximal number of queries that can be provided.
*/
int maxNumQuery;
/**
* The actual number of queries.
*/
int numQuery;
/**
* The radius, i.e., dc in the paper. It is employed for density computation.
*/
double radius;
/**
* The densities of instances, i.e., rho in the paper.
*/
double[] densities;
/**
* The distance from each instance to its master, i.e., delta in the paper:
* the distance to the nearest instance with a higher density.
*/
double[] distanceToMaster;
/**
* Instance indices sorted by density in descending order, where the first
* element is the index of the instance with the highest density.
*/
int[] descendantDensities;
/**
* The priority of each instance. Instances with a higher priority are more
* likely to be selected as cluster centers.
*/
double[] priority;
/**
* The maximal distance between any pair of instances.
*/
double maximalDistance;
/**
* The master of each instance, i.e., the nearest neighbor with a higher
* density. Masters link instances into clusters.
*/
int[] masters;
/**
* Predicted labels.
*/
int[] predictedLabels;
/**
* Instance status. 0 for unprocessed, 1 for queried, 2 for classified.
*/
int[] instanceStatusArray;
/**
* Instance indices sorted by representativeness (priority) in descending
* order.
*/
int[] descendantRepresentatives;
/**
* The cluster index of each instance. It is only used in clusterInTwo(int[]).
*/
int[] clusterIndices;
/**
* Blocks with size no more than this threshold are not split further.
*/
int smallBlockThreshold = 3;
/**
**********************************
* The constructor. Read the dataset from the given file.
*
* @param paraFilename
* The data filename.
**********************************
*/
public Alec(String paraFilename) {
try {
FileReader tempReader = new FileReader(paraFilename);
dataset = new Instances(tempReader);
dataset.setClassIndex(dataset.numAttributes() - 1);
tempReader.close();
} catch (Exception ee) {
System.out.println(ee);
System.exit(0);
} // Of try
computeMaximalDistance();
clusterIndices = new int[dataset.numInstances()];// Allocate one cluster index per instance.
}// Of the constructor
/**
**********************************
* Merge sort in descending order. Only the indices are sorted; the original
* array is not modified.
* @param paraArray
* the original array
* @return The indices of the array elements in descending order of value.
**********************************
*/
public static int[] mergeSortToIndices(double[] paraArray) {
int tempLength = paraArray.length;
int[][] resultMatrix = new int[2][tempLength];// Two rows used alternately (ping-pong buffers) during merging.
// Initialize: the first row holds the original indices.
int tempIndex = 0;
for (int i = 0; i < tempLength; i++) {
resultMatrix[tempIndex][i] = i;
} // Of for i
// Merge.
int tempCurrentLength = 1;
// The indices of the current merging groups.
int tempFirstStart, tempSecondStart, tempSecondEnd;
while (tempCurrentLength < tempLength) {
// Divide into groups of two blocks each.
// The boundaries adapt to array lengths that are not powers of 2.
// Math.ceil(x) returns the smallest integer no less than x.
for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength / 2); i++) {
// Boundaries of the two blocks.
tempFirstStart = i * tempCurrentLength * 2; // Start of the first block.
tempSecondStart = tempFirstStart + tempCurrentLength; // Start of the second block.
tempSecondEnd = tempSecondStart + tempCurrentLength - 1; // End of the second block.
if (tempSecondEnd >= tempLength) { // The second block may exceed the array; shrink it.
tempSecondEnd = tempLength - 1;
} // Of if
// Merge this group. tempFirstIndex and tempSecondIndex scan the two blocks;
// tempCurrentIndex tracks the output position.
int tempFirstIndex = tempFirstStart;
int tempSecondIndex = tempSecondStart;
int tempCurrentIndex = tempFirstStart;
if (tempSecondStart >= tempLength) { // Only one block remains; copy it directly.
for (int j = tempFirstIndex; j < tempLength; j++) { // Traverse the remaining elements.
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; // Fill the output row.
tempFirstIndex++;
tempCurrentIndex++;
} // Of for j
break;
} // Of if
while ((tempFirstIndex <= tempSecondStart - 1) // While the first block is not exhausted
&& (tempSecondIndex <= tempSecondEnd)) { // and the second block is not exhausted, keep merging.
if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex % 2][tempSecondIndex]]) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempFirstIndex];
tempFirstIndex++;
} else {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempSecondIndex];
tempSecondIndex++; // Advance the pointer of the block that supplied the element.
} // Of if
tempCurrentIndex++; // Advance the output pointer.
} // Of while
// Copy the remaining part of either block.
for (int j = tempFirstIndex; j < tempSecondStart; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
} // Of for j
for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
} // Of for j
} // Of for i
tempCurrentLength *= 2;
tempIndex++;
} // Of while
System.out.println(Arrays.deepToString(resultMatrix));
return resultMatrix[tempIndex % 2];
}// Of mergeSortToIndices
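// Worked example (added note, not in the original post): for paraArray = {0.3, 0.9, 0.1, 0.7},
// mergeSortToIndices returns {1, 3, 0, 2}, i.e., the indices of the values in
// descending order (0.9, 0.7, 0.3, 0.1).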
/**
*********************
* The Euclidean distance between two instances. Other distance measures are
* not supported for simplicity.
* @param paraI
* The index of the first instance.
* @param paraJ
* The index of the second instance.
* @return The distance.
*********************
*/
public double distance(int paraI, int paraJ) {
double resultDistance = 0;
double tempDifference;
for (int i = 0; i < dataset.numAttributes() - 1; i++) { // -1 to exclude the decision (class) attribute.
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);// Attribute-wise difference.
resultDistance += tempDifference * tempDifference;// Euclidean distance: sqrt((x_1 - y_1)^2 + ... + (x_n - y_n)^2).
} // Of for i
resultDistance = Math.sqrt(resultDistance);
return resultDistance;
}// Of distance
/**
**********************************
* Compute the maximal distance. The result is stored in a member variable.
**********************************
*/
public void computeMaximalDistance() {
maximalDistance = 0;
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) { // Check every pair of instances for the maximum.
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);// The distance between the two instances.
if (maximalDistance < tempDistance) {
maximalDistance = tempDistance;
} // Of if
} // Of for j
} // Of for i
System.out.println("maximalDistance = " + maximalDistance);
}// Of computeMaximalDistance
/**
******************
* Compute the densities using a Gaussian kernel:
* rho_i = sum_j exp(-(d_ij / dc)^2), where dc is the radius.
******************
*/
public void computeDensitiesGaussian() {
System.out.println("radius = " + radius);
densities = new double[dataset.numInstances()];// One density value per instance.
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);// The distance between instances i and j.
densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius);
} // Of for j
} // Of for i
System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
}// Of computeDensitiesGaussian
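// A worked toy example (not part of the original code): if radius = 1 and the
// distances from instance i to all instances (including itself) are {0, 1, 2},
// then densities[i] = exp(0) + exp(-1) + exp(-4) ≈ 1 + 0.368 + 0.018 ≈ 1.386.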
/**
**********************************
* Compute the distance from each instance to its master, i.e., the nearest
* instance with a higher density.
**********************************
*/
public void computeDistanceToMaster() {
distanceToMaster = new double[dataset.numInstances()]; // Distance to the master.
masters = new int[dataset.numInstances()]; // The master of each instance.
descendantDensities = new int[dataset.numInstances()]; // Indices sorted by density, descending.
instanceStatusArray = new int[dataset.numInstances()]; // The status array.
descendantDensities = mergeSortToIndices(densities); // Sort the densities in descending order and return the indices.
distanceToMaster[descendantDensities[0]] = maximalDistance; // The densest instance has no master; it is the root.
double tempDistance;
for (int i = 1; i < dataset.numInstances(); i++) { // Starting from the second densest instance, find its master.
// Initialize the distance to the master as the maximal distance.
distanceToMaster[descendantDensities[i]] = maximalDistance;
for (int j = 0; j <= i - 1; j++) { // Only instances with a higher density are candidate masters.
tempDistance = distance(descendantDensities[i], descendantDensities[j]); // Compute the distance.
if (distanceToMaster[descendantDensities[i]] > tempDistance) {
distanceToMaster[descendantDensities[i]] = tempDistance;
masters[descendantDensities[i]] = descendantDensities[j]; // The nearest denser instance becomes the master.
} // Of if
} // Of for j
} // Of for i
System.out.println("First compute, masters = " + Arrays.toString(masters));
System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
}// Of computeDistanceToMaster
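// A small illustration (added for readability, not in the original code): suppose
// three instances have densities {5.0, 3.0, 4.0}. Then descendantDensities = {0, 2, 1},
// instance 0 is the root with distanceToMaster[0] = maximalDistance, instance 2's
// master is 0, and instance 1's master is whichever of 0 and 2 lies closer to it.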
/**
**********************************
* Compute priority. Element with higher priority is more likely to be
* selected as a cluster center. Now it is rho * distanceToMaster. It can
* also be rho^alpha * distanceToMaster.
**********************************
*/
public void computePriority() {
priority = new double[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
priority[i] = densities[i] * distanceToMaster[i];
} // Of for i
}// Of computePriority
/**
*************************
* The nodes in a block should share the cluster index of their master. This
* recursive method is efficient because results are cached in clusterIndices.
*
* @param paraIndex
* The index of the node.
* @return The cluster index of the current node.
*************************
*/
public int coincideWithMaster(int paraIndex) {
if (clusterIndices[paraIndex] == -1) {
int tempMaster = masters[paraIndex];// The master of this node.
clusterIndices[paraIndex] = coincideWithMaster(tempMaster);// Trace masters up to a block root.
} // Of if
return clusterIndices[paraIndex]; // Return the cluster index of this instance.
}// Of coincideWithMaster
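// Example trace (illustrative, not from the original post): if masters[5] == 2,
// masters[2] == 0, and clusterIndices[0] == 0 (a block root), then
// coincideWithMaster(5) first resolves clusterIndices[2] = 0 and then sets
// clusterIndices[5] = 0, so both calls return 0.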
/**
*************************
* Split the given block in two according to its two most representative
* instances.
*
* @param paraBlock
* The given block.
* @return The new blocks, where the two most representative instances serve
* as the roots.
*************************
*/
public int[][] clusterInTwo(int[] paraBlock) {
// Initialize: no instance belongs to any cluster yet.
Arrays.fill(clusterIndices, -1);
// The first two instances of the block have the highest priority, hence are
// the most representative; they become the roots of the two sub-blocks, with
// cluster indices 0 and 1 respectively.
for (int i = 0; i < 2; i++) {
clusterIndices[paraBlock[i]] = i;
} // Of for i
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] != -1) {
// Already assigned to a sub-block.
continue;
} // Of if
clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
} // Of for i
// Build the two sub-blocks.
int[][] resultBlocks = new int[2][];
int tempFirstBlockCount = 0;
for (int i = 0; i < clusterIndices.length; i++) {
if (clusterIndices[i] == 0) {
tempFirstBlockCount++;
} // Of if
} // Of for i
resultBlocks[0] = new int[tempFirstBlockCount];// The first sub-block.
resultBlocks[1] = new int[paraBlock.length - tempFirstBlockCount];// The second sub-block.
// Distribute the instances of the block into the two sub-blocks.
int tempFirstIndex = 0;
int tempSecondIndex = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] == 0) {
resultBlocks[0][tempFirstIndex] = paraBlock[i];
tempFirstIndex++;
} else {
resultBlocks[1][tempSecondIndex] = paraBlock[i];
tempSecondIndex++;
} // Of if
} // Of for i
System.out.println("Split (" + paraBlock.length + ") instances "
+ Arrays.toString(paraBlock) + "\r\nto (" + resultBlocks[0].length + ") instances "
+ Arrays.toString(resultBlocks[0]) + "\r\nand (" + resultBlocks[1].length
+ ") instances " + Arrays.toString(resultBlocks[1]));
return resultBlocks;
}// Of clusterInTwo
/**
**********************************
* Classify instances in the block by simple voting.
*
* @param paraBlock
* The given block.
**********************************
*/
public void vote(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
} // Of if
} // Of for i
int tempMaxClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
tempMaxClass = i;
tempMaxCount = tempClassCounts[i];
} // Of if
} // Of for i
// Classify unprocessed instances.
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempMaxClass;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
}// Of vote
/**
**********************************
* Cluster-based active learning. Prepare the data structures and start the
* recursive learning on the whole dataset.
*
* @param paraRatio
* The ratio of the maximal distance used as dc.
* @param paraMaxNumQuery
* The maximal number of queries for the whole dataset.
* @param paraSmallBlockThreshold
* The small block threshold.
**********************************
*/
public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery,
int paraSmallBlockThreshold) {
radius = maximalDistance * paraRatio; // dc is the given ratio of the maximal distance.
smallBlockThreshold = paraSmallBlockThreshold; // The minimal number of instances in a block.
maxNumQuery = paraMaxNumQuery; // The maximal number of queries.
predictedLabels = new int[dataset.numInstances()]; // The predicted label array.
for (int i = 0; i < dataset.numInstances(); i++) {
predictedLabels[i] = -1; // Initialize as unpredicted.
} // Of for i
computeDensitiesGaussian(); // Compute densities.
computeDistanceToMaster(); // Compute the distance to the master.
computePriority(); // Compute priorities.
descendantRepresentatives = mergeSortToIndices(priority); // Sort by priority in descending order.
System.out.println(
"descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));
numQuery = 0;
clusterBasedActiveLearning(descendantRepresentatives);
}// Of clusterBasedActiveLearning
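// A concrete instantiation (added note, assuming the standard 150-instance iris
// dataset used in main): with paraRatio = 0.15 the radius dc is 15% of the maximal
// pairwise distance, and paraMaxNumQuery = 30 means at most 30 / 150 = 20% of the
// labels may be purchased.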
/**
**********************************
* Cluster-based active learning on the given block.
*
* @param paraBlock
* The given block. This block must be sorted according to the
* priority in descending order.
**********************************
*/
public void clusterBasedActiveLearning(int[] paraBlock) {
System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
// Step 1. How many labels may be queried within this block.
int tempExpectedQueries = (int) Math.sqrt(paraBlock.length);
int tempNumQuery = 0;
for (int i = 0; i < paraBlock.length; i++) {// Count the instances that have already been queried.
if (instanceStatusArray[paraBlock[i]] == 1) {
tempNumQuery++;
} // Of if
} // Of for i
// Step 2. If enough labels have been queried and the block is small, vote for the rest.
if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n"
+ Arrays.toString(paraBlock));
vote(paraBlock);
return;
} // Of if
// Step 3. Query enough labels, taking the highest-priority instances first.
for (int i = 0; i < tempExpectedQueries; i++) {
if (numQuery >= maxNumQuery) {
System.out.println("No more queries are provided, numQuery = " + numQuery + ".");
vote(paraBlock);
return;
} // Of if
if (instanceStatusArray[paraBlock[i]] == 0) {
instanceStatusArray[paraBlock[i]] = 1;
predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
// System.out.println("Query #" + paraBlock[i] + ", numQuery = "
// + numQuery);
numQuery++;
} // Of if
} // Of for i
// Step 4. Is the block pure, i.e., do all queried labels agree?
int tempFirstLabel = predictedLabels[paraBlock[0]];
boolean tempPure = true;
for (int i = 1; i < tempExpectedQueries; i++) {
if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
tempPure = false;
break;
} // Of if
} // Of for i
if (tempPure) {
System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempFirstLabel;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
return;
} // Of if
// Step 5. Split in two and process them independently.
int[][] tempBlocks = clusterInTwo(paraBlock);
for (int i = 0; i < 2; i++) {
// Attention: recursive invoking here.
clusterBasedActiveLearning(tempBlocks[i]);
} // Of for i
}// Of clusterBasedActiveLearning
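// Worked example of the per-block query budget (added for clarity, assuming the
// 150-instance iris dataset): the root block has 150 instances, so
// tempExpectedQueries = (int) Math.sqrt(150) = 12 labels are queried before the
// block is either classified directly or split in two.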
/**
*******************
* Show the statistics information.
*******************
*/
public String toString() {
int[] tempStatusCounts = new int[3];
double tempCorrect = 0;
for (int i = 0; i < dataset.numInstances(); i++) {
tempStatusCounts[instanceStatusArray[i]]++;
if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
String resultString = "(unhandled, queried, classified) = "
+ Arrays.toString(tempStatusCounts);
resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = "
+ (tempCorrect / dataset.numInstances());
return resultString;
}// Of toString
/**
**********************************
* The entrance of the program.
*
* @param args:
* Not used now.
**********************************
*/
public static void main(String[] args) {
long tempStart = System.currentTimeMillis();
System.out.println("Starting ALEC.");
String arffFilename = "D:/data/iris.arff";
// String arffFilename = "D:/data/mushroom.arff";
Alec tempAlec = new Alec(arffFilename);
// The settings for iris
tempAlec.clusterBasedActiveLearning(0.15, 30, 3);
// The settings for mushroom
// tempAlec.clusterBasedActiveLearning(0.1, 800, 3);
System.out.println(tempAlec);
long tempEnd = System.currentTimeMillis();
System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}// Of main
}// Of class Alec