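2. 团块厚度(Clump Thickness):1 - 10
3. 细胞大小的均匀性(Uniformity of Cell Size):1 - 10
4. 细胞形状的均匀性(Uniformity of Cell Shape):1 - 10
5. 边缘粘附力(Marginal Adhesion):1 - 10
6. 单个上皮细胞大小(Single Epithelial Cell Size):1 - 10
7. 裸核(Bare Nuclei):1 - 10
8. 染色质平淡程度(Bland Chromatin):1 - 10
9. 正常核仁(Normal Nucleoli):1 - 10
10. 有丝分裂(Mitoses):1 - 10
11. 类别(Class):2 为良性,4 为恶性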
示例代码首先创建 SparkSession 用于读取数据,然后依次构造 StringIndexer、VectorAssembler 和 IForest 并组装成 Pipeline,最后调用 fit 在数据集上训练模型:
/**
 * Example driver: assembles a Spark ML pipeline (StringIndexer -> VectorAssembler -> IForest),
 * transforms the input dataset with the resulting model, and prints the elapsed time and the
 * area under the ROC curve.
 *
 * NOTE(review): several statements in this excerpt look truncated (flagged inline below);
 * restore them from the original project before compiling.
 */
object IForestExample {
def main(args: Array[String]): Unit = {
// NOTE(review): the session builder chain looks incomplete here (no builder()/getOrCreate()
// call is visible) — confirm against the original.
val spark = SparkSession
.master("local") // test in local mode
.appName("iforest example")
// Wall-clock start, used for the timing report at the end.
val startTime = System.currentTimeMillis()
// Dataset from
// NOTE(review): the source URL is missing from this excerpt; the attribute list in the
// surrounding text looks like the UCI Breast Cancer Wisconsin (Original) dataset — confirm.
// NOTE(review): the next statement is truncated; presumably a CSV read with the
// "inferSchema" option enabled — confirm the reader call and the file path.
val dataset ="inferSchema", "true")
// Index label values: 2 -> 0, 4 -> 1
// NOTE(review): no input/output columns are configured on the indexer in this excerpt —
// presumably it maps the class column to "label"; confirm.
val indexer = new StringIndexer()
val assembler = new VectorAssembler()
// Columns _c1 .. _c9 are combined into the feature vector.
// NOTE(review): no output column is set on the assembler in this excerpt — confirm.
assembler.setInputCols(Array("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9"))
val iForest = new IForest()
// Stage order matters: label indexing, feature assembly, then the isolation forest.
val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))
// NOTE(review): the right-hand side is missing; presumably pipeline.fit(dataset) — confirm.
val model =
val predictions = model.transform(dataset)
// NOTE(review): this statement is truncated; the Row pattern below suggests that
// ("prediction", "label") pairs are extracted from `predictions` and passed to the metrics
// constructor — confirm. In that pattern `label` binds the first selected column and
// `ground` the second.
val binaryMetrics = new BinaryClassificationMetrics("prediction", "label") {
case Row(label: Double, ground: Double) => (label, ground)
val endTime = System.currentTimeMillis()
// Integer division: the elapsed time is reported in whole seconds.
println(s"Training and predicting time: ${(endTime - startTime) / 1000} seconds.")
println(s"The model's auc: ${binaryMetrics.areaUnderROC()}")
预测通过 val predictions = model.transform(dataset) 完成;实际的预测逻辑由 iForest 模型的 transform 方法实现,代码如下:
/**
 * Scores every row of `dataset` and labels the highest-scoring rows as anomalies.
 *
 * Two columns are appended: `$(anomalyScoreCol)` holds the anomaly score
 * `2^(-avgPathLength / normFactor)`, and `$(predictionCol)` holds 1.0 when that score is at
 * least the `1 - $(contamination)` quantile of all scores, otherwise 0.0.
 *
 * @param dataset Input dataset; the `$(featuresCol)` column is read as a feature Vector.
 * @return The input rows with the score and prediction columns appended.
 */
override def transform(dataset: Dataset[_]): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  // Number of rows (instances) in the dataset.
  val numSamples = dataset.count()
  // A maxSamples value above 1 is used as-is; otherwise it is scaled by the row count.
  val possibleMaxSamples =
    if ($(maxSamples) > 1.0) $(maxSamples) else ($(maxSamples) * numSamples)
  // The normalisation factor is identical for every row, so it is computed once here
  // instead of once per row inside the UDF.
  val normFactor = avgLength(possibleMaxSamples)
  val bcastModel = dataset.sparkSession.sparkContext.broadcast(this)
  // calculate anomaly score (the UDF is applied to one row's feature vector at a time)
  val scoreUDF = udf { (features: Vector) =>
    val avgPathLength = bcastModel.value.calAvgPathLength(features)
    Math.pow(2, -avgPathLength / normFactor)
  }
  // append a score column
  val scoreDataset = dataset.withColumn($(anomalyScoreCol), scoreUDF(col($(featuresCol))))
  // get threshold value; a relative error of 0 asks approxQuantile for the exact quantile
  val threshold = scoreDataset.stat.approxQuantile($(anomalyScoreCol),
    Array(1 - $(contamination)), 0)
  // set anomaly instance label 1
  val predictUDF = udf { (anomalyScore: Double) =>
    if (anomalyScore >= threshold(0)) 1.0 else 0.0
  }
  scoreDataset.withColumn($(predictionCol), predictUDF(col($(anomalyScoreCol))))
}
/**
 * Calculate an average path length for a given feature set in a forest.
 * @param features A Vector stores feature values.
 * @return Average path length.
 */
private def calAvgPathLength(features: Vector): Double = {
  // Walk every tree in the forest (each element of `trees` is passed on as that tree's
  // root node, starting at depth 0) and average the resulting path lengths.
  trees.map(ifNode => calPathLength(features, ifNode, 0)).sum / trees.length
}
/**
 * Calculate a path length for a given feature set in a tree.
 * @param features A Vector stores feature values.
 * @param ifNode Tree's root node.
 * @param currentPathLength Current path length.
 * @return Path length in this tree.
 */
@scala.annotation.tailrec
private def calPathLength(features: Vector,
    ifNode: IFNode,
    currentPathLength: Int): Double = ifNode match {
  // At a leaf, add the expected path length for the number of instances stored in it.
  case leafNode: IFLeafNode => currentPathLength + avgLength(leafNode.numInstance)
  case internalNode: IFInternalNode =>
    val attrIndex = internalNode.featureIndex
    // Values below the split value descend left, all others descend right;
    // each step adds one to the path length.
    if (features(attrIndex) < internalNode.featureValue) {
      calPathLength(features, internalNode.leftChild, currentPathLength + 1)
    } else {
      calPathLength(features, internalNode.rightChild, currentPathLength + 1)
    }
}
/**
 * A function to calculate an expected path length with a specific data size.
 *
 * Returns `2 * H - 2 * (size - 1) / size` with `H = ln(size - 1) + EulerConstant` when
 * `size > 2`, 1.0 when `size == 2`, and 0.0 otherwise.
 *
 * @param size Data size.
 * @return An expected path length.
 */
private def avgLength(size: Double): Double = {
  if (size > 2) {
    // NOTE(review): H looks like the usual logarithmic approximation of a harmonic number;
    // EulerConstant is defined outside this block — confirm its value there.
    val H = Math.log(size - 1) + EulerConstant
    2 * H - 2 * (size - 1) / size
  } else if (size == 2) {
    1.0
  } else {
    0.0
  }
}