二分类模型评价指标-Scala实现

本文主要使用Scala计算二分类模型的评价指标,包括以下内容:

  • precision、recall、F1Score
  • accuracy
  • AUC
  • KS

对上述指标的计算方法进行封装,方便调用。传入参数为模型预测结果的DataFrame。

构造数据

简单的构造数据,得到预测的DataFrame,其包含预测的概率、label和真实的label。

import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Configure a SparkSession builder for the examples below (application name "LR", master "local[*]").
val builder = SparkSession
      .builder()
      .appName("LR")
      .config("spark.executor.heartbeatInterval","60s")
      .config("spark.network.timeout","120s")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.kryoserializer.buffer.max","512m")
      .config("spark.dynamicAllocation.enabled", false)
      .config("spark.sql.inMemoryColumnarStorage.compressed", true)
      .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
      .config("spark.sql.broadcastTimeout", 600)
      .config("spark.sql.autoBroadcastJoinThreshold", -1)
      .config("spark.sql.crossJoin.enabled", true)
      .master("local[*]") 
// Create the session, or reuse one that already exists.
val spark = builder.getOrCreate()
// Restrict Spark's log output to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")
// Bring the session's implicits into scope (used below for toDF and .as[...]).
import spark.implicits._
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@20fd3d0a
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@58767892
import spark.implicits._
// Toy training data: an id, four numeric features (x1..x4) and a binary label.
// Declared as val because the DataFrame is never reassigned.
val dfTrain = Seq(
    (1, 5.1, 3.5, 1.4, 0.2, 0),
    (2, 4.9, 3.0, 1.4, 0.2, 1),
    (3, 4.7, 3.2, 1.3, 0.2, 0),
    (4, 4.6, 3.1, 1.5, 0.2, 1),
    (5, 5.0, 3.6, 1.4, 0.2, 0),
    (56, 5.7, 2.8, 4.5, 1.3,1),
    (57, 5.3, 3.3, 4.7, 1.6,0),
    (58, 4.9, 2.4, 3.3, 1.0,1),
    (59, 6.6, 3.9, 4.6, 1.3,1),
    (60, 5.2, 2.7, 3.9, 1.4,0)
  ).toDF("id","x1","x2", "x3","x4","label")
// The test set is simply the training set again; it only serves to exercise the metric code.
val dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
|  1|5.1|3.5|1.4|0.2|    0|
|  2|4.9|3.0|1.4|0.2|    1|
|  3|4.7|3.2|1.3|0.2|    0|
|  4|4.6|3.1|1.5|0.2|    1|
|  5|5.0|3.6|1.4|0.2|    0|
| 56|5.7|2.8|4.5|1.3|    1|
| 57|5.3|3.3|4.7|1.6|    0|
| 58|4.9|2.4|3.3|1.0|    1|
| 59|6.6|3.9|4.6|1.3|    1|
| 60|5.2|2.7|3.9|1.4|    0|
+---+---+---+---+---+-----+






dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
// Feature transformation: assemble the four feature columns into a single "features" vector column.
val assemble = new VectorAssembler()
      .setInputCols(Array("x1","x2","x3","x4"))
      .setOutputCol("features")
// Model: logistic regression with custom output column names.
val lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01)
            .setLabelCol("label")// true-label column of the training set; "label" is also the default
            .setFeaturesCol("features")// feature column of the training set; default "features"
            .setPredictionCol("preLabel")// predicted-label column in the output; default "prediction"
            .setProbabilityCol("prob")// probability column in the output; default "probability"
// Pipeline: chain the assembler and the model, then fit on the training data.
val pipeline = new Pipeline().setStages(Array(assemble, lr))
val Model = pipeline.fit(dfTrain)
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2bfbc5f2ff24
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_6fd6be208198
pipeline: org.apache.spark.ml.Pipeline = pipeline_5b2f95daec89
Model: org.apache.spark.ml.PipelineModel = pipeline_5b2f95daec89
// Score the test set with the fitted pipeline and display the resulting prediction DataFrame.
val preResult = Model.transform(dfTest)
preResult.show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|                prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
|  1|5.1|3.5|1.4|0.2|    0|[5.1,3.5,1.4,0.2]|[0.51973594549227...|[0.62708601946512...|     0.0|
|  2|4.9|3.0|1.4|0.2|    1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...|     1.0|
|  3|4.7|3.2|1.3|0.2|    0|[4.7,3.2,1.3,0.2]|[0.41832944562126...|[0.60308343184906...|     0.0|
|  4|4.6|3.1|1.5|0.2|    1|[4.6,3.1,1.5,0.2]|[0.24687940631850...|[0.56140826798745...|     0.0|
|  5|5.0|3.6|1.4|0.2|    0|[5.0,3.6,1.4,0.2]|[1.26603211145541...|[0.78006275423495...|     0.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...|     1.0|
| 57|5.3|3.3|4.7|1.6|    0|[5.3,3.3,4.7,1.6]|[2.17258007146063...|[0.89776002662622...|     0.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...|     1.0|
| 59|6.6|3.9|4.6|1.3|    1|[6.6,3.9,4.6,1.3]|[0.12192431314750...|[0.53044337453190...|     0.0|
| 60|5.2|2.7|3.9|1.4|    0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...|     1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+






preResult: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]

可以看到预测结果是一个DataFrame,包含预测概率和预测label(默认列名为probability、prediction,本例中设置为prob、preLabel),以及真实的label。

混淆矩阵

import spark.implicits._
// Column names used by the metric code: predicted label, probability vector and true label.
val preLabel ="preLabel"
val preProb = "prob"
val trueLabel = "label"
// The prediction DataFrame under the name the metric code uses.
val PredictDf = preResult
// Show only the rows whose predicted label is 1.
preResult.filter(s"$preLabel==1").show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|                prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
|  2|4.9|3.0|1.4|0.2|    1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...|     1.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...|     1.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...|     1.0|
| 60|5.2|2.7|3.9|1.4|    0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...|     1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+






import spark.implicits._
preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
// import spark.implicits._
// Column names of the predicted label, the probability vector and the true label.
val preLabel = "preLabel"
val preProb = "prob"
val trueLabel = "label"
val PredictDf = preResult
// --- Confusion-matrix cells, kept as Double so the ratios computed from them need no casts.
// Actual positive, predicted positive.
val TP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 1").count().toDouble
// Actual negative, predicted positive.
val FP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 0").count().toDouble
// Actual negative, predicted negative.
val TN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 0").count().toDouble
// Actual positive, predicted negative.
val FN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 1").count().toDouble
// Hand-built confusion matrix: rows are the true class, columns the predicted class.
println(s"\t Pre Neg Pre Pos \nTrue Neg $TN     $FP\nTrue Pos $FN     $TP\n")
// The same matrix straight from groupBy + pivot on the two label columns.
PredictDf
  .groupBy(trueLabel)
  .pivot(preLabel, 0 to 1)
  .count()
  .na.fill(0.0)
  .orderBy(asc(trueLabel))
  .withColumnRenamed(trueLabel, "True-Predict")
  .show(truncate = true)
	 Pre Neg Pre Pos 
True Neg 4.0     1.0
True Pos 2.0     3.0

+------------+---+---+
|True-Predict|  0|  1|
+------------+---+---+
|           0|  4|  1|
|           1|  2|  3|
+------------+---+---+






preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
TP: Double = 3.0
FP: Double = 1.0
TN: Double = 4.0
FN: Double = 2.0

accuracy & precision & recall & F1

混淆矩阵都有了,precision、recall、f1Score都很好计算

// Positive class
println(s"正样本precision: ${TP / (TP + FP)}")
println(s"正样本recall: ${TP / (TP + FN)}")
println(s"正样本F1Score: ${2 * TP / (2 * TP + FP + FN)}")
// Negative class
println(s"负样本precision: ${TN / (TN + FN)}")
println(s"负样本recall: ${TN / (FP + TN)}")
println(s"负样本F1Score: ${2 * TN / (2 * TN + FP + FN)}")
// Overall accuracy, then the same per-class metrics again with English labels.
println(s"Accuaracy: ${(TP + TN) / (TP + TN + FP + FN)}")
// Negative class
println(s"Neg precision: ${TN / (TN + FN)}")
println(s"Neg recall: ${TN / (FP + TN)}")
println(s"Neg F1Score: ${2 * TN / (2 * TN + FP + FN)}")
// Positive class
println(s"Pos precision: ${TP / (TP + FP)}")
println(s"Pos recall: ${TP / (TP + FN)}")
println(s"Pos F1Score: ${2 * TP / (2 * TP + FP + FN)}")

Accuaracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666

也可以直接调包计算

import org.apache.spark.mllib.evaluation.MulticlassMetrics
// MulticlassMetrics takes an RDD of (prediction, label) pairs.
val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println(s"Accuaracy: ${multiclassMetrics.accuracy}")
// Per-class precision, recall and F1 for every label found in the data.
val labels = multiclassMetrics.labels
for (classLabel <- labels) {
  println(s"Precision($classLabel) = ${multiclassMetrics.precision(classLabel)}")
  println(s"Recall($classLabel) = ${multiclassMetrics.recall(classLabel)}")
  println(s"F1Score($classLabel) = ${multiclassMetrics.fMeasure(classLabel)}")
}
Accuaracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665





import org.apache.spark.mllib.evaluation.MulticlassMetrics
predictionRDD: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1160] at rdd at :89
multiclassMetrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@28b99672
labels: Array[Double] = Array(0.0, 1.0)

AUC和PRC

Spark MLlib中提供了计算AUC和PRC的方法,但是需要传入RDD格式的数据。另外需要对概率字段进行切分,因为原始字段的类型是Vector。先看下数据切分的代码:

// Split the probability vector into one column per class and cast the true label to Double.
val aucDf = PredictDf.select(preProb, trueLabel).map { row =>
        // Convert the vector once and reuse the array for all three output fields.
        val probs = row(0).asInstanceOf[Vector].toArray
        (probs, probs(0), probs(1), row(1).toString.toDouble)
      }.toDF("probVector","prob0","prob1",trueLabel)
aucDf.show()
+--------------------+-------------------+-------------------+-----+
|          probVector|              prob0|              prob1|label|
+--------------------+-------------------+-------------------+-----+
|[0.62708601946512...| 0.6270860194651235| 0.3729139805348764|  0.0|
|[0.25162713725554...| 0.2516271372555436| 0.7483728627444565|  1.0|
|[0.60308343184906...| 0.6030834318490638| 0.3969165681509361|  0.0|
|[0.56140826798745...|  0.561408267987451| 0.4385917320125489|  1.0|
|[0.78006275423495...|  0.780062754234951|0.21993724576504908|  0.0|
|[0.11822151224039...|0.11822151224039172| 0.8817784877596083|  1.0|
|[0.89776002662622...| 0.8977600266262256|0.10223997337377445|  0.0|
|[0.12412868907566...|0.12412868907566986| 0.8758713109243301|  1.0|
|[0.53044337453190...| 0.5304433745319013|0.46955662546809873|  1.0|
|[0.35865652594949...| 0.3586565259494905| 0.6413434740505095|  0.0|
+--------------------+-------------------+-------------------+-----+






aucDf: org.apache.spark.sql.DataFrame = [probVector: array, prob0: double ... 2 more fields]
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// BinaryClassificationMetrics takes an RDD of (score, label) pairs; the class-1 probability is the score.
val predictionRDD1 = aucDf.select("prob1", trueLabel).as[(Double, Double)].rdd
val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
// Area under the ROC curve and area under the precision-recall curve.
println(s"AUC: ${BinaryMetrics.areaUnderROC}")
println(s"PRC: ${BinaryMetrics.areaUnderPR}")
AUC: 0.92
PRC: 0.9183333333333333





import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
predictionRDD1: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1127] at rdd at :86
BinaryMetrics: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@12535044

KS

简单介绍KS计算的逻辑:

  • 阈值以0.1为单位,从0递增至1
  • 根据不同的阈值分别计算TPR、FPR
  • KS = max(TPR - FPR)
// The candidate thresholds as an array; stepping a Double range by 0.1 does not yield exact decimals.
(0.0 to 1.0 by 0.1).toArray
res102: Array[Double] = Array(0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1.0)
    // Count of rows with true label 1 whose class-1 probability is at least 0.1 (TP at threshold 0.1).
    aucDf.filter(s"prob1 >= 0.1 and $trueLabel == 1").count().toDouble
res104: Double = 5.0
import scala.collection.mutable.ArrayBuffer
// KS statistic: the largest TPR - FPR gap over the thresholds 0.0, 0.1, ..., 1.0.
val Tpr_Fpr = ArrayBuffer[Double]()
var tp=0.0
var fp=0.0
var tn=0.0
var fn=0.0
// Derive each threshold from an integer step. A Double range (0.0 to 1.0 by 0.1) accumulates
// rounding error (0.30000000000000004, ...), is deprecated in Scala 2.12 and removed in 2.13.
for(step <- 0 to 10){
  val threshold = step / 10.0
  // Predicted positive, actually positive.
  tp =  aucDf.filter(s"prob1 >= $threshold and $trueLabel == 1").count().toDouble
  // Predicted positive, actually negative.
  fp =  aucDf.filter(s"prob1 >= $threshold and $trueLabel == 0").count().toDouble
  // Predicted negative, actually negative.
  tn =  aucDf.filter(s"prob1 < $threshold and $trueLabel == 0").count().toDouble
  // Predicted negative, actually positive.
  fn =  aucDf.filter(s"prob1 < $threshold and $trueLabel == 1").count().toDouble
  // TPR - FPR at this threshold.
  Tpr_Fpr.append(tp/(tp+fn)-fp/(fp+tn))
}
println("KS Value: "+Tpr_Fpr.max)
KS Value: 0.8





import scala.collection.mutable.ArrayBuffer
Tpr_Fpr: scala.collection.mutable.ArrayBuffer[Double] = ArrayBuffer(0.0, 0.0, 0.19999999999999996, 0.4, 0.8, 0.39999999999999997, 0.39999999999999997, 0.6, 0.4, 0.0, 0.0)
tp: Double = 0.0
fp: Double = 0.0
tn: Double = 5.0
fn: Double = 5.0

封装

把上面所有的指标封装成一个方法,方便调用。指标不是很多,调用该方法会直接print出所有的指标。

 /**
   * Prints the evaluation metrics of a binary classifier computed from its prediction DataFrame.
   *
   * Output, in order: the confusion matrix (hand-built and via groupBy/pivot), accuracy and per-class
   * precision/recall/F1 (from MulticlassMetrics and again from the raw counts), the areas under the
   * ROC and PR curves, and the KS statistic max(TPR - FPR) over the thresholds 0.0, 0.1, ..., 1.0.
   *
   * Labels are matched against the literals 0 and 1. Every count is a separate Spark action on
   * PredictDf, so cache it beforehand if it is expensive to recompute.
   *
   * @param spark     session whose implicits supply the encoders and `toDF` used here
   * @param PredictDf raw prediction DataFrame; the probability vector does not need to be split beforehand
   * @param probName  name of the probability vector column; its element at index 1 is used as the
   *                  positive-class score
   * @param preLabel  name of the predicted-label column
   * @param trueLabel name of the true-label column
   */
  def BinaryClassificationModelMectrics3(spark: SparkSession, PredictDf: DataFrame,probName:String="probability"
                                         , preLabel: String = "prediction", trueLabel: String = "label"): Unit = {
    import spark.implicits._

    // Number of rows of `df` matching a SQL condition, as a Double so ratios need no casts.
    def countWhere(df: DataFrame, condition: String): Double = df.filter(condition).count().toDouble

    println("--------------------------------------- Confusion Matrix ------------------------------------------------")
    // --- Confusion-matrix cells
    // Actual positive, predicted positive.
    val TP = countWhere(PredictDf, s"$preLabel == 1 and $trueLabel == 1")
    // Actual negative, predicted positive.
    val FP = countWhere(PredictDf, s"$preLabel == 1 and $trueLabel == 0")
    // Actual negative, predicted negative.
    val TN = countWhere(PredictDf, s"$preLabel == 0 and $trueLabel == 0")
    // Actual positive, predicted negative.
    val FN = countWhere(PredictDf, s"$preLabel == 0 and $trueLabel == 1")
    // Hand-built confusion matrix: rows are the true class, columns the predicted class.
    println("\t " + "Pre Neg " + "Pre Pos " + "\n" +
      "True Neg " + TN + "     " + FP + "\n" +
      "True Pos " + FN + "     " + TP + "\n")
    // The same matrix straight from groupBy + pivot on the two label columns.
    PredictDf.groupBy(trueLabel)
      .pivot(preLabel, 0 to 1).count().na.fill(0.0)
      .orderBy(asc(trueLabel))
      .withColumnRenamed(trueLabel, "True-Predict")
      .show(truncate = true)

    //------ accuracy, recall, precision, F1 score
    println("---------------------------------- Accuarcy&Precision&Recall&F1Score ------------------------------------")
    println("---------------------- Use Package")
    // MulticlassMetrics takes (prediction, label) pairs.
    val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
    val multiclassMetrics = new MulticlassMetrics(predictionRDD)
    println("Accuaracy: " + multiclassMetrics.accuracy)
    val labels = multiclassMetrics.labels
    labels.foreach { l =>
      println(s"Precision($l) = " + multiclassMetrics.precision(l))
      println(s"Recall($l) = " + multiclassMetrics.recall(l))
      println(s"F1Score($l) = " + multiclassMetrics.fMeasure(l))
    }
    println("---------------------- Not Use Package")
    println("Accuaracy: " + (TP + TN) / (TP + TN + FP + FN))
    // Negative class
    println("Neg precision: " + TN / (TN + FN))
    println("Neg recall: " + TN / (FP + TN))
    println("Neg F1Score: " + 2 * TN / (2 * TN + FP + FN))
    // Positive class
    println("Pos precision: " + TP / (TP + FP))
    println("Pos recall: " + TP / (TP + FN))
    println("Pos F1Score: " + 2 * TP / (2 * TP + FP + FN))

    println("-----------------------------*----------- Auc&Prc&Ks ----------------------------------------------------")
    //---- AUC and PR: split the probability vector (converted once per row) into per-class columns.
    val aucDf = PredictDf.select(probName, trueLabel).map { row =>
      val probs = row(0).asInstanceOf[Vector].toArray
      (probs, probs(0), probs(1), row(1).toString.toDouble)
    }.toDF("probVector","prob0","prob1",trueLabel)
    // BinaryClassificationMetrics takes (score, label) pairs; the class-1 probability is the score.
    val predictionRDD1 = aucDf.select("prob1", trueLabel).as[(Double, Double)].rdd
    val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
    println("AUC: " + BinaryMetrics.areaUnderROC)
    println("PRC: " + BinaryMetrics.areaUnderPR)

    // --- KS
    // tp + fn and fp + tn are the class totals and do not depend on the threshold, so count them once.
    val positives = countWhere(aucDf, s"$trueLabel == 1")
    val negatives = countWhere(aucDf, s"$trueLabel == 0")
    // Derive each threshold from an integer step. A Double range (0.0 to 1.0 by 0.1) accumulates
    // rounding error (0.30000000000000004, ...), is deprecated in Scala 2.12 and removed in 2.13.
    val tprMinusFpr = (0 to 10).map { step =>
      val threshold = step / 10.0
      // Predicted positive at this threshold, split by the actual class.
      val tp = countWhere(aucDf, s"prob1 >= $threshold and $trueLabel == 1")
      val fp = countWhere(aucDf, s"prob1 >= $threshold and $trueLabel == 0")
      tp / positives - fp / negatives
    }
    println("KS Value: " + tprMinusFpr.max)
  }
BinaryClassificationModelMectrics3: (spark: org.apache.spark.sql.SparkSession, PredictDf: org.apache.spark.sql.DataFrame, probName: String, preLabel: String, trueLabel: String)Unit
// Print every metric for the predictions, passing the custom probability and predicted-label column names.
BinaryClassificationModelMectrics3(spark=spark, PredictDf=PredictDf,probName="prob"
                                         , preLabel = "preLabel", trueLabel= "label")

--------------------------------------- Confusion Matrix ------------------------------------------------
	 Pre Neg Pre Pos 
True Neg 4.0     1.0
True Pos 2.0     3.0

+------------+---+---+
|True-Predict|  0|  1|
+------------+---+---+
|           0|  4|  1|
|           1|  2|  3|
+------------+---+---+

---------------------------------- Accuarcy&Precision&Recall&F1Score ------------------------------------
---------------------- Use Package
Accuaracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665
---------------------- Not Use Package
Accuaracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666
-----------------------------*----------- Auc&Prc&Ks ----------------------------------------------------
AUC: 0.92
PRC: 0.9183333333333332
KS Value: 0.8

                                2020-03-25 于南京市江宁区九龙湖

你可能感兴趣的:(★★★机器学习,#,★★模型选择和评估)