SCALA下的GBDT与LR融合实现

我们直接使用 Spark MLlib(org.apache.spark.mllib)的包来实现 GBDT 与 LR 的融合。
首先我们需要导入的包如下所示:

import org.apache.spark.sql. Row
import scala.collection.mutable
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType, Strategy}
import org.apache.spark.mllib.tree.model.Node
import com.suning.aps.util.handle_data.deleteHDFS
import com.suning.aps.utils.StringUtil.getSign
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.types._

其次,由于 MLlib 的训练数据需要 LabeledPoint 格式,而我们从 Hive 中提取的数据是 DataFrame 格式,所以需要先做数据转换:

   // Job-level constants: output root on HDFS and the statistics dates for this run.
    val instance = "yuntu/cpc_ctr/hotmai_02&20"
    val statis_day = "20191207"
    val statis_day_before = "20191205"
    val parallels = 50

    // Hive-enabled Spark session shared by every step below.
    val spark = SparkSession
      .builder()
      .appName("wap_cpc_searchApp_v")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // NOTE(review): presumably clears any previous output under this path before the
    // job runs - confirm against the deleteHDFS helper.
    deleteHDFS(instance + "/wap_cpc_searchApp_v/" + statis_day, spark.sparkContext)

    // Query the raw samples, keep the columns used by the model, and turn every row
    // into a Row of Doubles: column 0 (is_click) is parsed as the label, the other
    // columns are passed through getSign and then parsed as Double.
    // NOTE(review): the SQL text is cut off in this listing (remaining columns and the
    // FROM clause are missing) - restore it before running.
    // NOTE(review): column 2 ("terminal") is selected but never copied into the Row -
    // confirm this is intentional.
    val pv_click_searchApp1 = spark.sql(
      s"""select
         |case when length(ta.query)=0 then '-' else ta.query end as query,
         |case when length(ta.ideaid)=0 then '00000000000000000-'
         |     when ta.ideaid is NULL then '00000000000000000-'
         |     else lpad(ta.ideaid,18,'0') end as ideaid,
         |case when length(ta.terminal)=0 then '-' else ta.terminal end as terminal,

       """.stripMargin)
      .select(
        "is_click", "query", "terminal", "query_brand_name", "query_third_categ", "userid",
        "ideaid", "idea_first_categ", "idea_second_categ",
        "idea_third_categ", "idea_brand_name")
      .rdd
      .map { record =>
        // NOTE(review): getSign presumably returns a numeric hash code of the string -
        // confirm against StringUtil.
        def signed(idx: Int): Double = getSign(record(idx).toString).toString.toDouble

        Row(
          record(0).toString.toDouble,
          signed(1), signed(3), signed(4),
          signed(5), signed(6), signed(7),
          signed(8), signed(9), signed(10))
      }
// The map above produced an RDD[Row] (many of the original columns are discrete
// variables that had to be hash-encoded), so rebuild a DataFrame with an explicit schema.
    // Label column "y" followed by nine feature columns "x1".."x9", all non-nullable doubles.
    // An immutable Seq is used because StructType.apply takes a Seq[StructField]; the
    // previous mutable.ArraySeq does not satisfy that on Scala 2.13.
    val ScoreSchema = StructType(
      StructField("y", DoubleType, nullable = false) +:
        (1 to 9).map(i => StructField("x" + i, DoubleType, nullable = false))
    )
    val df = spark.createDataFrame(pv_click_searchApp1, ScoreSchema)
    // Columns that must not be used as features.
    val ignored = List("y")
    // Positions of the remaining (feature) columns within the DataFrame.
    val featInd = df.columns.diff(ignored).map(name => df.columns.indexOf(name))

    // Position of the label column.
    val targetInd = df.columns.indexOf("y")

    // One LabeledPoint per row: label read from targetInd, dense features from featInd.
    val ds = df.rdd.map { row =>
      val label = row.getDouble(targetInd)
      val featureValues = featInd.map(idx => row.getDouble(idx))
      LabeledPoint(label, Vectors.dense(featureValues))
    }


上文的 ds 即为转换后的数据,下文将介绍 GBDT 的训练。

// Train the GBDT
    // Number of boosting iterations, i.e. number of trees in the ensemble.
    val numTrees = 2

    // Train a GradientBoostedTrees model for binary classification.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    // Set the iteration count once, from numTrees. The earlier hard-coded value of 10
    // was immediately overwritten and only obscured the fact that the code that walks
    // model.trees assumes exactly numTrees trees.
    boostingStrategy.numIterations = numTrees
    boostingStrategy.treeStrategy.numClasses = 2
    boostingStrategy.treeStrategy.maxDepth = 3
    boostingStrategy.learningRate = 0.3
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(ds, boostingStrategy)

训练完GBDT后,我们需要从GBDT中探查树模型的结构,如下

    // treeLeafArray(i) holds the leaf node ids of the i-th tree, in the order returned
    // by getLeafNodes; the position of an id in this array is its one-hot slot later on.
    val treeLeafArray: Array[Array[Int]] =
      Array.tabulate(numTrees)(i => getLeafNodes(model.trees(i).topNode))

    for (i <- 0.until(numTrees)) {
      // Format the message explicitly: println(fmt, i) would print a tuple, not the text.
      println("正在打印第%d 棵树的 topnode 叶子节点".format(i))
      // Print the actual leaf ids rather than their positions in the array.
      treeLeafArray(i).foreach(println)
    }

其中,我们定义了两个函数,如下所示:

  // Returns the ids of all leaves in the subtree rooted at `node`,
  // with the left subtree's leaves listed before the right subtree's.
  def getLeafNodes(node: Node): Array[Int] =
    if (node.isLeaf) Array(node.id)
    else getLeafNodes(node.leftNode.get) ++ getLeafNodes(node.rightNode.get)

  // Walks the tree from `node` down to a leaf for the given feature vector and
  // returns the id of the leaf that is reached (instead of the leaf's prediction).
  def predictModify(node: Node, features: DenseVector): Int = {
    if (node.isLeaf) {
      node.id
    } else {
      val currentSplit = node.split.get
      val featureValue = features(currentSplit.feature)
      // Continuous split: go left when value <= threshold.
      // Categorical split: go left when the value is one of the split's categories.
      val goLeft =
        if (currentSplit.featureType == FeatureType.Continuous) {
          featureValue <= currentSplit.threshold
        } else {
          currentSplit.categories.contains(featureValue)
        }
      val child = if (goLeft) node.leftNode.get else node.rightNode.get
      predictModify(child, features)
    }
  }

下文将利用上文训练的GBDT构造新特征并用LR训练

    // Build the new GBDT-derived features: for every sample, each tree contributes a
    // one-hot vector marking the leaf the sample falls into; the per-tree vectors are
    // concatenated in tree order.
    val newFeatureDataSet = df.rdd
      .map { row =>
        // Column 0 is the label, columns 1..9 are the raw features.
        val label = row(0).toString.toDouble
        val rawFeatures = (1 to 9).map(col => row(col).toString.toDouble).toArray
        (label, new DenseVector(rawFeatures))
      }
      .map { case (label, rawVector) =>
        val leafIndicators = (0 until numTrees).flatMap { t =>
          val tree = model.trees(t)
          val reachedLeaf = predictModify(tree.topNode, rawVector)
          // gbdt tree is binary tree, so the vector length is (numNodes + 1) / 2
          val oneHot = new Array[Double]((tree.numNodes + 1) / 2)
          oneHot(treeLeafArray(t).indexOf(reachedLeaf)) = 1
          oneHot
        }.toArray
        (label, leafIndicators)
      }


    newFeatureDataSet.take(2).foreach(println)



        // Wrap every (label, leaf-indicator array) pair as a LabeledPoint for the LR stage.
        val newData = newFeatureDataSet.map { case (label, leafIndicators) =>
          LabeledPoint(label, new DenseVector(leafIndicators))
        }

        newData.take(2).foreach(println)
    // Random 80/20 split into LR training and test sets (no seed is supplied).
    val splits2 = newData.randomSplit(Array(0.8, 0.2))
    val Array(train2, test2) = splits2

        // Baseline: accuracy of the GBDT model alone, measured on ds (the same data it
        // was trained on).
        val predictions = ds.map(lp => model.predict(lp.features))
        predictions.take(10).foreach(println)
        // Produce (prediction, label) pairs in a single pass over ds instead of zipping
        // two separately mapped RDDs, which relies on both sides staying partition-aligned.
        val predictionAndLabel = ds.map(lp => (model.predict(lp.features), lp.label))
        val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count / ds.count
        println("GBTR accuracy " + accuracy)

    // Fit a two-class logistic regression (L-BFGS) on the leaf-indicator features and
    // set its decision threshold to 0.01.
    val model1 = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(train2)
      .setThreshold(0.01)
    // Bare expression: only shows the learned weights when run interactively.
    model1.weights
    // Score the held-out split as (prediction, label) pairs.
    val predictionAndLabels = test2.map(lp => (model1.predict(lp.features), lp.label))
    val metrics = new MulticlassMetrics(predictionAndLabels)
    // The value printed under the "Precision" label is MulticlassMetrics.accuracy.
    val precision = metrics.accuracy
    println("Precision = " + precision)

你可能感兴趣的:(spark,机器学习)