Combining Random Forest, Logistic Regression, Naive Bayes and Other Algorithms

Random Forest + Logistic Regression + Naive Bayes

1. GBDT has a natural advantage here: it discovers many discriminative features and feature combinations. We can use it to automatically find effective features and feature crosses and feed them into an LR model as inputs, improving the accuracy of CTR prediction (Click-Through Rate Prediction). A minimal sketch of this idea appears right after this list.

2. The main purpose of this program is to improve feature quality by screening out the effective features; a secondary purpose is to practice combining algorithms and avoid the limitations of any single one. Feel free to follow the same pattern and modify or combine whatever algorithms you want in order to push accuracy higher.

3. Normalizer() is another common way to standardize features: it rescales each sample's feature vector to unit p-norm (so despite the Chinese term "正则化", it normalizes rows rather than regularizing the model). A small standalone example also follows this list, and the full program below applies it with setP(1.0).

4. There are two angles from which to improve a model's accuracy:

      a. the choice, combination, and tuning of the algorithms themselves;

      b. data processing, feature engineering, and the quality of the data itself.
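
Before the full program, here is a minimal sketch of the idea in point 1. It is a hedged illustration, not the author's exact code: the DataFrame `train` and its "features"/"label" columns are assumptions. It trains a GBT model, keeps only the features with non-zero featureImportances, and hands the reduced vector to logistic regression. The classic GBDT+LR scheme for CTR goes one step further and one-hot encodes each tree's leaf index as new features; this sketch (and the program below, which swaps in a random forest) only reuses the trees for feature selection.

import org.apache.spark.ml.classification.{GBTClassifier, LogisticRegression}
import org.apache.spark.ml.feature.VectorSlicer

// Assumes a DataFrame `train` with a vector column "features" and a numeric "label".
val gbt = new GBTClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxIter(20)
val gbtModel = gbt.fit(train)

// Keep only the dimensions the trees actually split on (non-zero importance).
val keptIdx = gbtModel.featureImportances.toArray.zipWithIndex
  .collect { case (imp, i) if imp != 0.0 => i }

val slicer = new VectorSlicer()
  .setInputCol("features")
  .setOutputCol("selectedFeatures")
  .setIndices(keptIdx)

// Second stage: logistic regression trained on the selected features only.
val lr = new LogisticRegression()
  .setLabelCol("label")
  .setFeaturesCol("selectedFeatures")
  .setMaxIter(100)
val lrModel = lr.fit(slicer.transform(train))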
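
And a small standalone example for point 3, on made-up toy data, to show what Normalizer actually does: it rescales each row's feature vector to unit p-norm, so with p = 1 the absolute values in each row sum to 1, and with p = 2 each row has Euclidean length 1. A SparkSession called `spark` is assumed to be in scope.

import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors

val toy = spark.createDataFrame(Seq(
  (0, Vectors.dense(1.0, 2.0, 7.0)),
  (1, Vectors.dense(4.0, 0.0, 4.0))
)).toDF("id", "features")

// p = 1: each row is divided by the sum of its absolute values, e.g. [1,2,7] -> [0.1, 0.2, 0.7]
new Normalizer().setInputCol("features").setOutputCol("l1Norm").setP(1.0)
  .transform(toy).show(false)

// p = 2: each row is divided by its Euclidean length, e.g. [4,0,4] -> [0.707..., 0.0, 0.707...]
new Normalizer().setInputCol("features").setOutputCol("l2Norm").setP(2.0)
  .transform(toy).show(false)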

 

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification._
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object GBDT_test01 {
  def main(args: Array[String]) {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    val spark = SparkSession
      .builder
      .appName("logistic_regression06")
      .master("local[4]")
      .getOrCreate()
    import spark.implicits._
    // Input path from the command-line arguments (see the usage note after the code).
    val inputdata01 = args(0)
    // val outputdata = args(1)

    // Convert the raw text input into a DataFrame
    val input_reco_theme = spark.read.textFile(inputdata01)
    val reco_theme = input_reco_theme.rdd.map(x => {
      // XXX: parse each input line into the feature fields and the label (elided in the original)
      x
    }).toDF()
    reco_theme.show()

    reco_theme.createOrReplaceTempView("input_data")
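    // Down-sample the input: row_number() over (partition by label ...) caps each label
    // value at 10,000 rows before training.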
    val result_valuse = spark.sql(
      """
        |SELECT -- the N feature columns and the label (elided)
        |from (
        |    SELECT *,
        |        row_number() over (partition BY table1.label order by table1.rst_0) as rank
        |    from ( -- inner subquery over the raw data, elided here as it is not essential
        |    ) table1
        |) table2
        |WHERE table2.rank <= 10000
      """.stripMargin)
    val ratings2 = result_valuse.toDF()
    ratings2.show()

    // StringIndexer maps each string column to a numeric index
    val converter_pipeline = new Pipeline().setStages(Array(
      new StringIndexer()
        .setInputCol("xaid").setOutputCol("xaidIndex"),
      new StringIndexer()
        .setInputCol("brand").setOutputCol("brandIndex"),
      new StringIndexer()
        .setInputCol("model").setOutputCol("modelIndex")
    ))
    val cluster_info_split_table = converter_pipeline.fit(ratings2).transform(ratings2)
    cluster_info_split_table.show()

    val df1 = cluster_info_split_table.select( /* the N feature columns plus "xaidIndex" and "label" (elided) */ )
    df1.show()

    // Convert the DataFrame into the features / label format expected by the models
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(Array( /* the N feature column names (elided) */ ))
      .setOutputCol("features")
    val df2 = assembler.transform(df1).select("xaidIndex","label","features")
    df2.limit(10).rdd.foreach(println)
    // Row-wise normalization of the feature vector (L1 norm here)
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0)
    val l1NormData = normalizer.transform(df2)
    l1NormData.show()

     //Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = l1NormData.randomSplit(Array(0.7, 0.3))
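    // A fixed seed, e.g. randomSplit(Array(0.7, 0.3), seed = 42), would make the split reproducible.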

    // Train a random forest model (the first stage of the hybrid).
    val rf = new RandomForestClassifier()
      .setLabelCol("label")
      .setFeaturesCol("normFeatures")
      .setNumTrees(9)

    // Fit the forest on the training split and score the held-out test split.
    val model = rf.fit(trainingData)
    val predictions = model.transform(testData)
    predictions.show()

    // Select (prediction, true label) and compute test error.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println(s"Test Error = ${1.0 - accuracy}")
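    // Feature selection via the forest: keep only the dimensions with non-zero
    // featureImportances and slice the feature vector down to them before handing
    // it to the downstream logistic regression and Naive Bayes models.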
    val importances = model.featureImportances.toArray
    val selectedIndices = importances.indices.filter(importances(_) != 0.0).toArray
    // The Normalizer preserves dimensionality, so these indices also apply to "features".
    val slicer = new VectorSlicer().setInputCol("features").setOutputCol("features_select")
      .setIndices(selectedIndices)
    val trainingData_new = slicer.transform(trainingData)
    trainingData_new.printSchema()
    trainingData_new.show()

    val test_new = slicer.transform(testData)

    // Second model: logistic regression on the selected features.
    // regParam = 0 with elasticNetParam = 1 means no regularization is actually applied here.
    val lambda = 0.0
    val gbgt_lr = new LogisticRegression()
      .setFeaturesCol("features_select")
      .setLabelCol("label")
      .setMaxIter(100)
      .setStandardization(false)
      .setRegParam(lambda)
      .setElasticNetParam(1.0)
    val modellr_gbgt = gbgt_lr.fit(trainingData_new)

    val trainPredictions = modellr_gbgt.transform(trainingData_new)
    trainPredictions.show()

    val trainingSummary = modellr_gbgt.summary
    val objectiveHistory = trainingSummary.objectiveHistory
    println("objectiveHistory:")
    objectiveHistory.foreach(println)

    val accuracy2 = trainingSummary.accuracy // accuracy on the training set
    println(s"LR training accuracy: $accuracy2")


    // Third model: Naive Bayes on the same selected features. Spark's NaiveBayes
    // (multinomial by default) requires non-negative feature values, which the
    // index-based features used here are assumed to satisfy.
    val model2 = new NaiveBayes()
      .setFeaturesCol("features_select")
      .setLabelCol("label")
      .setSmoothing(0.01)
      .fit(trainingData_new)
    // Score the held-out test set with the Naive Bayes model.
    val predictions2 = model2.transform(test_new)
    predictions2.show()

    // Select (prediction, true label) and compute test error
    val evaluator2 = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy3 = evaluator2.evaluate(predictions2)
    println(s"Naive Bayes test set accuracy = $accuracy3")

  }
}
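
To run this on a cluster rather than in local mode, drop the hard-coded .master("local[4]") from the builder, keep the two args(...) lines, and submit the class with spark-submit. The jar name and paths below are only placeholders:

spark-submit --class GBDT_test01 --master yarn your-assembly.jar /path/to/input /path/to/output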
