【Spark】MLlib mark a demo(四)

标记一个DEMO以便于后面复习

DEMO背景:阿里天池竞赛系列——口碑商家客流量预测

第四步:使用随机森林回归算法

package com.huadian.bigdata.ijcai
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.{DecisionTreeModel, RandomForestModel}
import org.apache.spark.mllib.tree.{DecisionTree, RandomForest}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
/**
  *对特征值进行标准化处理
  *
  */
object IJCAISparkRFPrecisionV2 {
  /**
    * Loan pattern: builds the SparkSession, hands it to the user function,
    * and guarantees the session is stopped even when the operation throws.
    *
    * @param args      command-line arguments: args(0) = appName, args(1) = master URL
    * @param operation user function that receives the live SparkSession
    */
  def sparkOperation(args: Array[String])(operation: SparkSession => Unit): Unit = {
    if (args.length != 2) {
      // FIX: the old message named the wrong program and listed no arguments
      println("Usage: IJCAISparkRFPrecisionV2 <appName> <master>")
      System.exit(1)
    }
    val spark = SparkSession
      .builder()
      .appName(args(0))
      .master(args(1))
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    try {
      // Invoke the user function
      operation(spark)
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      spark.stop()
    }
  }

  /**
    * User function for the loan pattern. For this ML task: data
    * preprocessing, model training, model evaluation and model testing.
    *
    * Trains a decision-tree regressor and a random-forest regressor on the
    * shop visit/pay data and prints MSE / RMSE / MAE for each.
    *
    * @param spark active SparkSession
    */
  def modelTrain(spark: SparkSession): Unit = {
    // Read the raw CSV data (columns arrive as strings)
    val userVisitPayRDD: RDD[Row] = spark.read
      .option("header", "true")
      .csv("file:///F:\\IJCAI\\train_user_visit_pay")
      .select("day_week", "shop_id", "count_visit", "count_pay")
      .rdd

    // Standardize the count_visit column: fit mean and std-dev over the column
    val countVisitRDD = userVisitPayRDD
      .map(row => Vectors.dense(row.getString(2).toDouble))
    val standardModel: StandardScalerModel = new StandardScaler(true, true).fit(countVisitRDD)

    // Convert each row into a LabeledPoint (label = count_pay)
    val lpRDD: RDD[LabeledPoint] = userVisitPayRDD.map {
      case Row(dayWeek: String, shopId: String, countVisit: String, countPay: String) =>
        val label: Double = countPay.toDouble

        // Scale the visit count with the fitted standardizer
        val countVisitScaled: Vector = standardModel.transform(Vectors.dense(countVisit.toDouble))

        // Feature vector — add or remove features here as the experiment evolves
        val features: Vector = Vectors.dense(
          Array(dayWeek.toDouble, shopId.toDouble, countVisitScaled.toArray(0))
        )
        LabeledPoint(label, features)
    }

    // Quick-and-dirty split: 80% training, 20% testing (fixed seed for repeatability)
    val Array(trainingRDD, testingRDD) = lpRDD.randomSplit(Array(0.8, 0.2), 123L)

    /**
      * Decision-tree regression.
      * categoricalFeaturesInfo: an entry (n -> k) means feature n (0-based)
      * is categorical with k categories — here feature 0 is day-of-week (7).
      * impurity must be "variance" for regression.
      */
    val dtModel: DecisionTreeModel = DecisionTree.trainRegressor(
      trainingRDD,
      Map[Int, Int](0 -> 7),
      "variance",
      16, // maxDepth
      32  // maxBins (must be >= the largest categorical arity, 7)
    )

    // Predict on the held-out test set: (actual, predicted) pairs
    val actualAndPredictRDD: RDD[(Double, Double)] = testingRDD.map {
      case LabeledPoint(label, features) =>
        (label, dtModel.predict(features))
    }

    actualAndPredictRDD.take(20).foreach(println)

    /**
      * Prints MSE, RMSE and MAE for a set of (actual, predicted) pairs.
      *
      * @param apRDD pairs of (actual value, predicted value)
      */
    def modelEvaluate(apRDD: RDD[(Double, Double)]): Unit = {
      val count = apRDD.count().toDouble
      // FIX: guard against an empty split — the old code divided by zero (NaN metrics)
      if (count == 0) {
        println("modelEvaluate: empty RDD, nothing to evaluate")
        return
      }
      // MSE: mean squared error
      val mseValue = apRDD
        .map { case (actual, predict) => Math.pow(actual - predict, 2) }
        .sum() / count
      // RMSE: root mean squared error
      val rmseValue = Math.sqrt(mseValue)
      // MAE: mean absolute error
      val maeValue = apRDD
        .map { case (actual, predict) => Math.abs(actual - predict) }
        .sum() / count

      println(s"MSE:$mseValue,RMSE:${rmseValue},MAE:${maeValue}")
    }

    // Evaluate the decision-tree regression model
    modelEvaluate(actualAndPredictRDD)

    // FIX: corrected "RandomForset" typo in the banner
    println("\n\n ----------------------------RandomForest---------")

    /**
      * Random-forest regression: trains numTrees trees; the model's
      * prediction is the average over all trees.
      * featureSubsetStrategy "auto": Spark picks the feature-sampling rule,
      * so each tree is built on a different feature subset.
      */
    val rfModel: RandomForestModel = RandomForest.trainRegressor(
      // FIX: was lpRDD (the FULL dataset) — the test set leaked into
      // training, making the evaluation below optimistic. Train only on
      // the training split so the metrics are honest.
      trainingRDD,
      Map[Int, Int](0 -> 7),
      16,     // numTrees
      "auto", // featureSubsetStrategy
      "variance",
      4,      // maxDepth
      100     // maxBins (seed left at its default)
    )

    // Predict with the forest on the same held-out test set
    val actualAndPredictRFRDD: RDD[(Double, Double)] = testingRDD.map {
      case LabeledPoint(label, features) =>
        (label, rfModel.predict(features))
    }
    actualAndPredictRFRDD.take(20).foreach(println)
    modelEvaluate(actualAndPredictRFRDD)
  }

  def main(args: Array[String]): Unit = {
    sparkOperation(args)(modelTrain)
  }
}

你可能感兴趣的:(Spark)