Step 4: Random forest regression
Read the shop visit/pay data, standardize the count_visit feature, build LabeledPoints, then train and evaluate both a decision tree and a random forest regressor.
package com.huadian.bigdata.ijcai
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.{DecisionTreeModel, RandomForestModel}
import org.apache.spark.mllib.tree.{DecisionTree, RandomForest}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
/**
* Standardize the feature values before training.
*/
object IJCAISparkRFPrecisionV2 {
/**
* Loan pattern: the "lender" function that builds the SparkSession,
* runs the user function, and always stops the session afterwards.
* @param args
* command-line arguments (application name and master URL)
* @param operation
* user function to run with the SparkSession
*/
def sparkOperation(args: Array[String])(operation:SparkSession=>Unit):Unit={
if(args.length != 2){
println("Usage:SparkMLlibTemplateSpark ")
System.exit(1)
}
val spark = SparkSession
.builder()
.appName(args(0))
.master(args(1))
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
try {
//invoke the user function
operation(spark)
}catch {
case e:Exception=>e.printStackTrace()
}finally {
spark.stop()
}
}
/**
* User function for the loan pattern.
* For this machine-learning task it covers data preprocessing, model training,
* model evaluation and testing.
* @param spark
*/
def modelTrain(spark:SparkSession):Unit= {
//read the data
val userVisitPayRDD: RDD[Row] = spark.read
.option("header","true")
.csv("file:///F:\\IJCAI\\train_user_visit_pay")
.select("day_week","shop_id","count_visit","count_pay")
.rdd
// println(s"user Visit Pay count:${userVisitPayRDD.count()} ")
// userVisitPayRDD.take(5).foreach(println)
//standardize the count_visit column
val countVisitRDD = userVisitPayRDD
.map(row =>Vectors.dense(row.getString(2).toDouble))
//fit the scaler: compute the mean and standard deviation of this column
val standardModel: StandardScalerModel = new StandardScaler(true,true).fit(countVisitRDD)
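//Quick sanity check (illustrative): the fitted StandardScalerModel exposes the mean and
//std it uses, and transform() applies (x - mean) / std to each element.
//println(s"count_visit mean = ${standardModel.mean}, std = ${standardModel.std}")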
//convert each row into a LabeledPoint
val lpRDD: RDD[LabeledPoint] = userVisitPayRDD.map{
case Row(day_week:String,shop_id:String,count_visit:String,count_pay:String)=>
//label: the pay count to predict
val label: Double = count_pay.toDouble
val countVisitScaled: Vector = standardModel.transform(Vectors.dense(count_visit.toDouble))
//feature vector
val features: Vector = Vectors.dense(
//add or remove features here as the data requires
Array(day_week.toDouble,shop_id.toDouble,countVisitScaled.toArray(0))
)
//return the labeled point
LabeledPoint(label,features)
}
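//Optional sanity check: each record becomes
//LabeledPoint(count_pay, [day_week, shop_id, standardized count_visit])
//lpRDD.take(5).foreach(println)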
//split the data into two parts: one for training, one for testing
val Array(trainingRDD,testingRDD) = lpRDD.randomSplit(Array(0.8,0.2),123L)
/**
* Predict with the decision tree algorithm
* def trainRegressor(
* input: RDD[LabeledPoint],
* categoricalFeaturesInfo: Map[Int, Int],
* An entry (n -> k) indicates that feature n is categorical with k categories.
* n: index (0-based) of the categorical feature
* k: number of categories for that feature, e.g. day of week: 7
* impurity: String,
* impurity measure used to split nodes; for regression it is always "variance"
* maxDepth: Int,
* maximum depth of the tree
* maxBins: Int
* maximum number of bins used when splitting features
* ): DecisionTreeModel
*/
val dtModel: DecisionTreeModel = DecisionTree.trainRegressor(
trainingRDD,
Map[Int, Int](0->7), //feature 0 (day_week) is categorical with 7 categories
"variance",          //impurity
16,                  //maxDepth
32                   //maxBins
)
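//Optional: inspect the learned tree. DecisionTreeModel exposes its depth, node count
//and a readable dump of the splits via toDebugString.
//println(s"tree depth = ${dtModel.depth}, nodes = ${dtModel.numNodes}")
//println(dtModel.toDebugString)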
//predict with the model: pair actual values with predicted values
val actualAndPredictRDD: RDD[(Double, Double)] = testingRDD.map{
case LabeledPoint(label,features)=>
val predictValue = dtModel.predict(features)
(label,predictValue)
}
actualAndPredictRDD.take(20).foreach(println)
//evaluate the model's prediction performance
def modelEvaluate(apRDD: RDD[(Double, Double)]):Unit = {
//total number of samples
val count = apRDD.count().toDouble
//MSE: mean squared error
val mseValue = apRDD
.map{case(actual,predict)=>Math.pow(actual-predict,2)}
.sum()/count
//RMSE: root mean squared error
val rmseValue = Math.sqrt(mseValue)
//MAE: mean absolute error
val maeValue = apRDD
.map{case(actual,predict)=>Math.abs(actual-predict)}
.sum()/count
println(s"MSE:$mseValue,RMSE:${rmseValue},MAE:${maeValue}")
}
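//Note: MLlib also ships an equivalent built-in evaluator. A minimal sketch, assuming the
//metrics class expects (prediction, observation) pairs (the reverse of the ordering above):
//import org.apache.spark.mllib.evaluation.RegressionMetrics
//val metrics = new RegressionMetrics(actualAndPredictRDD.map{case (a, p) => (p, a)})
//println(s"MSE:${metrics.meanSquaredError},RMSE:${metrics.rootMeanSquaredError},MAE:${metrics.meanAbsoluteError}")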
//evaluate the decision tree regression model
modelEvaluate(actualAndPredictRDD)
println("\n\n ----------------------------RandomForset---------")
/**
* Random forest regression: the forest is an ensemble of many trees.
* A model trained with N trees contains N trees; at prediction time every
* tree predicts and the results are averaged.
* def trainRegressor(
* input: RDD[LabeledPoint],
* categoricalFeaturesInfo: Map[Int, Int],
* numTrees: Int,
* featureSubsetStrategy: String,
* feature sampling strategy; "auto" lets MLlib choose (about one third of the
* features per tree for regression), so each tree is built on a different feature subset
* impurity: String,
* maxDepth: Int,
* maxBins: Int,
* seed: Int = Utils.random.nextInt())
*/
val rfModel: RandomForestModel = RandomForest.trainRegressor(
trainingRDD,
Map[Int, Int](0->7), //feature 0 (day_week) is categorical with 7 categories
16,                  //numTrees
"auto",              //featureSubsetStrategy
"variance",          //impurity
4,                   //maxDepth
100                  //maxBins
)
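//Optional: basic ensemble statistics reported by the trained RandomForestModel.
//println(s"forest trees = ${rfModel.numTrees}, total nodes = ${rfModel.totalNumNodes}")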
//predict with the random forest model
val actualAndPredictRFRDD: RDD[(Double, Double)] = testingRDD.map{
case LabeledPoint(label,features)=>
val predictValue = rfModel.predict(features)
(label,predictValue)
}
actualAndPredictRFRDD.take(20).foreach(println)
modelEvaluate(actualAndPredictRFRDD)
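//Optionally persist the random forest model for later reuse (the path is illustrative only):
//rfModel.save(spark.sparkContext, "file:///F:/IJCAI/model/rf_regression")
//val reloaded = RandomForestModel.load(spark.sparkContext, "file:///F:/IJCAI/model/rf_regression")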
}
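/**
* Entry point. Passes two arguments through to sparkOperation:
* args(0) = application name, args(1) = master URL (e.g. "local[2]" when run locally).
*/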
def main(args: Array[String]): Unit = {
sparkOperation(args)(modelTrain)
}
}