Pipeline-Scala应用

Pipeline可以端到端地输出模型结果:中间对DataFrame的转换、预处理等步骤都收归在Pipeline的各个stage中,相对而言比较简洁和方便。本文用LR(逻辑回归)把Pipeline的整体流程跑通。

import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Build a local SparkSession for the logistic-regression pipeline demo.
// All options are supplied as strings; the typed `config` overloads of
// SparkSession.Builder store their values as strings as well, so the
// resulting configuration is the same.
val builder = {
  val settings = Seq(
    "spark.executor.heartbeatInterval" -> "60s",
    "spark.network.timeout" -> "120s",
    "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max" -> "512m",
    "spark.dynamicAllocation.enabled" -> "false",
    "spark.sql.inMemoryColumnarStorage.compressed" -> "true",
    "spark.sql.inMemoryColumnarStorage.batchSize" -> "10000",
    "spark.sql.broadcastTimeout" -> "600",
    "spark.sql.autoBroadcastJoinThreshold" -> "-1",
    "spark.sql.crossJoin.enabled" -> "true"
  )
  settings
    .foldLeft(SparkSession.builder().appName("LR")) {
      case (acc, (key, value)) => acc.config(key, value)
    }
    .master("local[*]")
}
val spark = builder.getOrCreate()
// Only log errors so the console output of the demo stays readable.
spark.sparkContext.setLogLevel("ERROR")
// Implicit conversions needed for the Seq(...).toDF(...) call used later.
import spark.implicits._
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@3eb5682d
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@22d2df5b
import spark.implicits._

构造数据

// Toy training set: ten rows, five with label 0 and five with label 1.
// Columns: a row id, four numeric features (x1..x4) and the label.
// Declared as `val` because the DataFrame is never reassigned.
val dfTrain = Seq(
  (1, 5.1, 3.5, 1.4, 0.2, 0),
  (2, 4.9, 3.0, 1.4, 0.2, 0),
  (3, 4.7, 3.2, 1.3, 0.2, 0),
  (4, 4.6, 3.1, 1.5, 0.2, 0),
  (5, 5.0, 3.6, 1.4, 0.2, 0),
  (56, 5.7, 2.8, 4.5, 1.3, 1),
  (57, 6.3, 3.3, 4.7, 1.6, 1),
  (58, 4.9, 2.4, 3.3, 1.0, 1),
  (59, 6.6, 2.9, 4.6, 1.3, 1),
  (60, 5.2, 2.7, 3.9, 1.4, 1)
).toDF("id", "x1", "x2", "x3", "x4", "label")
// The test set simply reuses the training data; it exists only to
// demonstrate the prediction step, not to evaluate the model.
val dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
|  1|5.1|3.5|1.4|0.2|    0|
|  2|4.9|3.0|1.4|0.2|    0|
|  3|4.7|3.2|1.3|0.2|    0|
|  4|4.6|3.1|1.5|0.2|    0|
|  5|5.0|3.6|1.4|0.2|    0|
| 56|5.7|2.8|4.5|1.3|    1|
| 57|6.3|3.3|4.7|1.6|    1|
| 58|4.9|2.4|3.3|1.0|    1|
| 59|6.6|2.9|4.6|1.3|    1|
| 60|5.2|2.7|3.9|1.4|    1|
+---+---+---+---+---+-----+






dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]

特征向量

// Stage 1: combine the four numeric feature columns into a single
// vector column named "features".
val assemble = {
  val featureCols = Array("x1", "x2", "x3", "x4")
  new VectorAssembler().setInputCols(featureCols).setOutputCol("features")
}
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_8ab10f1b2eb3

模型

// Stage 2: logistic regression estimator, capped at 10 iterations with a
// regularization parameter of 0.01.
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_05d80788e8f8

Pipeline

预处理可以很复杂,但只要把各个步骤按执行顺序依次放进pipeline的stages里就可以了。

// Chain the stages in execution order: feature assembly first, then the classifier.
val pipeline = new Pipeline().setStages(Array[PipelineStage](assemble, lr))
pipeline: org.apache.spark.ml.Pipeline = pipeline_9ad3b8d2f213

模型训练

// Fit every stage of the pipeline on the training data in one call.
val lrModel: PipelineModel = pipeline.fit(dfTrain)
lrModel: org.apache.spark.ml.PipelineModel = pipeline_9ad3b8d2f213

测试集预测

在这一步体现了pipeline的优势,即我们不需要对测试集单独执行预处理的过程,只需要把数据集传进PipelineModel即可。

// Run the fitted pipeline on the test data and print the result, which
// includes the columns added by the pipeline stages.
val predictions = lrModel.transform(dfTest)
predictions.show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|         probability|prediction|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
|  1|5.1|3.5|1.4|0.2|    0|[5.1,3.5,1.4,0.2]|[4.65895888324787...|[0.99061263448333...|       0.0|
|  2|4.9|3.0|1.4|0.2|    0|[4.9,3.0,1.4,0.2]|[3.13931361151191...|[0.95848557722426...|       0.0|
|  3|4.7|3.2|1.3|0.2|    0|[4.7,3.2,1.3,0.2]|[4.15591528867079...|[0.98457036395474...|       0.0|
|  4|4.6|3.1|1.5|0.2|    0|[4.6,3.1,1.5,0.2]|[3.71742556580948...|[0.97627987748390...|       0.0|
|  5|5.0|3.6|1.4|0.2|    0|[5.0,3.6,1.4,0.2]|[5.11709714014036...|[0.99404231113144...|       0.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-4.4688764824259...|[0.01133033659270...|       1.0|
| 57|6.3|3.3|4.7|1.6|    1|[6.3,3.3,4.7,1.6]|[-4.3870984226026...|[0.01228399007333...|       1.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-2.9791152212283...|[0.04837834578750...|       1.0|
| 59|6.6|2.9|4.6|1.3|    1|[6.6,2.9,4.6,1.3]|[-5.2125576928023...|[0.00541820537550...|       1.0|
| 60|5.2|2.7|3.9|1.4|    1|[5.2,2.7,3.9,1.4]|[-3.9297068979403...|[0.01927077130674...|       1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+

pipeline还有一些复杂的用法,暂时没有用到,就不体现了。后续再补充吧

                                2020-03-24 于南京市江宁区九龙湖

你可能感兴趣的:(★★★机器学习,#,★★模型选择和评估)