Pipeline可以端到端地输出模型结果,中间DataFrame的转换、预处理等步骤都收归在Pipeline的各个stage中,相对而言比较简洁和方便。下面用LR(逻辑回归)把Pipeline的整体流程跑通。
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Assemble the SparkSession builder for the "LR" demo. The builder is kept in
// its own val; the session itself is only obtained by getOrCreate() below.
val builder = SparkSession.builder().
  appName("LR").
  master("local[*]").
  // Heartbeat and network timeouts.
  config("spark.executor.heartbeatInterval", "60s").
  config("spark.network.timeout", "120s").
  // Kryo serialization with an enlarged maximum buffer.
  config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").
  config("spark.kryoserializer.buffer.max", "512m").
  config("spark.dynamicAllocation.enabled", false).
  // Spark SQL settings: in-memory columnar cache, broadcast and cross-join behaviour.
  config("spark.sql.inMemoryColumnarStorage.compressed", true).
  config("spark.sql.inMemoryColumnarStorage.batchSize", 10000).
  config("spark.sql.broadcastTimeout", 600).
  config("spark.sql.autoBroadcastJoinThreshold", -1).
  config("spark.sql.crossJoin.enabled", true)

val spark = builder.getOrCreate()
// Only log errors so the notebook output stays readable.
spark.sparkContext.setLogLevel("ERROR")
// Brings the implicit conversions into scope that .toDF on a local Seq relies on.
import spark.implicits._
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@3eb5682d
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@22d2df5b
import spark.implicits._
// Toy training set: five rows labelled 0 and five rows labelled 1. Each row
// carries an integer id, four numeric feature columns (x1..x4) and the label.
// Declared as val because the DataFrame reference is never reassigned.
val dfTrain = Seq(
  (1, 5.1, 3.5, 1.4, 0.2, 0),
  (2, 4.9, 3.0, 1.4, 0.2, 0),
  (3, 4.7, 3.2, 1.3, 0.2, 0),
  (4, 4.6, 3.1, 1.5, 0.2, 0),
  (5, 5.0, 3.6, 1.4, 0.2, 0),
  (56, 5.7, 2.8, 4.5, 1.3, 1),
  (57, 6.3, 3.3, 4.7, 1.6, 1),
  (58, 4.9, 2.4, 3.3, 1.0, 1),
  (59, 6.6, 2.9, 4.6, 1.3, 1),
  (60, 5.2, 2.7, 3.9, 1.4, 1)
).toDF("id", "x1", "x2", "x3", "x4", "label")
// The test set simply reuses the training data; it only exists to exercise the
// pipeline end to end, not to measure generalization.
val dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
| 1|5.1|3.5|1.4|0.2| 0|
| 2|4.9|3.0|1.4|0.2| 0|
| 3|4.7|3.2|1.3|0.2| 0|
| 4|4.6|3.1|1.5|0.2| 0|
| 5|5.0|3.6|1.4|0.2| 0|
| 56|5.7|2.8|4.5|1.3| 1|
| 57|6.3|3.3|4.7|1.6| 1|
| 58|4.9|2.4|3.3|1.0| 1|
| 59|6.6|2.9|4.6|1.3| 1|
| 60|5.2|2.7|3.9|1.4| 1|
+---+---+---+---+---+-----+
dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
// Feature stage: packs the columns x1..x4 into a single vector column named
// "features".
val assemble = new VectorAssembler().
  setInputCols(Array("x1", "x2", "x3", "x4")).
  setOutputCol("features")
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_8ab10f1b2eb3
// Classifier stage: logistic regression capped at 10 iterations with a
// regularization parameter of 0.01.
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_05d80788e8f8
预处理可以很复杂,总之按执行顺序把各个stage依次放进pipeline里就可以了。
// Chain the stages in execution order: feature assembly first, then the classifier.
val pipeline = new Pipeline().setStages(
  Array[PipelineStage](assemble, lr)
)
pipeline: org.apache.spark.ml.Pipeline = pipeline_9ad3b8d2f213
// Fit every stage of the pipeline on the training data; the result is a
// PipelineModel that can be applied to new DataFrames.
val lrModel: PipelineModel = pipeline.fit(dfTrain)
lrModel: org.apache.spark.ml.PipelineModel = pipeline_9ad3b8d2f213
这一步体现了pipeline的优势:我们不需要对测试集单独执行预处理,只需要把数据集传进PipelineModel即可。
// Apply the fitted pipeline to the raw test DataFrame (no manual preprocessing)
// and print the result, which carries the added features, rawPrediction,
// probability and prediction columns.
lrModel.transform(dfTest).show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| id| x1| x2| x3| x4|label| features| rawPrediction| probability|prediction|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| 1|5.1|3.5|1.4|0.2| 0|[5.1,3.5,1.4,0.2]|[4.65895888324787...|[0.99061263448333...| 0.0|
| 2|4.9|3.0|1.4|0.2| 0|[4.9,3.0,1.4,0.2]|[3.13931361151191...|[0.95848557722426...| 0.0|
| 3|4.7|3.2|1.3|0.2| 0|[4.7,3.2,1.3,0.2]|[4.15591528867079...|[0.98457036395474...| 0.0|
| 4|4.6|3.1|1.5|0.2| 0|[4.6,3.1,1.5,0.2]|[3.71742556580948...|[0.97627987748390...| 0.0|
| 5|5.0|3.6|1.4|0.2| 0|[5.0,3.6,1.4,0.2]|[5.11709714014036...|[0.99404231113144...| 0.0|
| 56|5.7|2.8|4.5|1.3| 1|[5.7,2.8,4.5,1.3]|[-4.4688764824259...|[0.01133033659270...| 1.0|
| 57|6.3|3.3|4.7|1.6| 1|[6.3,3.3,4.7,1.6]|[-4.3870984226026...|[0.01228399007333...| 1.0|
| 58|4.9|2.4|3.3|1.0| 1|[4.9,2.4,3.3,1.0]|[-2.9791152212283...|[0.04837834578750...| 1.0|
| 59|6.6|2.9|4.6|1.3| 1|[6.6,2.9,4.6,1.3]|[-5.2125576928023...|[0.00541820537550...| 1.0|
| 60|5.2|2.7|3.9|1.4| 1|[5.2,2.7,3.9,1.4]|[-3.9297068979403...|[0.01927077130674...| 1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
pipeline还有一些复杂的用法,暂时没有用到,就不体现了。后续再补充吧
2020-03-24 于南京市江宁区九龙湖