sparkmlib4_time
- 创建数据、模型、预测表
- 1、数据存入mysql
- 2、读取数据、模型预测
- 3、模型保存
- 4、mysql 数据加载、模型预测保存
- ?mysql模型加载,预测保存
- ?hdfs数据读入、存储
# Go to the Spark standalone installation directory
cd /opt/module/spark-standalone
# Start the standalone master and worker daemons
sbin/start-all.sh
# Open an interactive Scala shell (provides `spark` and `sc`)
bin/spark-shell
创建数据、模型、预测表
-- Confirm the target database exists, switch to it, and list its tables
SHOW DATABASES;
USE mydb;
SHOW TABLES;
-- Raw iris measurements; step 1 below appends rows parsed from the CSV file.
CREATE TABLE flowers8 (
id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
sepal_length FLOAT,
sepal_width FLOAT,
petal_length FLOAT,
petal_width FLOAT,
species VARCHAR(255),
-- row-insertion timestamp, filled automatically
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Serialized ML models: one named BLOB per row (written by step 3 below).
CREATE TABLE models8 (
  id         INT         NOT NULL AUTO_INCREMENT PRIMARY KEY,
  name       VARCHAR(50) NOT NULL,
  content    BLOB        NOT NULL,
  created_at TIMESTAMP   NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Predictions written back from Spark; vector columns are stored as
-- comma-separated text, the predicted class index as a DOUBLE.
CREATE TABLE predicted_flowers8 (
  id              INT          NOT NULL AUTO_INCREMENT PRIMARY KEY,
  indexedFeatures VARCHAR(255),
  label           VARCHAR(255),
  rawPrediction   VARCHAR(255),
  probability     VARCHAR(255),
  prediction      DOUBLE,
  created_at      TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP
);
1、数据存入mysql
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.sql.SparkSession
import java.util.Properties

// JDBC settings for the flowers8 table created above.
val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "flowers8"
val user = "root"
val password = "000000"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Inside spark-shell a SparkSession already exists; getOrCreate() returns
// it rather than building a second one, so do not force a local master.
val spark = SparkSession.builder().appName("insert-data").getOrCreate()
import spark.implicits._

// Read the raw iris file, drop blank lines, split the CSV fields, and
// convert the four measurements to Double.
val cleanedData = spark.read
  .text("file:///opt/module/spark-standalone/data/iris.data.txt")
  .filter(row => row.getAs[String]("value").trim != "")
  .map(row => row.getAs[String]("value").split(","))
  .map(cols => (cols(0).toDouble, cols(1).toDouble, cols(2).toDouble, cols(3).toDouble, cols(4)))
  .toDF("sepal_length", "sepal_width", "petal_length", "petal_width", "species")

// Append so re-running the script does not clobber existing rows.
cleanedData.write.mode("append").jdbc(url, table, props)
// NOTE(review): removed the original `spark.stop()` — stopping the shared
// spark-shell session here breaks every later section, which keeps using
// the same `spark` handle (e.g. step 2's `spark.read.jdbc`).
2、读取数据、模型预测
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties

// One observation: the four measurements as a vector plus the species label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// JDBC settings for the table filled in step 1.
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Load the rows back from MySQL and map each one to (features, label).
val irisDF = spark.read.jdbc(url, table, props)
val data = irisDF.map { row =>
  Iris(
    Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
      row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
    row.getAs[String]("species")
  )
}

// Index the string label and the feature vector, train an elastic-net
// logistic regression, then convert predicted indices back to labels.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))

// 70/30 train/test split, then fit and predict.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions = lrPipelineModel.transform(testData)
lrPredictions.
  select("predictedLabel","label","features","probability").collect().
  foreach{case Row(predictedLabel:String,label:String,features:Vector,prob:Vector)=>println(s"($label,$features) -->prob=$prob,predicted Label=$predictedLabel")}

// The evaluator's default metric is F1; request accuracy explicitly since
// the result is stored in `lrAccuracy`.
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)

// Stage 2 of the pipeline is the fitted LogisticRegressionModel.
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
// was `coefficientMatrix++ "..."` — an accidental StringOps.++; plain + is intended
println("Coefficients: \n " + lrModel.coefficientMatrix + "\nIntercept:" + lrModel.interceptVector + "\n numClasses: " + lrModel.numClasses + "\n numFeatures: " + lrModel.numFeatures)

// Persist the bare LR model, reload it, and apply it to the test split.
// The raw model reads its features from a column named "indexedFeatures".
lrModel.save("/opt/module/spark-standalone/data/lrModel9")
val localModel = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
val predictions1 = localModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()
3、模型保存
import java.io.ByteArrayOutputStream
import java.io.ObjectOutputStream
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors

// Reload the model saved in step 2 and Java-serialize it to a byte array.
val model = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
val bos = new ByteArrayOutputStream()
val oos = new ObjectOutputStream(bos)
try {
  oos.writeObject(model)
  oos.flush()
} finally {
  oos.close() // release the stream even if serialization fails
}
val bytes = bos.toByteArray()

// Insert the serialized model as a BLOB.  The original version leaked the
// connection and statement; close both in finally blocks.
val conn: Connection = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/mydb", "root", "000000")
try {
  val stmt: PreparedStatement = conn.prepareStatement("INSERT INTO models8 (name, content) VALUES (?, ?)")
  try {
    stmt.setString(1, "my_model")
    stmt.setBytes(2, bytes)
    stmt.executeUpdate()
  } finally {
    stmt.close()
  }
} finally {
  conn.close()
}
4、mysql 数据加载、模型预测保存
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties

// One observation: the four measurements as a vector plus the species label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// JDBC settings.  This file creates and fills `flowers8`; the original
// `flowers5` here was a stale table name from an earlier run.
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Load the rows from MySQL and map each one to (features, label).
val irisDF = spark.read.jdbc(url, table, props)
val data = irisDF.map { row =>
  Iris(
    Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
      row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
    row.getAs[String]("species")
  )
}

// Same pipeline as step 2: index label + features, train, convert back.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))

// 70/30 train/test split, then fit and predict.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions = lrPipelineModel.transform(testData)
lrPredictions.
  select("predictedLabel","label","features","probability").collect().
  foreach{case Row(predictedLabel:String,label:String,features:Vector,prob:Vector)=>println(s"($label,$features) -->prob=$prob,predicted Label=$predictedLabel")}

// Request accuracy explicitly (default metric is F1, but the value is
// stored in `lrAccuracy`).
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)

val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
// was `coefficientMatrix++ "..."` — an accidental StringOps.++; plain + is intended
println("Coefficients: \n " + lrModel.coefficientMatrix + "\nIntercept:" + lrModel.interceptVector + "\n numClasses: " + lrModel.numClasses + "\n numFeatures: " + lrModel.numFeatures)

// Step 2 saved the model under lrModel9; the original `lrModel8` path was
// a stale reference.  The raw model expects an "indexedFeatures" column.
val localModel = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
val predictions1 = localModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()
import java.util.Properties

// Destination table for predictions (created at the top of this file).
val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "predicted_flowers8"
val user = "root"
val password = "000000"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Write the prediction rows to MySQL.
// - foreachPartition opens ONE JDBC connection per partition instead of
//   one per row (the original reconnected for every record).
// - A parameterized INSERT replaces the string-built SQL, which was both
//   an injection risk and broken for labels containing quotes.
predictions1.select("indexedFeatures", "label", "rawPrediction", "probability", "prediction")
  .foreachPartition { rows: Iterator[org.apache.spark.sql.Row] =>
    val conn = java.sql.DriverManager.getConnection(url, user, password)
    try {
      val stmt = conn.prepareStatement(
        s"INSERT INTO $table (indexedFeatures, label, rawPrediction, probability, prediction) VALUES (?, ?, ?, ?, ?)")
      try {
        rows.foreach { row =>
          // Vectors are flattened to comma-separated text, matching the
          // VARCHAR columns of the target table.
          val indexedFeatures = row.getAs[org.apache.spark.ml.linalg.Vector]("indexedFeatures")
          val rawPrediction = row.getAs[org.apache.spark.ml.linalg.Vector]("rawPrediction")
          val probability = row.getAs[org.apache.spark.ml.linalg.Vector]("probability")
          stmt.setString(1, indexedFeatures.toArray.mkString(","))
          stmt.setString(2, row.getAs[String]("label"))
          stmt.setString(3, rawPrediction.toArray.mkString(","))
          stmt.setString(4, probability.toArray.mkString(","))
          stmt.setDouble(5, row.getAs[Double]("prediction"))
          stmt.executeUpdate()
        }
      } finally {
        stmt.close()
      }
    } finally {
      conn.close()
    }
  }
?mysql模型加载,预测保存
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties

// One observation: the four measurements as a vector plus the species label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// JDBC settings.  This file creates and fills `flowers8`; the original
// `flowers5` here was a stale table name from an earlier run.
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Load the rows from MySQL and map each one to (features, label).
val irisDF = spark.read.jdbc(url, table, props)
val data = irisDF.map { row =>
  Iris(
    Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
      row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
    row.getAs[String]("species")
  )
}

// Same pipeline as steps 2 and 4.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))

// 70/30 train/test split, then fit and predict.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions = lrPipelineModel.transform(testData)
lrPredictions.
  select("predictedLabel","label","features","probability").collect().
  foreach{case Row(predictedLabel:String,label:String,features:Vector,prob:Vector)=>println(s"($label,$features) -->prob=$prob,predicted Label=$predictedLabel")}

// Request accuracy explicitly (default metric is F1, but the value is
// stored in `lrAccuracy`).
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)

val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
// was `coefficientMatrix++ "..."` — an accidental StringOps.++; plain + is intended
println("Coefficients: \n " + lrModel.coefficientMatrix + "\nIntercept:" + lrModel.interceptVector + "\n numClasses: " + lrModel.numClasses + "\n numFeatures: " + lrModel.numFeatures)
import java.io.ByteArrayInputStream
import java.io.ObjectInputStream
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

// Read the serialized model back from MySQL.  Step 3 inserted it into
// `models8`, so query that table (the original `models3` was stale).
val conn: Connection = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/mydb", "root", "000000")
val query: String = "SELECT content FROM models8 WHERE name = 'my_model'"
val stmt = conn.prepareStatement(query)
// PreparedStatement.executeQuery(String) throws SQLException by JDBC
// contract; the no-arg executeQuery() runs the prepared SQL.
val rs: ResultSet = stmt.executeQuery()
var loadedLRModel: LogisticRegressionModel = null
if (rs.next()) {
  val bytes: Array[Byte] = rs.getBytes("content")
  val bis = new ByteArrayInputStream(bytes)
  val ois = new ObjectInputStream(bis)
  try {
    loadedLRModel = ois.readObject().asInstanceOf[LogisticRegressionModel]
  } finally {
    ois.close()
  }
} else {
  // Make the failure visible instead of leaving a null model that would
  // only blow up later at transform() time.
  println("No model named 'my_model' found in models8")
}
rs.close()
stmt.close()
conn.close()

// Apply the deserialized model; it reads from the "indexedFeatures" column.
val predictions1 = loadedLRModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()
import java.util.Properties

// Destination table.  This file creates `predicted_flowers8`; the original
// `predicted_flowers6` was a stale table name.
val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "predicted_flowers8"
val user = "root"
val password = "000000"
val props = new Properties()
props.put("user", user)
props.put("password", password)

// Write the prediction rows to MySQL.
// - foreachPartition opens ONE JDBC connection per partition instead of
//   one per row (the original reconnected for every record).
// - A parameterized INSERT replaces the string-built SQL, which was both
//   an injection risk and broken for labels containing quotes.
predictions1.select("indexedFeatures", "label", "rawPrediction", "probability", "prediction")
  .foreachPartition { rows: Iterator[org.apache.spark.sql.Row] =>
    val conn = java.sql.DriverManager.getConnection(url, user, password)
    try {
      val stmt = conn.prepareStatement(
        s"INSERT INTO $table (indexedFeatures, label, rawPrediction, probability, prediction) VALUES (?, ?, ?, ?, ?)")
      try {
        rows.foreach { row =>
          // Vectors are flattened to comma-separated text, matching the
          // VARCHAR columns of the target table.
          val indexedFeatures = row.getAs[org.apache.spark.ml.linalg.Vector]("indexedFeatures")
          val rawPrediction = row.getAs[org.apache.spark.ml.linalg.Vector]("rawPrediction")
          val probability = row.getAs[org.apache.spark.ml.linalg.Vector]("probability")
          stmt.setString(1, indexedFeatures.toArray.mkString(","))
          stmt.setString(2, row.getAs[String]("label"))
          stmt.setString(3, rawPrediction.toArray.mkString(","))
          stmt.setString(4, probability.toArray.mkString(","))
          stmt.setDouble(5, row.getAs[Double]("prediction"))
          stmt.executeUpdate()
        }
      } finally {
        stmt.close()
      }
    } finally {
      conn.close()
    }
  }
?hdfs数据读入、存储
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors

// One observation: the four measurements as a vector plus the species label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// NOTE(review): 9870 is the Hadoop 3 NameNode *web UI* port; the HDFS RPC
// port is normally 8020 (or 9000) — confirm against fs.defaultFS in
// core-site.xml.
val irisLines = sc.textFile("hdfs://hadoop102:8020/pdwcs/iris.data.txt")

// Skip blank lines (the iris file often ends with one), mirroring the
// cleaning done in step 1, then parse each CSV line into an Iris record.
val data = irisLines
  .filter(_.trim.nonEmpty)
  .map { line =>
    val parts = line.split(",")
    Iris(
      Vectors.dense(parts(0).toDouble, parts(1).toDouble, parts(2).toDouble, parts(3).toDouble),
      parts(4)
    )
  }

// Convert the RDD to a DataFrame and display it.
val df = data.toDF()
df.show()