Calling Spark MLlib

sparkmlib4_time

  • Create the data, model, and prediction tables
  • 1. Store the data in MySQL
  • 2. Read the data and predict with the model
  • 3. Save the model
  • 4. Load data from MySQL, predict, and save the results
  • ? Load the model from MySQL, predict, and save the results
  • ? Read data from HDFS and store it

cd /opt/module/spark-standalone
sbin/start-all.sh
# The JDBC code below needs the MySQL driver on the classpath; if it is not
# already in jars/, pass it explicitly (the jar path here is a placeholder):
# bin/spark-shell --jars /path/to/mysql-connector-java.jar
bin/spark-shell

Create the data, model, and prediction tables

SHOW DATABASES;
USE mydb;
SHOW TABLES;


CREATE TABLE flowers8 (
  id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
  sepal_length FLOAT,
  sepal_width FLOAT,
  petal_length FLOAT,
  petal_width FLOAT,
  species VARCHAR(255),
  created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);


CREATE TABLE models8 (
    id INT NOT NULL AUTO_INCREMENT,
    name VARCHAR(50) NOT NULL,
    -- LONGBLOB: a Java-serialized Spark model can exceed BLOB's 64 KB limit
    content LONGBLOB NOT NULL,
    PRIMARY KEY (id),
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);


CREATE TABLE predicted_flowers8 (
    id INT NOT NULL AUTO_INCREMENT,
    indexedFeatures VARCHAR(255),
    label VARCHAR(255),
    rawPrediction VARCHAR(255),
    probability VARCHAR(255),
    prediction DOUBLE,
    PRIMARY KEY (id),
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);

1. Store the data in MySQL


import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.sql.SparkSession
import java.util.Properties

val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "flowers8"
val user = "root"
val password = "000000"

val props = new Properties()
props.put("user", user)
props.put("password", password)

// In spark-shell this returns the existing session (the config is then ignored)
val spark = SparkSession.builder().appName("insert-data").config("spark.master", "local").getOrCreate()
import spark.implicits._

// Drop blank lines, split the CSV fields, and name the columns
val cleanedData = spark.read.text("file:///opt/module/spark-standalone/data/iris.data.txt").
  filter(row => row.getAs[String]("value").trim != "").
  map(row => row.getAs[String]("value").split(",")).
  map(cols => (cols(0).toDouble, cols(1).toDouble, cols(2).toDouble, cols(3).toDouble, cols(4))).
  toDF("sepal_length", "sepal_width", "petal_length", "petal_width", "species")

cleanedData.write.mode("append").jdbc(url, table, props)
// Do not call spark.stop() here: the sections below reuse this spark-shell session
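To confirm the write, the rows can be read straight back through the same JDBC options:

// Read back from MySQL as a quick sanity check
val check = spark.read.jdbc(url, table, props)
check.show(5)
println(s"Rows in $table: ${check.count()}")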

2. Read the data and predict with the model

import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties
case class Iris(features:org.apache.spark.ml.linalg.Vector,label:String)
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8"
val props = new Properties()
props.put("user", user)
props.put("password", password)
val irisDF = spark.read.jdbc(url, table, props) // read the data from MySQL

val data = irisDF.map { row =>
  Iris(
    Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
                 row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
    row.getAs[String]("species")
  )
}
val labelIndexer=new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer=new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr=new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer,featureIndexer,lr,labelConverter))
val Array(trainingData,testData)=data.randomSplit(Array(0.7,0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions=lrPipelineModel.transform(testData)
lrPredictions.select("predictedLabel", "label", "features", "probability").collect().
  foreach { case Row(predictedLabel: String, label: String, features: Vector, prob: Vector) =>
    println(s"($label, $features) --> prob=$prob, predictedLabel=$predictedLabel")
  }
// Report accuracy explicitly (the evaluator's default metric is f1)
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println(s"Test accuracy = $lrAccuracy")
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("Coefficients: \n" + lrModel.coefficientMatrix + "\nIntercept: " + lrModel.interceptVector + "\nnumClasses: " + lrModel.numClasses + "\nnumFeatures: " + lrModel.numFeatures)


// Save the LR stage locally and reload it
lrModel.save("/opt/module/spark-standalone/data/lrModel9")
val localModel = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
// The reloaded stage expects its training features column name, hence the rename
val predictions1 = localModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()
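Saving only the LR stage is what forces the column rename above. Persisting the whole fitted pipeline keeps the indexers and the label converter together, so raw test rows can be scored directly; a minimal sketch, with the save path being an assumption:

// Persist the entire fitted pipeline (indexers + LR + label converter)
lrPipelineModel.write.overwrite().save("/opt/module/spark-standalone/data/lrPipelineModel9")
// Reload and score raw test rows without any column renaming
val reloadedPipeline = PipelineModel.load("/opt/module/spark-standalone/data/lrPipelineModel9")
reloadedPipeline.transform(testData).show()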

3. Save the model to MySQL

import java.io.ByteArrayOutputStream
import java.io.ObjectOutputStream
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
// Serialize the saved model to a byte array with Java serialization
val model = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
val bos = new ByteArrayOutputStream()
val oos = new ObjectOutputStream(bos)
oos.writeObject(model)
oos.flush()
val bytes = bos.toByteArray()
oos.close()
// Insert the bytes into the models8 BLOB column, then release the JDBC resources
val conn: Connection = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/mydb", "root", "000000")
val stmt: PreparedStatement = conn.prepareStatement("INSERT INTO models8 (name, content) VALUES (?, ?)")
stmt.setString(1, "my_model")
stmt.setBytes(2, bytes)
stmt.executeUpdate()
stmt.close()
conn.close()
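A quick way to confirm the blob landed is to query its size, sketched here in the same plain-JDBC style:

// Check the stored model's size in bytes
val checkConn = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/mydb", "root", "000000")
val rsCheck = checkConn.createStatement().executeQuery("SELECT name, LENGTH(content) AS bytes FROM models8")
while (rsCheck.next()) println(s"${rsCheck.getString("name")}: ${rsCheck.getLong("bytes")} bytes")
checkConn.close()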

4. Load data from MySQL, predict, and save the results

// Load the data from MySQL
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties
case class Iris(features:org.apache.spark.ml.linalg.Vector,label:String)
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8" // the table created and populated above
val props = new Properties()
props.put("user", user)
props.put("password", password)
val irisDF = spark.read.jdbc(url, table, props) // read the data from MySQL
irisDF.show()
val data = irisDF.map { row =>
   Iris(
     Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
                  row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
     row.getAs[String]("species")
   )
 }
data.show()
val labelIndexer=new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer=new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr=new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer,featureIndexer,lr,labelConverter))
val Array(trainingData,testData)=data.randomSplit(Array(0.7,0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions=lrPipelineModel.transform(testData)
lrPredictions.select("predictedLabel", "label", "features", "probability").collect().
  foreach { case Row(predictedLabel: String, label: String, features: Vector, prob: Vector) =>
    println(s"($label, $features) --> prob=$prob, predictedLabel=$predictedLabel")
  }
// Report accuracy explicitly (the evaluator's default metric is f1)
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println(s"Test accuracy = $lrAccuracy")
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("Coefficients: \n" + lrModel.coefficientMatrix + "\nIntercept: " + lrModel.interceptVector + "\nnumClasses: " + lrModel.numClasses + "\nnumFeatures: " + lrModel.numFeatures)
// Reload the model saved in section 2 (lrModel9); the rename gives it the expected features column
val localModel = LogisticRegressionModel.load("/opt/module/spark-standalone/data/lrModel9")
val predictions1 = localModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()


// Save the predictions to MySQL
import java.util.Properties
// MySQL connection parameters
val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "predicted_flowers8"
val user = "root"
val password = "000000"
val props = new Properties()
props.put("user", user)
props.put("password", password)
// Write the prediction rows into the MySQL table (one connection per row;
// see the foreachPartition sketch below for a cheaper variant)
predictions1.select("indexedFeatures", "label", "rawPrediction", "probability", "prediction").foreach { row =>
  val indexedFeatures = row.getAs[org.apache.spark.ml.linalg.Vector]("indexedFeatures")
  val label = row.getAs[String]("label")
  val rawPrediction = row.getAs[org.apache.spark.ml.linalg.Vector]("rawPrediction")
  val probability = row.getAs[org.apache.spark.ml.linalg.Vector]("probability")
  val prediction = row.getAs[Double]("prediction")
  val conn = java.sql.DriverManager.getConnection(url, user, password)
  try {
    // A prepared statement avoids quoting problems in interpolated SQL
    val stmt = conn.prepareStatement(
      s"INSERT INTO $table (indexedFeatures, label, rawPrediction, probability, prediction) VALUES (?, ?, ?, ?, ?)")
    stmt.setString(1, indexedFeatures.toArray.mkString(","))
    stmt.setString(2, label)
    stmt.setString(3, rawPrediction.toArray.mkString(","))
    stmt.setString(4, probability.toArray.mkString(","))
    stmt.setDouble(5, prediction)
    stmt.executeUpdate()
    stmt.close()
  } finally {
    conn.close()
  }
  ()
}
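Opening a connection per row gets expensive on larger data. A common refinement, sketched below under the same table layout, is one connection per partition via foreachPartition with a reused PreparedStatement:

// One connection and one prepared statement per partition instead of per row
predictions1.select("indexedFeatures", "label", "rawPrediction", "probability", "prediction")
  .foreachPartition { (rows: Iterator[org.apache.spark.sql.Row]) =>
    val conn = java.sql.DriverManager.getConnection(url, user, password)
    val stmt = conn.prepareStatement(
      s"INSERT INTO $table (indexedFeatures, label, rawPrediction, probability, prediction) VALUES (?, ?, ?, ?, ?)")
    try {
      rows.foreach { row =>
        stmt.setString(1, row.getAs[org.apache.spark.ml.linalg.Vector]("indexedFeatures").toArray.mkString(","))
        stmt.setString(2, row.getAs[String]("label"))
        stmt.setString(3, row.getAs[org.apache.spark.ml.linalg.Vector]("rawPrediction").toArray.mkString(","))
        stmt.setString(4, row.getAs[org.apache.spark.ml.linalg.Vector]("probability").toArray.mkString(","))
        stmt.setDouble(5, row.getAs[Double]("prediction"))
        stmt.executeUpdate()
      }
    } finally {
      stmt.close()
      conn.close()
    }
  }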


? Load the model from MySQL, predict, and save

// Load the data from MySQL
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import java.util.Properties
case class Iris(features:org.apache.spark.ml.linalg.Vector,label:String)
val url = "jdbc:mysql://hadoop102:3306/mydb"
val user = "root"
val password = "000000"
val table = "flowers8" // the table created and populated above
val props = new Properties()
props.put("user", user)
props.put("password", password)
val irisDF = spark.read.jdbc(url, table, props) // read the data from MySQL
irisDF.show()
val data = irisDF.map { row =>
   Iris(
     Vectors.dense(row.getAs[Double]("sepal_length"), row.getAs[Double]("sepal_width"),
                  row.getAs[Double]("petal_length"), row.getAs[Double]("petal_width")),
     row.getAs[String]("species")
   )
 }
data.show()
val labelIndexer=new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)
val featureIndexer=new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(data)
val lr=new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer,featureIndexer,lr,labelConverter))
val Array(trainingData,testData)=data.randomSplit(Array(0.7,0.3))
val lrPipelineModel = lrPipeline.fit(trainingData)
val lrPredictions=lrPipelineModel.transform(testData)
lrPredictions.select("predictedLabel", "label", "features", "probability").collect().
  foreach { case Row(predictedLabel: String, label: String, features: Vector, prob: Vector) =>
    println(s"($label, $features) --> prob=$prob, predictedLabel=$predictedLabel")
  }
// Report accuracy explicitly (the evaluator's default metric is f1)
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println(s"Test accuracy = $lrAccuracy")
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("Coefficients: \n" + lrModel.coefficientMatrix + "\nIntercept: " + lrModel.interceptVector + "\nnumClasses: " + lrModel.numClasses + "\nnumFeatures: " + lrModel.numFeatures)


// Load the model back from MySQL
import java.io.ByteArrayInputStream
import java.io.ObjectInputStream
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
val conn: Connection = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/mydb", "root", "000000")
// Query models8, the table the serialized model was inserted into above
val query: String = "SELECT content FROM models8 WHERE name = 'my_model'"
val stmt = conn.prepareStatement(query)
// PreparedStatement.executeQuery() takes no argument; passing the SQL again would throw
val rs: ResultSet = stmt.executeQuery()
var loadedLRModel: LogisticRegressionModel = null
if (rs.next()) {
  val bytes: Array[Byte] = rs.getBytes("content")
  val bis = new ByteArrayInputStream(bytes)
  val ois = new ObjectInputStream(bis)
  loadedLRModel = ois.readObject().asInstanceOf[LogisticRegressionModel]
  ois.close()
}
rs.close()
stmt.close()
conn.close()

// The deserialized stage expects its training features column name, hence the rename
val predictions1 = loadedLRModel.transform(testData.withColumnRenamed("features", "indexedFeatures"))
predictions1.show()




// Save the predictions to MySQL
import java.util.Properties
val url = "jdbc:mysql://hadoop102:3306/mydb"
val table = "predicted_flowers8" // the prediction table created above
val user = "root"
val password = "000000"
val props = new Properties()
props.put("user", user)
props.put("password", password)
// Same row-by-row insert as in section 4; see the bulk JDBC-writer sketch below
predictions1.select("indexedFeatures", "label", "rawPrediction", "probability", "prediction").foreach { row =>
  val indexedFeatures = row.getAs[org.apache.spark.ml.linalg.Vector]("indexedFeatures")
  val label = row.getAs[String]("label")
  val rawPrediction = row.getAs[org.apache.spark.ml.linalg.Vector]("rawPrediction")
  val probability = row.getAs[org.apache.spark.ml.linalg.Vector]("probability")
  val prediction = row.getAs[Double]("prediction")
  val conn = java.sql.DriverManager.getConnection(url, user, password)
  try {
    val stmt = conn.prepareStatement(
      s"INSERT INTO $table (indexedFeatures, label, rawPrediction, probability, prediction) VALUES (?, ?, ?, ?, ?)")
    stmt.setString(1, indexedFeatures.toArray.mkString(","))
    stmt.setString(2, label)
    stmt.setString(3, rawPrediction.toArray.mkString(","))
    stmt.setString(4, probability.toArray.mkString(","))
    stmt.setDouble(5, prediction)
    stmt.executeUpdate()
    stmt.close()
  } finally {
    conn.close()
  }
  ()
}
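As an alternative to per-row inserts, the vector columns can be stringified with a UDF and the whole DataFrame written in one call through Spark's built-in JDBC writer; a sketch against the same predicted_flowers8 table (id and created_at are filled in by their MySQL defaults):

import org.apache.spark.sql.functions.udf
import spark.implicits._

// Convert ML vectors to comma-separated strings so they fit the VARCHAR columns
val vecToString = udf((v: org.apache.spark.ml.linalg.Vector) => v.toArray.mkString(","))
predictions1.select(
  vecToString($"indexedFeatures").as("indexedFeatures"),
  $"label",
  vecToString($"rawPrediction").as("rawPrediction"),
  vecToString($"probability").as("probability"),
  $"prediction"
).write.mode("append").jdbc(url, table, props)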

? Read data from HDFS and store it

import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
case class Iris(features:org.apache.spark.ml.linalg.Vector,label:String)
// hdfs:// reads go through the NameNode RPC port (commonly 8020, per fs.defaultFS);
// 9870 is the NameNode web UI port, not the RPC port
val irisRDD = sc.textFile("hdfs://hadoop102:8020/pdwcs/iris.data.txt")

// Skip blank lines (iris.data ends with one), then parse the CSV fields
val data = irisRDD.filter(_.trim.nonEmpty).map { line =>
  val parts = line.split(",")
  Iris(
    Vectors.dense(parts(0).toDouble, parts(1).toDouble, parts(2).toDouble, parts(3).toDouble),
    parts(4)
  )
}
val df = data.toDF()
df.show()
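The heading also promises storage; a minimal write-back sketch, where the output path /pdwcs/iris_parquet is an assumption:

// Write the DataFrame back to HDFS as Parquet (output path is a placeholder)
df.write.mode("overwrite").parquet("hdfs://hadoop102:8020/pdwcs/iris_parquet")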
