文章目录
- 预处理test.csv和train.csv数据集
- 预测回头客
-
- 启动hadoop
- 启动MySQL服务
- 启动spark-shell
- 支持向量机SVM分类器预测回头客
预处理test.csv和train.csv数据集
预测回头客
启动hadoop
启动MySQL服务
启动spark-shell
# Unpack the MySQL JDBC connector archive into Spark's jars directory.
cd ~/下载/
unzip mysql-connector-java-5.1.40.zip -d /usr/local/spark/jars
# Start spark-shell from the Spark home, passing the connector jar through
# both --jars and --driver-class-path.
cd /usr/local/spark
MYSQL_JAR=/usr/local/spark/jars/mysql-connector-java-5.1.40/mysql-connector-java-5.1.40-bin.jar
./bin/spark-shell --jars "$MYSQL_JAR" --driver-class-path "$MYSQL_JAR"
支持向量机SVM分类器预测回头客
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vectors,Vector}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import java.util.Properties
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
// Read the preprocessed training and test CSV files as RDDs of text lines.
val train_data = sc.textFile("/dbtaobao/dataset/train_after.csv")
val test_data = sc.textFile("/dbtaobao/dataset/test_after.csv")
// Convert one comma-separated record into a LabeledPoint: column 4 is the label,
// columns 1 to 3 are the features; column 0 is not used.
val toLabeledPoint: String => LabeledPoint = { line =>
  val fields = line.split(',')
  // Parse the label first so malformed rows fail in the same order as before.
  val label = fields(4).toDouble
  val features = Vectors.dense(fields(1).toDouble, fields(2).toDouble, fields(3).toDouble)
  LabeledPoint(label, features)
}
val train = train_data.map(toLabeledPoint)
val test = test_data.map(toLabeledPoint)
// Train the SVM model with SVMWithSGD for numIterations iterations.
val numIterations = 1000
val model = SVMWithSGD.train(train, numIterations)
// Clear the model's threshold, then pair every test point's prediction with its
// true label as a "score label" string.
model.clearThreshold()
val scoreAndLabels = test.map { point =>
  s"${model.predict(point.features)} ${point.label}"
}
scoreAndLabels.foreach(line => println(line))
// Set the threshold to 0.0 and print the pairs again.
model.setThreshold(0.0)
scoreAndLabels.foreach(line => println(line))
// Clear the threshold again and recompute the "score label" strings for the test set.
model.clearThreshold()
val scoreAndLabels = test.map { point =>
  s"${model.predict(point.features)} ${point.label}"
}
// Split each string back into its two fields and wrap them in Rows that match a
// two-column schema whose columns are both nullable strings.
val rebuyRDD = scoreAndLabels.map(_.split(" "))
val schema = StructType(Seq("score", "label").map(name => StructField(name, StringType, true)))
val rowRDD = rebuyRDD.map(fields => Row(fields(0).trim, fields(1).trim))
val rebuyDF = spark.createDataFrame(rowRDD, schema)
// JDBC connection settings for the MySQL instance on localhost.
val prop = new Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "root")
prop.setProperty("driver", "com.mysql.jdbc.Driver")
// Append the predictions to table dbtaobao.rebuy.
rebuyDF.write.mode("append").jdbc("jdbc:mysql://localhost:3306/dbtaobao", "dbtaobao.rebuy", prop)
// NOTE(review): this repeats the preceding snippet with a different MySQL password
// ("123"); the write mode is "append", so each successful run adds the rows again.
// Clear the threshold and recompute the "score label" strings for the test set.
model.clearThreshold()
val scoreAndLabels = test.map { point =>
  s"${model.predict(point.features)} ${point.label}"
}
// Split each string back into its two fields and wrap them in Rows that match a
// two-column schema whose columns are both nullable strings.
val rebuyRDD = scoreAndLabels.map(_.split(" "))
val schema = StructType(Seq("score", "label").map(name => StructField(name, StringType, true)))
val rowRDD = rebuyRDD.map(fields => Row(fields(0).trim, fields(1).trim))
val rebuyDF = spark.createDataFrame(rowRDD, schema)
// JDBC connection settings for the MySQL instance on localhost.
val prop = new Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "123")
prop.setProperty("driver", "com.mysql.jdbc.Driver")
// Append the predictions to table dbtaobao.rebuy.
rebuyDF.write.mode("append").jdbc("jdbc:mysql://localhost:3306/dbtaobao", "dbtaobao.rebuy", prop)
- 因为是 Arm 架构，这里数据传输到 MySQL 时出了点问题