I. Before working through this example you will need:
1) Some basic familiarity with Scala syntax
2) A Spark environment on your local machine, ideally with Hadoop installed
II. A simple LR classification model
Step 1: Convert the data into LabeledPoint format. References: the ML data types page in the official Spark docs; any clear online introduction to Spark data processing.
Step 2: Call the Spark toolkit to run the algorithm. Reference: the logistic regression example in the official Spark docs.
The demo below runs in spark-shell.
scala> sc // spark-shell creates the variable sc, a SparkContext instance, by default
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@b5de9ac
// Read the data
scala> val rdd1 = sc.textFile("hdfs://bipcluster/user/platform_user/jiping.liu/dataSpark.csv")
rdd1: org.apache.spark.rdd.RDD[String] = hdfs://bipcluster/user/platform_user/jiping.liu/dataSpark.csv MapPartitionsRDD[1] at textFile at :24
scala> rdd1.first() // Spark evaluates lazily: computation only runs when an action such as first() is called, somewhat like TensorFlow
// The leading 0 is the label; the rest are features in LIBSVM index:value format
res1: String = 0 0:0.14447325 1:24.5 2:184.433 3:291.9 4:0.0382946 5:8.142114 6:2.8 7:65.86893....
// Data preprocessing
scala> :paste // command for writing a multi-line block in spark-shell
// Entering paste mode (ctrl-D to finish)
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

val dataPoint = rdd1.map { line =>
  val fields = line.split(" ")                    // Array[String]
  val label = fields(0).toDouble                  // extract the label
  val features = fields.slice(1, fields.length)   // the index:value feature entries
  // Note: Vectors.sparse expects 0-based, strictly increasing indices. The +1 below
  // shifts the file's raw indices up by one, which is safe only while the largest
  // raw index stays below 4999.
  val indices = features.map(x => x.split(":")(0).toInt + 1)
  val values = features.map(x => x.split(":")(1).toDouble)
  // 5000 is the number of features, i.e. the vector size that bounds the indices
  new LabeledPoint(label, Vectors.sparse(5000, indices, values))
}
// Exiting paste mode, now interpreting.
dataPoint: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[2] at map at :28
scala> dataPoint.first()
res2: org.apache.spark.mllib.regression.LabeledPoint =
(0.0,(5000,[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,55,56,57,58,59,60,61....
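As an aside: if the raw file followed the standard LIBSVM convention exactly (1-based, strictly increasing indices), MLlib could load it with no hand-written parsing at all. A minimal sketch reusing the same HDFS path (the file in this demo uses 0-based indices, which is why the manual mapping above was needed):
// loadLibSVMFile parses "label index:value ..." lines into RDD[LabeledPoint],
// converting the 1-based LIBSVM indices to the 0-based indices used by Vectors.sparse
scala> import org.apache.spark.mllib.util.MLUtils
scala> val data = MLUtils.loadLibSVMFile(sc, "hdfs://bipcluster/user/platform_user/jiping.liu/dataSpark.csv")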
// Import the model and evaluation classes
scala> import org.apache.spark.mllib.classification.{LogisticRegressionModel,LogisticRegressionWithLBFGS}
scala> import org.apache.spark.mllib.evaluation.MulticlassMetrics
// Split the dataset into train and test sets
scala> val splits = dataPoint.randomSplit(Array(0.6, 0.4), seed = 11L)
splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[3] at randomSplit at :32, MapPartitionsRDD[4] at randomSplit at :32)
scala> val train = splits(0)
scala> val test = splits(1)
scala> train.first()
res4: org.apache.spark.mllib.regression.LabeledPoint = (0.0,(5000,[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,55,56,57,58,59,60,61,62,63,69,...
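LBFGS is iterative and rescans the training set on every iteration, so it usually pays to cache it before training; a small sketch:
scala> train.cache() // keep the training partitions in memory across LBFGS iterations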
// Train the model
scala> val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(train)
18/06/29 19:23:08 WARN [com.github.fommil.netlib.BLAS(61) -- main]: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/06/29 19:23:08 WARN [com.github.fommil.netlib.BLAS(61) -- main]: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
model: org.apache.spark.mllib.classification.LogisticRegressionModel = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 5000, numClasses = 2, threshold = 0.5
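Note the threshold = 0.5 in the output above: predict() returns hard 0/1 labels. If you want raw class-1 probabilities instead (for ranking, or for choosing your own cutoff), the threshold can be cleared and restored; a small sketch:
scala> model.clearThreshold() // predict() now returns the class-1 probability instead of a 0/1 label
scala> model.setThreshold(0.5) // restore hard 0/1 predictions before the evaluation below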
// Evaluate the model on the test set
scala> :paste
// Entering paste mode (ctrl-D to finish)
val preAndtrue = test.map {
  // pair each test point's prediction with its true label
  case LabeledPoint(label, features) =>
    val prediction = model.predict(features)
    (prediction, label)
}
// Exiting paste mode, now interpreting.
preAndtrue: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[113] at map at :40
scala> val metrics = new MulticlassMetrics(preAndtrue)
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@689f9dc8
scala> preAndtrue.first()
res6: (Double, Double) = (0.0,0.0)
scala> val accuracy = metrics.accuracy
accuracy: Double = 0.885496183206106
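Accuracy alone can be misleading when the labels are imbalanced. MulticlassMetrics also exposes the confusion matrix and per-class precision/recall, and BinaryClassificationMetrics adds AUC; a short sketch on the same (prediction, label) pairs (for a meaningful ROC curve, clear the model threshold first so the scores are probabilities rather than hard 0/1 predictions):
scala> metrics.confusionMatrix // rows are true labels, columns are predicted labels
scala> (metrics.precision(1.0), metrics.recall(1.0)) // precision and recall for the positive class
scala> import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
scala> new BinaryClassificationMetrics(preAndtrue).areaUnderROC()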