数据源(file/data/mllib/input/ridge-data/defDemo1):
42,0.10
43.5,0.11
45,0.12
45.5,0.13
45,0.14
47.5,0.15
49,0.16
53,0.17
50,0.18
55,0.20
55,0.21
60,0.23
代码:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{ LinearRegressionWithSGD, LabeledPoint }
import org.apache.spark.{ SparkConf, SparkContext }
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setMaster("local").setAppName(this.getClass().getSimpleName().filter(!_.equals('$')))
val sc=new SparkContext(conf)
val data=sc.textFile("file/data/mllib/input/ridge-data/defDemo1")//获取数据集路径
val parsedData=data.map { line =>//开始对数据集处理
val parts=line.split(',')//根据逗号进行分区
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).trim().split(' ').map(_.toDouble)))
}//转化数据格式
//val parsedData = data.map { line => //开始对数据集处理
//val parts = line.split(',') //根据逗号进行分区
//LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).trim().split(' ').map(d=>{
//(d.toDouble-0.10)/(0.23-0.10) //归一化 (x-minX)/(maxX-minX)
//})))
//} //转化数据格式 归一化无效果
parsedData.foreach(line=>{
println(line.label+" , "+line.features)
})
val model= LinearRegressionWithSGD.train(parsedData,1000,0.001)//建立模型
val result=model.predict(Vectors.dense(0.19))//通过模型预测模型
println("model weights:")
println(model.weights)
println("model intercept:")
println(model.intercept)
println("result:")
println(result)//打印预测结果
sc.stop
}
运行结果:
model weights:
[0.11670307429843765]
model intercept:
0.0
result:
0.022173584116703154
实际线性函数(y=mx+n)应该接近:y=130.835x + 28.493
当x=0.19 时,y=53.35
LinearRegressionWithSGD 执行的结果跟实际结果函数对不上.....
相应的数据R语言执行的结果: