使用 Spark MLlib 实现 LinearRegression 的 Demo

package mlib.demo.linearegression
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint}
import org.apache.spark.{SparkContext, SparkConf}


/**
 * Created by master on 3/1/18.
 */
/**
 * Demo: train a linear regression model with Spark MLlib's SGD-based trainer
 * and report the Mean Squared Error on the training set.
 *
 * Expected input file format — one sample per line, "label,f1 f2 f3 ...":
 * {{{
 * -0.4307829,-1.63735562648104 -2.00621178480549 ... -0.864466507337306
 * -0.1625189,-1.98898046126935 -0.722008756122123 ... -0.864466507337306
 * }}}
 */
object LineaRegression {
  def main(args: Array[String]): Unit = {
    // Suppress Spark's INFO/WARN chatter so only errors reach the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    // Local single-JVM run environment.
    val conf = new SparkConf().setAppName("LineaRegression").setMaster("local")
    val sc = new SparkContext(conf)

    // Load and parse the data: split label from the space-separated feature
    // vector, producing LabeledPoints for the trainer.
    val data = sc.textFile("/opt/data/data4linearegression")
    val parsedData = data.map { line =>
      val parts = line.split(",") // was `var`, but never reassigned
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache() // reused by training, prediction, and the MSE count — avoid re-reading the file

    // Construct the model with Stochastic Gradient Descent (SGD, also known as
    // the Incremental Gradient Method, IGM). Compared with plain batch gradient
    // descent, SGD mitigates two problems: slow convergence and getting trapped
    // in local optima. (Maximum likelihood is the more common estimator; this
    // demo deliberately uses SGD.)
    // NOTE(review): LinearRegressionWithSGD is deprecated since Spark 2.0 —
    // prefer org.apache.spark.ml.regression.LinearRegression in new code.
    val numIterations = 100
    val model = LinearRegressionWithSGD.train(parsedData, numIterations)

    // Evaluate the model on the training data: pair each true label with the
    // model's prediction, then compute the Mean Squared Error.
    val valuesAndPreds = parsedData.map { point =>
      (point.label, model.predict(point.features))
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()

    println(s"training Mean Squared Error = $MSE")
    println(s"${model.weights}-------------------")
    sc.stop()
  }
}

你可能感兴趣的:(scala,MachineLearning,spark)