This section covers the basic data structures in MLlib: local vectors, labeled points, local matrices, and distributed matrices.
Local vectors come in dense and sparse forms, and both are created through the Vectors factory object (note that the factory is Vectors, not Vector):

import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Create a dense vector
val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
// Create a sparse vector (first form: size, index array, value array)
val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
// Create a sparse vector (second form: size, Seq of (index, value) pairs)
val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
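All three definitions describe the same three-element vector, which is easy to verify with Vector.toArray (a minimal check, assuming the definitions above are in scope):

// Both sparse forms expand to the same values as the dense vector
println(dv.toArray.mkString(", "))   // 1.0, 0.0, 3.0
println(sv1.toArray.mkString(", "))  // 1.0, 0.0, 3.0
println(sv2.toArray.mkString(", "))  // 1.0, 0.0, 3.0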
A labeled point pairs a label with a feature vector and is the input type for supervised learning:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))

For pos, the meaning of the first argument (1.0) is entirely up to you: it is the label, so it can encode a class index for classification, a target value for regression, or whatever special value your task needs.
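Following the same pattern, a negative example can carry a sparse feature vector (a sketch mirroring the positive example above):

// Label 0.0 for the negative class, features stored sparsely
val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))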
In practice, training data often comes as text in the LIBSVM format, one example per line:

label index1:value1 index2:value2 ...

where the indices are one-based and in ascending order. Such a file can be read in directly (the loader converts the indices to zero-based):

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

val test: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "path")
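The companion method MLUtils.saveAsLibSVMFile writes an RDD of labeled points back out in the same format (a sketch; the output directory name is a placeholder):

// Indices are written one-based again on the way out
MLUtils.saveAsLibSVMFile(test, "outputPath")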
Local matrices are built through the Matrices factory object. A dense matrix stores its entries in column-major order, so the array below is read one column at a time:

import org.apache.spark.mllib.linalg.{Matrix, Matrices}

val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
This produces the 3 x 2 matrix

1.0  2.0
3.0  4.0
5.0  6.0
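A quick index check confirms the column-major layout; Matrix exposes apply(i, j) for element access (a minimal check against the dm defined above):

println(dm(0, 1)) // 2.0: row 0, column 1
println(dm(2, 0)) // 5.0: row 2, column 0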
A RowMatrix is a row-oriented distributed matrix backed by an RDD of its rows, where each row is a local vector:

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

val rows: RDD[Vector] = ... // an RDD of local vectors
val mat: RowMatrix = new RowMatrix(rows)

// Query its dimensions
val m = mat.numRows()
val n = mat.numCols()
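A minimal end-to-end sketch, assuming a live SparkContext named sc and the Vectors factory from earlier (the data values are made up):

val rowData: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0),
  Vectors.dense(3.0, 4.0),
  Vectors.dense(5.0, 6.0)))
val smallMat = new RowMatrix(rowData)
println(smallMat.numRows()) // 3
println(smallMat.numCols()) // 2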
Column summary statistics can be computed directly on a RowMatrix:

import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary

val mat: RowMatrix = ... // a RowMatrix
val summary: MultivariateStatisticalSummary = mat.computeColumnSummaryStatistics()
println(summary.mean) // mean of each column
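The summary object carries more than the mean; MultivariateStatisticalSummary also exposes the following column-wise statistics:

println(summary.variance)    // variance of each column
println(summary.numNonzeros) // number of nonzeros in each column
println(summary.max)         // maximum of each column
println(summary.min)         // minimum of each column
println(summary.count)       // number of rows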
A CoordinateMatrix is a distributed matrix backed by an RDD of its entries, each entry being a (row, column, value) triple; it is meant for very sparse matrices:

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD

val entries: RDD[MatrixEntry] = ... // an RDD of matrix entries
val mat: CoordinateMatrix = new CoordinateMatrix(entries)
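A short sketch of building one and converting it (assuming a SparkContext sc; the entries are made-up values):

// Each MatrixEntry is (row: Long, col: Long, value: Double)
val someEntries = sc.parallelize(Seq(
  MatrixEntry(0, 0, 1.0),
  MatrixEntry(1, 1, 2.0),
  MatrixEntry(2, 0, 3.0)))
val coordMat = new CoordinateMatrix(someEntries)
println(coordMat.numRows()) // 3
println(coordMat.numCols()) // 2
// Drop the coordinates and fall back to a RowMatrix when entry positions are no longer needed
val rowMat = coordMat.toRowMatrix()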