// sc: SparkContext (assumed to be already created and in scope)
// Summary statistics (概要统计)
// Three observations with three columns each; every dense vector is one row of the data set.
val array: Array[Double] = Array(1.0, 0.0, 3.0)
val array2: Array[Double] = Array(1.1, 0.0, 3.2)
val array3: Array[Double] = Array(2.0, 0.0, 3.0)

val dv: Vector = Vectors.dense(array)
val dv2: Vector = Vectors.dense(array2)
val dv3: Vector = Vectors.dense(array3)

// Distribute the rows as an RDD so the column statistics can be computed by Spark.
val seqDv: Seq[Vector] = Seq(dv, dv2, dv3)
val observations: RDD[Vector] = sc.parallelize(seqDv)

val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)

// Mean of each column, as a dense vector.
println(summary.mean)
// Variance of each column.
println(summary.variance)
// Count of non-zero entries in each column.
println(summary.numNonzeros)
// Correlations (相关性分析)
// --- Correlation between two series of doubles ---
val seriesXList = List(9.5, 342.0, 23.0, 23.0, 43.0, 234.0, 34.0, 12.0, 6.0, 6.0)
val seriesYList = List(9.234, 34.0, 2.0, 2.0, 3.0, 34.0, 4.0, 2.0, 69.0, 63.0)

val seriesX: RDD[Double] = sc.parallelize(seriesXList)
// seriesY must have the same number of partitions and the same cardinality as seriesX.
val seriesY: RDD[Double] = sc.parallelize(seriesYList)
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// --- Correlation matrix over an RDD of row vectors ---
val dv1: Vector = Vectors.dense(1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1)
val dv2: Vector = Vectors.dense(1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2)
val dv3: Vector = Vectors.dense(1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 9.3, 10.3)
val dv4: Vector = Vectors.dense(1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4, 9.4, 10.4)
val observations = sc.parallelize(List(dv1, dv2, dv3, dv4))

// Pearson's method is requested explicitly here; pass "spearman" for Spearman's method.
// When no method is given, Pearson's method is the default.
val correlMatrix: Matrix = Statistics.corr(observations, "pearson")

// Print the scalar correlation first, then the correlation matrix.
println(correlation)
println(correlMatrix)
// Stratified sampling (分层抽样)
// Seven "test" records followed by a single "raw" record.
val rawdata: Seq[String] = Seq.fill(7)("test") :+ "raw"
val predata: RDD[String] = sc.parallelize(rawdata)

// Key every record by its own string so that each distinct string forms one stratum;
// the value 0.7 is just a constant payload.
val data: RDD[(String, Double)] = predata.map(s => s -> 0.7)

// Sampling fraction requested for each key (stratum).
val fractions: Map[String, Double] = Map("test" -> 0.5, "raw" -> 1.0)

// Draw one sample with sampleByKey and one with sampleByKeyExact, both without replacement
// and with the fixed seed 1L.
val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions, seed = 1L)
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions, seed = 1L)
// Clustering (聚类算法)
// Train a k-means model on three 3-dimensional points and print the resulting cluster centres.
val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 6.0),
  Vectors.dense(1.0, 3.0, 0.0),
  Vectors.dense(1.0, 4.0, 6.0)
))
// NOTE(review): `center` is not referenced by any code visible here; kept in case later code uses it.
val center = Vectors.dense(1.0, 3.0, 4.0)
// Ask for k = 3 clusters and allow a single iteration. The model is never reassigned, hence `val`.
val model = KMeans.train(data, k = 3, maxIterations = 1)
// `clusterCenters` is an Array; println on a JVM array prints only its type and hash code
// (e.g. "[Lorg...Vector;@1a2b3c"), so print every centre on its own line instead.
model.clusterCenters.foreach(println)
// Random data generation (随机数算法)
// Generate an RDD of 1,000,000 random doubles spread over 10 partitions.
// NOTE(review): assumes `normalRDD` is Spark MLlib's RandomRDDs.normalRDD (i.i.d. samples from
// the standard normal distribution N(0, 1)) — confirm against the file's imports.
val u = normalRDD(sc, size = 1000000L, numPartitions = 10)
// Affine transform of every sample: x => 1.0 + 2.0 * x (under the assumption above, N(1, 4)).
val v = u.map(1.0 + 2.0 * _)