package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)        // mean of each column
    println(summary.variance)    // column-wise variance
    println(summary.numNonzeros) // number of nonzeros in each column

    sc.stop()
  }
}
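Besides mean, variance, and numNonzeros, MultivariateStatisticalSummary also exposes count, max, min, normL1, and normL2. A short sketch of how the same summary object could be queried (these lines would sit inside main, before sc.stop(); the comments describe the documented fields):

    println(summary.count)   // number of observation rows (3 here)
    println(summary.max)     // column-wise maximum
    println(summary.min)     // column-wise minimum
    println(summary.normL1)  // column-wise L1 norm
    println(summary.normL2)  // column-wise L2 norm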
// A DataFrame-based variant of the same summary statistics; `spark` here refers to an already created SparkSession.
def testDataFrame(): Unit = {
  val df = spark.read.option("header", true).csv("hdfs://192.168.179.14:8020/mlDataSet/catering_sale.csv")
  val df1 = df.select(df("sale_date").cast("String"), df("sale_amt").cast("Double"))
  df1.printSchema()
  df1.show(10)
  df1.createOrReplaceTempView("catering_sale")
  df1.describe().show()
}
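The testDataFrame snippet above assumes a SparkSession named `spark` is already in scope. A minimal sketch of how it could be wrapped into a runnable program; the object name and the local master are assumptions, not from the original snippet:

import org.apache.spark.sql.SparkSession

object CateringSaleDescribe {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")                // assumption: run locally
      .appName("CateringSaleDescribe")   // illustrative application name
      .getOrCreate()

    testDataFrame(spark)

    spark.stop()
  }

  // Same logic as the snippet above, with the SparkSession passed in explicitly.
  def testDataFrame(spark: SparkSession): Unit = {
    val df = spark.read.option("header", true)
      .csv("hdfs://192.168.179.14:8020/mlDataSet/catering_sale.csv")
    val df1 = df.select(df("sale_date").cast("String"), df("sale_amt").cast("Double"))
    df1.describe().show()
  }
}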
package org.apache.spark.examples.ml

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer
import org.apache.spark.sql.SparkSession

object SummarizerExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("SummarizerExample")
      .getOrCreate()

    val data = Seq(
      (Vectors.dense(2.0, 3.0, 5.0), 1.0),
      (Vectors.dense(4.0, 6.0, 7.0), 2.0)
    )

    import spark.implicits._
    val df = data.toDF("features", "weight")

    import Summarizer._

    val (meanVal, varianceVal) = df.select(metrics("mean", "variance")
      .summary($"features", $"weight").as("summary"))
      .select("summary.mean", "summary.variance")
      .as[(Vector, Vector)].first()

    println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}")

    val (meanVal2, varianceVal2) = df.select(mean($"features"), variance($"features"))
      .as[(Vector, Vector)].first()

    println(s"without weight: mean = ${meanVal2}, variance = ${varianceVal2}")

    spark.stop()
  }
}
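Summarizer supports more metrics than mean and variance (for example count, numNonZeros, max, min, normL1, normL2), and they can all be computed in a single pass. A sketch that would sit inside the same main method, reusing the df defined above; the alias "stats" is illustrative:

    // Sketch: additional Summarizer metrics computed in one pass (alias "stats" is illustrative).
    val extraStats = df.select(
        metrics("count", "max", "min")
          .summary($"features").as("stats"))
      .select("stats.count", "stats.max", "stats.min")
      .first()
    println(s"count = ${extraStats.getLong(0)}, max = ${extraStats.getAs[Vector](1)}, min = ${extraStats.getAs[Vector](2)}")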
Spark currently supports two correlation coefficients: the Pearson correlation coefficient ("pearson") and the Spearman rank correlation coefficient ("spearman"). A correlation coefficient is a statistic that measures the strength of the relationship between two variables. Its value lies in [-1, 1]: a value of 0 means the variables are uncorrelated, values in [-1, 0) indicate a negative correlation, and values in (0, 1] indicate a positive correlation.

The Pearson correlation coefficient is

r = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{\sqrt{n\sum x_i^2 - (\sum x_i)^2}\,\sqrt{n\sum y_i^2 - (\sum y_i)^2}}

where n is the sample size. The Pearson coefficient measures the linear correlation between two numeric variables and is generally suited to normally distributed data.

The Spearman coefficient also measures the correlation between two variables, but it does not impose Pearson's strict assumptions on the distribution of the variables. When there are no tied ranks, the Spearman correlation coefficient is

\rho = 1 - \frac{6\sum d_i^2}{n(n^2 - 1)}

where d_i is the difference between the ranks of the i-th pair of observations; with ties, it is computed as the Pearson coefficient of the (average) ranks.
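To make the relationship concrete, here is a minimal plain-Scala sketch (no Spark required; the object and helper names are illustrative) that computes Pearson directly from the formula above and Spearman by applying the same formula to the ranks, using the same two series as CorrelationsExample below:

object CorrelationSketch {
  // Pearson correlation of two equally sized sequences, following the formula above.
  def pearson(x: Seq[Double], y: Seq[Double]): Double = {
    val n = x.size.toDouble
    val sumXY = x.zip(y).map { case (a, b) => a * b }.sum
    val num = n * sumXY - x.sum * y.sum
    val den = math.sqrt(n * x.map(v => v * v).sum - x.sum * x.sum) *
              math.sqrt(n * y.map(v => v * v).sum - y.sum * y.sum)
    num / den
  }

  // 1-based ranks; tied values share the average of their positions.
  def ranks(x: Seq[Double]): Seq[Double] = {
    val sorted = x.zipWithIndex.sortBy(_._1)           // (value, originalIndex) ordered by value
    val rankOf: Map[Int, Double] = sorted.zipWithIndex // ((value, originalIndex), sortedPos)
      .groupBy { case ((value, _), _) => value }
      .values
      .flatMap { group =>
        val avgRank = group.map { case (_, pos) => pos + 1.0 }.sum / group.size
        group.map { case ((_, origIdx), _) => origIdx -> avgRank }
      }
      .toMap
    x.indices.map(rankOf)
  }

  // Spearman = Pearson applied to the ranks.
  def spearman(x: Seq[Double], y: Seq[Double]): Double =
    pearson(ranks(x), ranks(y))

  def main(args: Array[String]): Unit = {
    val x = Seq(1.0, 2.0, 3.0, 3.0, 5.0)
    val y = Seq(11.0, 22.0, 33.0, 33.0, 555.0)
    println(f"pearson  = ${pearson(x, y)}%.4f")  // pulled down by the outlier 555
    println(f"spearman = ${spearman(x, y)}%.4f") // ranks agree, so close to 1.0
  }
}

Because the ranks of the two series are identical, Spearman is essentially 1.0 even though the outlier 555 drags Pearson down; this matches the output printed by CorrelationsExample below.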
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object CorrelationsExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("CorrelationsExample")
    val sc = new SparkContext(conf)

    val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
    // must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
    // method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    val correlation1: Double = Statistics.corr(seriesX, seriesY, "spearman")
    println(s"Correlation is: $correlation")
    println(s"Correlation1 is: $correlation1")
    // Correlation is: 0.8500286768773007
    // Correlation1 is: 1.0000000000000002

    // val data: RDD[Vector] = sc.parallelize(
    //   Seq(
    //     Vectors.dense(1.0, 10.0, 100.0),
    //     Vectors.dense(2.0, 20.0, 200.0),
    //     Vectors.dense(5.0, 33.0, 366.0))
    // ) // note that each Vector is a row and not a column
    //
    // // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method
    // // If a method is not specified, Pearson's method will be used by default.
    // val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    // println(correlMatrix.toString)

    sc.stop()
  }
}
package org.apache.spark.examples.ml

import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession

/**
 * An example for computing correlation matrix.
 * Run with
 * {{{
 * bin/run-example ml.CorrelationExample
 * }}}
 */
object CorrelationExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("CorrelationExample")
      .getOrCreate()
    import spark.implicits._

    val data = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
    )

    val df = data.map(Tuple1.apply).toDF("features")
    val Row(coeff1: Matrix) = Correlation.corr(df, "features").head
    println(s"Pearson correlation matrix:\n $coeff1")

    // val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head
    // println(s"Spearman correlation matrix:\n $coeff2")

    spark.stop()
  }
}