1. 读取SparkSQL的数据进行统计实战
import org.apache.spark.mllib.linalg.{
Vector, Vectors}
import org.apache.spark.mllib.stat.{
MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.{
SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
/**
 * Demo 1: column statistics (min/max/mean/variance/non-zero count) over a
 * single-column CSV of sepal lengths, using the RDD-based mllib API.
 */
object _01SpetalLengthStaticesDemo {
  def main(args: Array[String]): Unit = {
    // Local session using all cores; WARN log level keeps the console readable.
    val conf: SparkConf = new SparkConf().setAppName("IrisSparkCoreLoader").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    val datapath = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\length.csv"
    // One numeric value per line -> wrap each Double in a 1-element dense vector,
    // because Statistics.colStats operates on RDD[Vector].
    val data: RDD[Vector] = sc.textFile(datapath).map(_.toDouble).map(x => Vectors.dense(x))

    // colStats computes every summary below in a single pass over the data.
    val stats: MultivariateStatisticalSummary = Statistics.colStats(data)
    // Use string interpolation: println("label:", x) would print a (label, x) tuple.
    println(s"stats nonzeros: ${stats.numNonzeros}")
    println(s"stats min: ${stats.min}")
    println(s"stats max: ${stats.max}")
    println(s"stats mean: ${stats.mean}")
    println(s"stats variance: ${stats.variance}")

    spark.stop() // release local Spark resources
  }
}
import org.apache.spark.mllib.linalg.{
Vector, Vectors}
import org.apache.spark.mllib.stat.{
MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Demo 2: column statistics on the first iris column, plus the Pearson
 * correlation between sepal length (col 0) and petal length (col 2).
 */
object _02irisDataStaticesDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val path = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.data"
    // Read and split the file ONCE and cache it; the original re-read the same
    // file three times. Both column extractions below reuse this RDD.
    val rows: RDD[Array[String]] = sc.textFile(path).map(_.split(",")).cache()

    val sepalLength: RDD[Double] = rows.map(_(0).toDouble)
    val data: RDD[Vector] = sepalLength.map(x => Vectors.dense(x))

    val stats: MultivariateStatisticalSummary = Statistics.colStats(data)
    // String interpolation instead of accidental tuple printing.
    println(s"stats nonzeros: ${stats.numNonzeros}")
    println(s"stats min: ${stats.min}")
    println(s"stats max: ${stats.max}")
    println(s"stats mean: ${stats.mean}")
    println(s"stats variance: ${stats.variance}")

    // Pearson correlation (the default) between sepal length and petal length.
    val petalLength: RDD[Double] = rows.map(_(2).toDouble)
    val corr1: Double = Statistics.corr(sepalLength, petalLength)
    println(s"data1 and data2 corr value is: $corr1")

    spark.stop() // release local Spark resources
  }
}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.{
DataFrame, SparkSession}
/**
 * Demo 3: DataFrame-based correlation matrix using the ml (not mllib) API:
 * assemble the four iris measurements into a vector column, then
 * Correlation.corr computes the 4x4 Pearson matrix.
 */
object _03IrisSparkSQLStaticesDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    val path = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.csv"
    // inferSchema makes the four measurement columns numeric (required by
    // VectorAssembler); without it every CSV column loads as string.
    val valueDF: DataFrame = spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", true)
      .load(path)
    valueDF.printSchema()
    valueDF.show()

    // Fixed the misspelled output column name ("feaures" -> "features").
    val vec: VectorAssembler = new VectorAssembler()
      .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
      .setOutputCol("features")
    val vecResult: DataFrame = vec.transform(valueDF)

    val corr: DataFrame = Correlation.corr(vecResult, "features", "pearson")
    println("corr matrix is:")
    corr.show(false) // show(false) avoids truncating the matrix cells

    spark.stop() // release local Spark resources
  }
}
2.特征工程实践
- 1-对数据有敏感性(搞大数据要对数据有想法)
- 2-特征工程分类
- 特征抽取
- 特征选择
- 特征转换-----重要
- 特征降维
特征工程案例:
Iris部分数据集展示:
![关于SparkMllib特征工程的案例详解(自己看的)_第1张图片](http://img.e-com-net.com/image/info8/8e88ef766d0d4ad69c5ba7c3d83c54e4.jpg)
/**
 * Feature-engineering demo on the iris dataset:
 *  - StringIndexer: encode the string class label as a numeric index
 *  - VectorAssembler: merge the four measurements into one vector column
 *  - ChiSqSelector: keep the 3 features most associated with the label
 *  - PCA: project the 4-dim feature vector down to 2 components
 */
object IrisSparkSQLFeaturesEngineer {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("IrisSparkCoreLoader").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    val datapath = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.csv"
    val data: DataFrame = spark.read.format("csv").option("header", "true").option("inferSchema", true).load(datapath)
    data.printSchema()
    data.show(false)

    // Encode the string label into a numeric "labelclass" column
    // (indices assigned by descending frequency).
    val strIndex: StringIndexer = new StringIndexer().setInputCol("class").setOutputCol("labelclass")
    val strModel: StringIndexerModel = strIndex.fit(data)
    val strResult: DataFrame = strModel.transform(data)
    strResult.show(false)

    // Four equivalent column-selection styles on a DataFrame.
    data.select("sepal_length").show(false)
    data.select($"sepal_length").show(false)
    data.select(col("sepal_length"))
    data.select($"sepal_length", col("sepal_width")).show(false)

    // Assemble on strResult (not the raw data) so the numeric "labelclass"
    // column is available downstream for ChiSqSelector.
    val vec: VectorAssembler = new VectorAssembler()
      .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
      .setOutputCol("features")
    val vecResult: DataFrame = vec.transform(strResult)

    // BUG FIX: ChiSqSelector needs a numeric label column; the original passed
    // the raw string column "class", which fails at fit time. Use "labelclass".
    val chi: ChiSqSelector = new ChiSqSelector().setFeaturesCol("features").setLabelCol("labelclass").setNumTopFeatures(3)
    val chiModel: ChiSqSelectorModel = chi.fit(vecResult)
    val chiResult: DataFrame = chiModel.transform(vecResult)
    chiResult.show(false)

    println("pca transformation:")
    // Project the 4-dim feature vectors onto the top 2 principal components.
    val pca: PCA = new PCA().setInputCol("features").setOutputCol("pca_features").setK(2)
    val pcaModel: PCAModel = pca.fit(vecResult)
    pcaModel.transform(vecResult).show(false)

    spark.stop() // release local Spark resources
  }
}