*This series, "Spark MLlib Machine Learning", is written against the latest Spark 2.3.0 and references the book "Spark MLlib Machine Learning" by Huang Meiling. Please credit the source when reposting.
GitHub: https://github.com/future-fu/learnsparkmllib
1. Selected Spark RDD operations
A few RDD operators that are less commonly used but quite interesting; the code is as follows:
package ai.lifecode.datahandle

import ai.lifecode.utils.ShowRddDetails
import org.apache.spark.sql.SparkSession

/**
  * Created by future_fu on 2018/8/31 16:05.
  */
object RDDHandle {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[3]")
      .appName("rdd handle")
      .getOrCreate()
    val sc = spark.sparkContext

    // zip: pair the elements of two RDDs one-to-one
    val zipRdd1 = sc.parallelize(Array(1, 2, 3, 4), 3)
    val zipRdd2 = sc.parallelize(Array("a", "b", "c", "d"), 3)
    val zipRdd = zipRdd1.zip(zipRdd2)
    ShowRddDetails.showRDD(zipRdd, "zipRdd", true)

    // subtract: remove the elements of the second RDD from the first
    val subRdd1 = sc.parallelize(1 to 9, 3)
    val subRdd2 = sc.parallelize(1 to 3, 3)
    val subRdd = subRdd1.subtract(subRdd2)
    ShowRddDetails.showRDD(subRdd, "subRdd", true)

    // randomSplit: split the RDD into several RDDs according to the given weights
    val randomSplitRdd1 = sc.parallelize(1 to 9, 3)
    val randomSplitRdd = randomSplitRdd1.randomSplit(Array(0.2, 0.3, 0.2, 0.3))
    ShowRddDetails.showRDD(randomSplitRdd(0), "randomSplitRdd 0", true)
    ShowRddDetails.showRDD(randomSplitRdd(1), "randomSplitRdd 1", true)
    ShowRddDetails.showRDD(randomSplitRdd(2), "randomSplitRdd 2", true)
    ShowRddDetails.showRDD(randomSplitRdd(3), "randomSplitRdd 3", true)
  }
}
Output:
=== [class ai.lifecode.utils.ShowRddDetails]: zipRdd ===
(1,a)(2,b)(3,c)(4,d)
=== [class ai.lifecode.utils.ShowRddDetails]: subRdd ===
694758
=== [class ai.lifecode.utils.ShowRddDetails]: randomSplitRdd 0 ===
8
=== [class ai.lifecode.utils.ShowRddDetails]: randomSplitRdd 1 ===
1237
=== [class ai.lifecode.utils.ShowRddDetails]: randomSplitRdd 2 ===
4
=== [class ai.lifecode.utils.ShowRddDetails]: randomSplitRdd 3 ===
569
2. MLlib Statistics operations
The code is as follows, using the data source sample_stat.txt:
1 2 3 4 5
6 7 1 5 9
3 5 6 3 1
3 1 1 5 6
package ai.lifecode.datahandle

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SparkSession

/**
  * Created by future_fu on 2018/8/31 16:52.
  */
object StatisticsData {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[3]")
      .appName("rdd handle")
      .getOrCreate()
    val sc = spark.sparkContext

    val path = "data/sample_stat.txt"
    val data = sc.textFile(path)
      .map(_.split("\t"))
      .map(f => f.map(_.toDouble))
      .map(f => Vectors.dense(f))

    // 1. Column-wise max, min, mean, variance, L1 norm and L2 norm
    val stat1 = Statistics.colStats(data)
    println("☆┈━═┈━═┈━═┈━═┈━═☆Common statistics☆┈━═┈━═┈━═┈━═┈━═☆")
    println(stat1.max)
    println(stat1.min)
    println(stat1.mean)
    println(stat1.variance)
    println(stat1.normL1)
    println(stat1.normL2)

    // 2. Correlation coefficients, valued in [-1, 1]: 0 means no correlation,
    //    negative values indicate negative correlation, positive values positive correlation
    val corr1 = Statistics.corr(data, "pearson")
    val corr2 = Statistics.corr(data, "spearman")
    val x1 = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0))
    val x2 = sc.parallelize(Array(5.0, 6.0, 6.0, 6.0))
    val corr3 = Statistics.corr(x1, x2, "pearson")
    println("☆┈━═┈━═┈━═┈━═┈━═☆Correlation coefficients☆┈━═┈━═┈━═┈━═┈━═☆")
    println(corr1)
    println(corr2)
    println(corr3)

    // 3. Hypothesis testing: MLlib supports Pearson's chi-squared test for
    //    goodness of fit and for independence. The input type decides which
    //    test is run: a Vector triggers a goodness-of-fit test, a Matrix an
    //    independence test.
    val v1 = Vectors.dense(43.0, 9.0)
    val v2 = Vectors.dense(44.0, 4.0)
    val c1 = Statistics.chiSqTest(v1, v2)
    println("☆┈━═┈━═┈━═┈━═┈━═☆Hypothesis testing☆┈━═┈━═┈━═┈━═┈━═☆")
    println(v1)
    println(v2)
    println(c1)
  }
}
Output:
☆┈━═┈━═┈━═┈━═┈━═☆Common statistics☆┈━═┈━═┈━═┈━═┈━═☆
[6.0,7.0,6.0,5.0,9.0]
[1.0,1.0,1.0,3.0,1.0]
[3.25,3.75,2.75,4.25,5.25]
[4.25,7.583333333333333,5.583333333333333,0.9166666666666666,10.916666666666666]
[13.0,15.0,11.0,17.0,21.0]
[7.416198487095663,8.888194417315589,6.855654600401044,8.660254037844387,11.958260743101398]
☆┈━═┈━═┈━═┈━═┈━═☆Correlation coefficients☆┈━═┈━═┈━═┈━═┈━═☆
1.0 0.7779829610026362 -0.39346431156047523 ... (5 total)
0.7779829610026362 1.0 0.14087521363240252 ...
-0.39346431156047523 0.14087521363240252 1.0 ...
0.4644203640128242 -0.09482093118615205 -0.9945577827230707 ...
0.5750122832421579 0.19233705001984078 -0.9286374704669208 ...
1.0 0.632455532033675 -0.5000000000000001 ... (5 total)
0.632455532033675 1.0 0.10540925533894883 ...
-0.5000000000000001 0.10540925533894883 1.0 ...
0.5000000000000001 -0.10540925533894883 -1.0000000000000002 ...
0.6324555320336723 0.20000000000000429 -0.9486832980505085 ...
0.7745966692414775
☆┈━═┈━═┈━═┈━═┈━═☆Hypothesis testing☆┈━═┈━═┈━═┈━═┈━═☆
[43.0,9.0]
[44.0,4.0]
Chi squared test summary:
method: pearson
degrees of freedom = 1
statistic = 5.482517482517483
pValue = 0.01920757707591003
Strong presumption against null hypothesis: observed follows the same distribution as expected..
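The example above only exercises the goodness-of-fit form (Vector input). Below is a minimal sketch of the independence form, which takes a Matrix of observed counts; the 2x2 contingency table is made-up data purely for illustration.
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.stat.Statistics

object ChiSqIndependence {
  def main(args: Array[String]): Unit = {
    // 2x2 contingency table of observed counts, stored column-major;
    // the numbers are invented for illustration only.
    val observed = Matrices.dense(2, 2, Array(43.0, 9.0, 44.0, 4.0))
    // With a Matrix argument, chiSqTest runs Pearson's independence test.
    println(Statistics.chiSqTest(observed))
  }
}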
3. MLlib data formats
The util package contains the following objects:
MLUtils handles ETL for the MLlib algorithms and supplies them with data; its main methods are loadLibSVMFile, saveAsLibSVMFile, appendBias and fastSquaredDistance.
KMeansDataGenerator generates sample data for k-means models; its only method is generateKMeansRDD (see the sketch after this list).
LogisticRegressionDataGenerator generates sample data for logistic regression models; its only method is generateLogisticRDD.
SVMDataGenerator wraps SVM sample generation.
MFDataGenerator wraps matrix factorization (MF) sample generation.
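As a quick sketch of one of these generators, the snippet below calls KMeansDataGenerator.generateKMeansRDD to produce synthetic points; the parameter values (100 points, 3 centers, 2 dimensions, scaling factor 1.0, 3 partitions) are arbitrary and assume the Spark 2.3.0 signature.
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.sql.SparkSession

object GenerateKMeansSample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[3]")
      .appName("kmeans data generator")
      .getOrCreate()
    val sc = spark.sparkContext

    // numPoints = 100, k = 3, d = 2 dimensions, scaling factor r = 1.0, 3 partitions;
    // the result is an RDD[Array[Double]] with one point per element.
    val points = KMeansDataGenerator.generateKMeansRDD(sc, 100, 3, 2, 1.0, 3)
    points.take(5).foreach(p => println(p.mkString(",")))

    spark.stop()
  }
}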
3.1 loadLibSVMFile loads a file in the LIBSVM format; its behavior can be read from the comments in the source code.
package ai.lifecode.datahandle

import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

/**
  * Created by future_fu on 2018/9/3 11:42.
  */
object DataFormat {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[3]")
      .appName("rdd handle")
      .getOrCreate()
    val sc = spark.sparkContext

    // 1. loadLibSVMFile: load a LIBSVM-format file as an RDD[LabeledPoint]
    val path = "data/sample_libsvm_data.txt"
    val data = MLUtils.loadLibSVMFile(sc, path)
    println(data.collect().length)
    println(data.take(2).toList)
  }
}
Output:
100
List((0.0,(692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0,114.0,253.0,228.0,47.0,79.0,255.0,168.0,48.0,238.0,252.0,252.0,179.0,12.0,75.0,121.0,21.0,253.0,243.0,50.0,38.0,165.0,253.0,233.0,208.0,84.0,253.0,252.0,165.0,7.0,178.0,252.0,240.0,71.0,19.0,28.0,253.0,252.0,195.0,57.0,252.0,252.0,63.0,253.0,252.0,195.0,198.0,253.0,190.0,255.0,253.0,196.0,76.0,246.0,252.0,112.0,253.0,252.0,148.0,85.0,252.0,230.0,25.0,7.0,135.0,253.0,186.0,12.0,85.0,252.0,223.0,7.0,131.0,252.0,225.0,71.0,85.0,252.0,145.0,48.0,165.0,252.0,173.0,86.0,253.0,225.0,114.0,238.0,253.0,162.0,85.0,252.0,249.0,146.0,48.0,29.0,85.0,178.0,225.0,253.0,223.0,167.0,56.0,85.0,252.0,252.0,252.0,229.0,215.0,252.0,252.0,252.0,196.0,130.0,28.0,199.0,252.0,252.0,253.0,252.0,252.0,233.0,145.0,25.0,128.0,252.0,253.0,252.0,141.0,37.0])), (1.0,(692,[158,159,160,161,185,186,187,188,189,213,214,215,216,217,240,241,242,243,244,245,267,268,269,270,271,295,296,297,298,322,323,324,325,326,349,350,351,352,353,377,378,379,380,381,404,405,406,407,408,431,432,433,434,435,459,460,461,462,463,486,487,488,489,490,514,515,516,517,518,542,543,544,545,569,570,571,572,573,596,597,598,599,600,601,624,625,626,627,652,653,654,655,680,681,682,683],[124.0,253.0,255.0,63.0,96.0,244.0,251.0,253.0,62.0,127.0,251.0,251.0,253.0,62.0,68.0,236.0,251.0,211.0,31.0,8.0,60.0,228.0,251.0,251.0,94.0,155.0,253.0,253.0,189.0,20.0,253.0,251.0,235.0,66.0,32.0,205.0,253.0,251.0,126.0,104.0,251.0,253.0,184.0,15.0,80.0,240.0,251.0,193.0,23.0,32.0,253.0,253.0,253.0,159.0,151.0,251.0,251.0,251.0,39.0,48.0,221.0,251.0,251.0,172.0,234.0,251.0,251.0,196.0,12.0,253.0,251.0,251.0,89.0,159.0,255.0,253.0,253.0,31.0,48.0,228.0,253.0,247.0,140.0,8.0,64.0,251.0,253.0,220.0,64.0,251.0,253.0,220.0,24.0,193.0,253.0,220.0])))
3.2 saveAsLibSVMFile
Saves LIBSVM-format data to the specified directory.
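A minimal sketch, assuming the sample file from 3.1 and a placeholder output directory data/libsvm_out (which must not already exist):
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object SaveLibSVM {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[3]")
      .appName("save libsvm")
      .getOrCreate()
    val sc = spark.sparkContext
    // Load the LIBSVM sample as RDD[LabeledPoint], then write it back out in LIBSVM format.
    val data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    MLUtils.saveAsLibSVMFile(data, "data/libsvm_out")
    spark.stop()
  }
}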
3.3 appendBias
Appends a bias term to a vector; used in regression and classification computations.
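A small sketch of the effect on a dense vector; the expected result in the comment reflects my understanding that a bias element of 1.0 is appended at the end.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils

object AppendBiasDemo {
  def main(args: Array[String]): Unit = {
    val v = Vectors.dense(1.0, 2.0, 3.0)
    // appendBias adds a constant bias element to the end of the vector.
    println(MLUtils.appendBias(v)) // expected: [1.0,2.0,3.0,1.0]
  }
}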
3.4 fastSquaredDistance
Computes distances quickly, returning the squared Euclidean distance; used internally by k-means.
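As far as I can tell, fastSquaredDistance is package-private to MLlib, so user code typically obtains the same squared Euclidean distance through the public Vectors.sqdist; a small sketch:
import org.apache.spark.mllib.linalg.Vectors

object SquaredDistanceDemo {
  def main(args: Array[String]): Unit = {
    val a = Vectors.dense(1.0, 2.0, 3.0)
    val b = Vectors.dense(4.0, 6.0, 3.0)
    // Squared Euclidean distance: (4-1)^2 + (6-2)^2 + (3-3)^2 = 25.0
    println(Vectors.sqdist(a, b))
  }
}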