package mlib
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
object UsefulRddOpts {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.master("local[*]")
.appName("UseFulRddOpts")
.getOrCreate()
val sc = spark.sparkContext
showDiffReduceByKeyAndGroupByKey(sc)
}
/**
* Prefer reduceByKey over groupByKey: reduceByKey combines values within each
* partition before the shuffle, which reduces the amount of data transferred.
* Think of it as map-side pre-aggregation, similar to a combiner in MapReduce.
* @param sc the SparkContext
*/
def showDiffReduceByKeyAndGroupByKey(sc : SparkContext) = {
val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
(3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
val result01 = data.reduceByKey(_ + ":" + _)
val result02 = data.groupByKey()
result01.foreach(println)
result02.foreach(println)
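// Illustrative sketch (not part of the original): the same concatenation built with
// groupByKey + mapValues. Every value is shuffled across the network first, whereas
// reduceByKey above pre-combines values inside each partition before the shuffle.
val viaGroupByKey = data.groupByKey().mapValues(_.mkString(":"))
viaGroupByKey.foreach(println)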
}
/**
* Usage of reduce.
* @param sc the SparkContext
*/
def showUseOfReduce(sc : SparkContext) = {
val data = sc.parallelize(Array("123", "456", "789"), 1)
val reduce01 = data.reduce((str01, str02) => str01 + str02)
val reduce02 = data.reduce(_ + _)
println(reduce01 + ":" + reduce02)
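// Illustrative sketch (added): reduce also works on numeric RDDs. The function must be
// associative and commutative, because partition results are merged in no fixed order.
val sum = sc.parallelize(1 to 5).reduce(_ + _)
println(sum) // expected: 15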
}
/**
* Usage of keyBy: derives a key for every element.
* @param sc the SparkContext
*/
def showUseOfKeyBy(sc : SparkContext) = {
val data = sc.parallelize(Array("123dlalda", "hello,world", "hi, man."))
val result = data.keyBy(str => str.length)
result.foreach(println)
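// Illustrative sketch (added): keyBy(f) is just a convenience for mapping each element
// x to the pair (f(x), x), as the equivalent map below shows.
val sameAsKeyBy = data.map(str => (str.length, str))
sameAsKeyBy.foreach(println)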
}
/**
* Usage of groupBy and groupByKey.
* @param sc the SparkContext
*/
def showUseOfGroupByAndGroupByKey(sc : SparkContext) = {
val data = sc.parallelize(Array(1,2,3,4,5,6,7,8), 1)
val result01 = data.groupBy(x => {if(x > 3) "gt3" else "lte3"})
result01.foreach(println)
val data02 = sc.parallelize(1 to 9)
val result02 = data02.groupBy(x => {if(x % 2 == 0) "double" else "single"})
result02.foreach(println)
val data03 = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
(3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
val result03 = data03.groupByKey()
result03.foreach(println)
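// Illustrative sketch (added): groupBy(f) behaves like keyBy(f) followed by
// groupByKey(), so result04 should match result01 above.
val result04 = data.keyBy(x => if (x > 3) "gt3" else "lte3").groupByKey()
result04.foreach(println)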
}
/**
* Demonstrates map and flatMap.
* @param sc the SparkContext
*/
def showUseOfMapAndFlatMap(sc : SparkContext) = {
val data = sc.parallelize(Array("hello,world", "bilibili,acfun", "hehe,intresting"), 1)
val map = data.map(x => x.split(","))
map.foreach(x => {println(x(0) +" " + x(1))})
val flatMap = data.flatMap(x => x.split(","))
flatMap.foreach(println)
val data01 = sc.parallelize(Array(1, 2, 3, 4, 5))
val result01 = data01.map(x => List(x + 1)).collect()     // Array(List(2), List(3), List(4), List(5), List(6))
val result02 = data01.flatMap(x => List(x + 1)).collect() // Array(2, 3, 4, 5, 6)
println(result01.mkString(", ") + " | " + result02.mkString(", "))
}
/**
* Usage of filter: note that filter keeps the elements that satisfy the predicate.
* @param sc the SparkContext
*/
def showUseOfFilter(sc : SparkContext) = {
val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
(3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
val result = data.filter(item => "hh".equals(item._2))
result.foreach(println)
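// Illustrative sketch (added): filter keeps the elements for which the predicate is
// true; negate the predicate to drop them instead.
val withoutHh = data.filter(item => !"hh".equals(item._2))
withoutHh.foreach(println)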
}
/**
* Deduplication with distinct.
* @param sc the SparkContext
*/
def showUseOfDistinct(sc : SparkContext) = {
val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
(3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
val result = data.distinct()
result.foreach(println)
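// Note (added): the duplicate (1, "susun") pair is removed, so of the six input
// elements five should remain; distinct performs a shuffle under the hood.
println(result.count()) // expected: 5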
}
/**
* Usage of countByKey.
* @param sc the SparkContext
*/
def showUseOfCountByKey(sc : SparkContext) = {
val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
(3, "susum"), (4, "nina"), (1, "hh")))
val result = data.countByKey()
result.foreach(println)
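// Note (added): countByKey is an action that returns a Scala Map[Int, Long] on the
// driver, so it is only suitable when the number of distinct keys is small.
println(result.getOrElse(1, 0L)) // expected: 2, since key 1 appears twice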
}
/**
* Usage of countByValue: counts how many times each value occurs.
* @param sc the SparkContext
*/
def showUseOfCountByValue(sc : SparkContext) = {
val data = sc.parallelize(List(0, 1, 2, 3, 0, 3, 4, 5, 6, 7), 1)
val result = data.countByValue()
result.foreach(println)
}
/**
* Usage of repartition: RDDs are immutable, so the repartitioned data must be
* captured in a new RDD.
* @param sc the SparkContext
*/
def showUseOfRepartition(sc : SparkContext) = {
val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
println(data.partitions.length)
val data01 = data.repartition(5)
println(data01.partitions.length)
val data02 = data.repartition(3)
println(data02.partitions.length)
}
/**
* def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
*   coalesce(numPartitions, shuffle = true)
* }
* Usage of coalesce: changes the number of partitions. Note the special case shown
* above: repartition is simply coalesce with shuffle = true.
* @param spark the SparkSession, the entry point introduced in Spark 2.x
* @param sc the SparkContext
*/
def showUseOfCoalesce(spark : SparkSession, sc : SparkContext) = {
val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
data.cache()
val result01 = data.aggregate(0)(math.max(_, _), _ + _)
println(result01)
val data02 = data.coalesce(2)
val result02 = data02.aggregate(0)(math.max(_, _), _ + _)
println(result02)
val data03 = data.repartition(2)
val result03 = data03.aggregate(0)(math.max(_, _), _ + _ )
println(result03)
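// Illustrative sketch (added): with the default shuffle = false, coalesce cannot
// increase the number of partitions, so the partition count stays as it was.
val data04 = data.coalesce(100)
println(data04.partitions.length) // expected: same as data.partitions.length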
}
/**
* Usage of cartesian (Cartesian product).
* @param sc the SparkContext, the entry point for submitting the Spark job
*/
def showUseOfCartesian(sc : SparkContext) = {
val data01 = sc.parallelize(Array(1, 2, 3, 4), 1)
val data02 = sc.parallelize(Array(5, 6), 1)
val result = data01.cartesian(data02)
result.foreach(println)
val result02 = data02.cartesian(data01)
result02.foreach(println)
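// Note (added): the product contains |data01| * |data02| = 4 * 2 = 8 pairs, so
// cartesian becomes expensive very quickly on larger RDDs.
println(result.count()) // expected: 8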
}
/**
* Usage of cache (a special case of persist).
* @param spark the SparkSession, the entry point introduced in Spark 2.0
* @param sc the SparkContext
*/
def showUseOfCache(spark : SparkSession, sc : SparkContext) = {
import spark.implicits._
val data = sc.parallelize(List(1, 2 ,3, 4, 5, 6)).toDF()
data.cache()
val count = data.count() // the action materializes the cache
println("count = " + count)
data.show()
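// Note (added): release the cached data once it is no longer needed.
data.unpersist()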
}
/**
* Demonstrates how aggregate works.
* @param sc the SparkContext
*/
def showUseOfAggregate(sc : SparkContext) = {
val data01 = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8))
val result01 = data01.aggregate(0)(math.max(_, _), _ + _)
println(result01)
val data02 = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8), 1)
val result02 = data02.aggregate(0)(math.max(_, _), _ + _)
println(result02)
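// Note (added): aggregate applies the seqOp (math.max) within each partition and then
// the combOp (_ + _) across the per-partition results. result01 is therefore the sum
// of the per-partition maxima and depends on how the data is split, while result02,
// computed on a single partition, is simply max(0, 1..8) = 8.
println("default partition count: " + data01.partitions.length)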
}
}