package com.meng.nan.day717
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag
// Transformation operators
object TransformationClass1 {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.spark_project").setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("TransformationClass1")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Usage of groupByKey and the other operators; uncomment the demo to run
    //gbk(sc)
    //rdOps2(sc)
    //distinctOps(sc)
    //cbk2Ops(sc)
    //gbk2Ops(sc)
    //agbk2Ops(sc)
    agby2Ops(sc)
    sc.stop()
}
def gbk(sc: SparkContext): Unit = {
    val stu = List(
"1 刘天天 22 bd-bj",
"2 常龙 25 bd-bj",
"3 张熹 24 bd-sz",
"4 胡盼盼 18 bd-wh",
"5 周安动 18 bd-wh",
"4 张震 18 bd-sz"
)
    // Group the students by class
    val stuRDD: RDD[String] = sc.parallelize(stu)
    val class2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      // everything from "bd" onwards is the class; the prefix is the student info
      val cl = stuLine.substring(stuLine.indexOf("bd"))
      val info = stuLine.substring(0, stuLine.indexOf("bd")).trim
      (cl, info)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = class2Info.groupByKey()
    gbkRDD.foreach { case (cl, info) =>
      println(s"${cl}\t${info}")
    }
println("=============")
    // ClassTag is a runtime type marker; groupBy takes one implicitly, and it is
    // spelled out explicitly here for demonstration (the compiler would normally supply it)
    val gbRDD: RDD[(String, Iterable[String])] = stuRDD.groupBy(stLine => {
      stLine.substring(stLine.indexOf("bd"))
    })(ClassTag.Object.asInstanceOf[ClassTag[String]])
    gbRDD.foreach { case (cl, info) =>
      println(s"${cl}\t${info}")
    }
}
  /**
   * bd-wh CompactBuffer(4 胡盼盼 18, 5 周安动 18)
   * bd-sz CompactBuffer(3 张熹 24, 4 张震 18)
   * bd-bj CompactBuffer(1 刘天天 22, 2 常龙 25)
   * =============
   * bd-bj CompactBuffer(1 刘天天 22 bd-bj, 2 常龙 25 bd-bj)
   * bd-sz CompactBuffer(3 张熹 24 bd-sz, 4 张震 18 bd-sz)
   * bd-wh CompactBuffer(4 胡盼盼 18 bd-wh, 5 周安动 18 bd-wh)
   */
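  // Note on the two outputs above: RDD.groupBy(f) is essentially implemented as
  // map(t => (f(t), t)).groupByKey(), so groupBy keeps each whole line as the value,
  // while the hand-built pair RDD keeps only the info prefix. A minimal equivalent
  // of the groupBy call above:
  // stuRDD.map(line => (line.substring(line.indexOf("bd")), line)).groupByKey()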
def rdOps2(sc: SparkContext): Unit = {
    val list = List(
"yan xin xin",
"zhang zhao ping"
)
val listRDD: RDD[String] = sc.parallelize(list)
val wordsRDD = listRDD.flatMap(line => line.split("\\s+"))
val wordsMapRDD: RDD[(String, Int)] = wordsRDD.map((_,1))
val rbkRDD: RDD[(String, Int)] = wordsMapRDD.reduceByKey(_ + _)
rbkRDD.foreach(println)
/**
* (xin,2)
* (zhao,1)
* (zhang,1)
* (ping,1)
* (yan,1)
*/
}
  // distinct: deduplicate an RDD
  def distinctOps(sc: SparkContext): Unit = {
    val list = List(
"蔡金廷",
"蔡金廷",
"蔡金廷",
"蔡金廷"
)
val listRDD: RDD[String] = sc.parallelize(list)
    // The argument is the number of partitions to deduplicate with, i.e. how many tasks run
    listRDD.distinct(4).foreach(println)
}
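  // In the spirit of the simulations below, distinct itself can be sketched with
  // map + reduceByKey (Spark's own distinct works much like this). A minimal
  // illustration; the method name distinct2Ops is ours:
  def distinct2Ops(sc: SparkContext): Unit = {
    val listRDD: RDD[String] = sc.parallelize(List("a", "a", "b", "a"))
    listRDD
      .map((_, null))           // key every element
      .reduceByKey((x, _) => x) // collapse duplicate keys
      .map(_._1)                // drop the dummy value
      .foreach(println)
  }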
  // Custom combineByKey calls that simulate reduceByKey and groupByKey
  def cbk2Ops(sc: SparkContext): Unit = {
    val list = List(
"hello you hello me",
"hello you shit me",
"hello you oh shit",
"me you hello me"
)
val listRDD: RDD[String] = sc.parallelize(list)
val pairsRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_,1))
    // Classic reduceByKey
    pairsRDD.reduceByKey(_ + _).foreach(println)
    // Simulated with combineByKey, equivalent to:
    // pairsRDD.combineByKey(createCombiner, mergeValue, mergeCombiners).foreach(println)
    pairsRDD.combineByKey(
      (num: Int) => num,                     // createCombiner: first value seen for a key
      (sum: Int, num: Int) => sum + num,     // mergeValue: fold values within a partition
      (sum1: Int, sum2: Int) => sum1 + sum2  // mergeCombiners: merge partial sums across partitions
    ).foreach(println)                       // combineByKey is lazy; without an action it would never run
    sc.stop()
}
  // createCombiner: initializes the combined result type from the first value for a key
  def createCombiner(value: Int): Int = {
    value
  }
  // mergeCombiners: aggregation across partitions
  def mergeCombiners(sum1: Int, sum2: Int): Int = {
    sum1 + sum2
  }
  // mergeValue: aggregation within a partition
  def mergeValue(sum: Int, value: Int): Int = {
    sum + value
  }
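  // The helpers above keep the combiner type equal to the value type (Int), which is
  // all reduceByKey can express. combineByKey is more general: the combiner may be a
  // different type entirely. A minimal sketch (method and value names are ours) that
  // computes a per-key average via (sum, count) pairs:
  def avgByKeyOps(sc: SparkContext): Unit = {
    val scores: RDD[(String, Int)] = sc.parallelize(List(("a", 1), ("a", 3), ("b", 4)))
    scores.combineByKey(
      (v: Int) => (v, 1),                                           // first value -> (sum, count)
      (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),        // fold a value within a partition
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // merge partial pairs across partitions
    ).mapValues { case (sum, count) => sum.toDouble / count }
      .foreach(println)
  }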
  // Simulate groupByKey with combineByKey
  def gbk2Ops(sc: SparkContext): Unit = {
    val stu = List(
"1 李飞 22 bd-1901-bj",
"2 常龙 25 bd-1901-bj",
"8 汪锦奇 24 bd-1901-sz",
"3 张熹 24 bd-1901-sz",
"4 胡盼盼 18 bd-1901-wh",
"6 陈抒 25 bd-1901-bj",
"7 曹煜 25 bd-1901-bj",
"5 周联动 18 bd-1901-wh",
"4 张震 18 bd-1901-sz"
)
val stuRDD: RDD[String] = sc.parallelize(stu)
    val class2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      val clazz = stuLine.substring(stuLine.indexOf("bd"))
      (clazz, stuLine)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = class2Info.groupByKey()
// gbkRDD.foreach { case (cl, info) => {
// println(s"${cl}\t${info}")
// }
// }
    // Simulated with combineByKey. The partitioner passed here does not partition the
    // input; it determines how the results are partitioned once combineByKey has computed them
    class2Info.combineByKey(
      (info: String) => createCombiner1(info),
      (buffer: ArrayBuffer[String], info: String) => mergeValue1(buffer, info),
      (buffer1: ArrayBuffer[String], buffer2: ArrayBuffer[String]) => mergeCombiners1(buffer1, buffer2),
      new MyPartitioner(3)
    ).saveAsTextFile("file:///D:/a")
sc.stop()
}
  /*
   * createCombiner1: initialization; fixes the type of the combined result.
   * Called once per key per partition, seeded with the first element seen for that key.
   */
  def createCombiner1(str: String): ArrayBuffer[String] = {
    val ab = ArrayBuffer[String]()
    ab.append(str)
    ab
  }
  /**
   * mergeValue1: aggregation for the same key within a partition
   */
  def mergeValue1(strings: ArrayBuffer[String], str: String): ArrayBuffer[String] = {
    strings.append(str)
    strings
  }
  /**
   * mergeCombiners1: aggregation for the same key across partitions
   */
def mergeCombiners1(strings: ArrayBuffer[String], strings1: ArrayBuffer[String]): ArrayBuffer[String] = {
strings ++ strings1
}
  // Simulate groupByKey with aggregateByKey
  def agbk2Ops(sc: SparkContext): Unit = {
    val stu = List(
"1 李飞 22 bd-1901-bj",
"2 常龙 25 bd-1901-bj",
"8 汪锦奇 24 bd-1901-sz",
"3 张熹 24 bd-1901-sz",
"4 胡盼盼 18 bd-1901-wh",
"6 陈抒 25 bd-1901-bj",
"7 曹煜 25 bd-1901-bj",
"5 周联动 18 bd-1901-wh",
"4 张震 18 bd-1901-sz"
)
val stuRDD: RDD[String] = sc.parallelize(stu)
    val clazz2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      val clazz = stuLine.substring(stuLine.indexOf("bd"))
      (clazz, stuLine)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = clazz2Info.groupByKey()
    gbkRDD.foreach { case (clazz, infos) =>
      println(s"${clazz} ---> ${infos}")
    }
println("======================")
    val abk = clazz2Info.aggregateByKey(ArrayBuffer[String]())(seqOP, combOp)
abk.foreach(println)
}
  // seqOP: fold one record into the per-partition buffer
  def seqOP(ab: ArrayBuffer[String], info: String): ArrayBuffer[String] = {
    ab.append(info)
    ab
  }
  // combOp: merge per-partition buffers for the same key
  def combOp(ab1: ArrayBuffer[String], ab2: ArrayBuffer[String]): ArrayBuffer[String] = {
    ab1 ++ ab2
  }
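  // Under the hood, aggregateByKey(zero)(seqOp, combOp) is a combineByKey whose
  // createCombiner folds the first value into a copy of the zero value. A hand-rolled
  // equivalent of the abk computation above (reusing the helpers from this file):
  // clazz2Info.combineByKey(
  //   (info: String) => seqOP(ArrayBuffer[String](), info),
  //   seqOP,
  //   combOp
  // )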
  // Simulate reduceByKey with aggregateByKey
  def agby2Ops(sc: SparkContext): Unit = {
    val list = List(
"hello you hello me",
"hello you shit me",
"hello you oh shit",
"me you hello me"
)
    val listRDD: RDD[String] = sc.parallelize(list)
    val pairsRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1))
    pairsRDD.reduceByKey(_ + _).foreach(println)
println("=============")
    // Simulated implementation: the zero value 0 seeds each key's running sum
    pairsRDD.aggregateByKey(0)(seqOp1, combOp1).foreach(println)
}
  // seqOp1: add one count to the running sum within a partition
  def seqOp1(sum: Int, a: Int): Int = {
    sum + a
  }
  // combOp1: add up the partial sums across partitions
  def combOp1(sum1: Int, sum2: Int): Int = {
    sum1 + sum2
  }
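  // The custom partitioner below is wired into combineByKey in gbk2Ops, but any
  // shuffle operator accepts one. A minimal sketch (method name and sample data are
  // ours) that routes (class, info) pairs to city-specific partitions via partitionBy:
  def partitionByOps(sc: SparkContext): Unit = {
    val pairs: RDD[(String, String)] = sc.parallelize(List(
      ("bd-1901-bj", "1 李飞 22"),
      ("bd-1901-wh", "4 胡盼盼 18"),
      ("bd-1901-sz", "3 张熹 24")
    ))
    pairs.partitionBy(new MyPartitioner(3))
      .foreachPartition(it => println(it.mkString(", ")))
  }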
}
// Custom partitioner: routes keys to partitions by the city suffix of the class string
class MyPartitioner(partitions: Int) extends Partitioner {
  override def numPartitions: Int = partitions
  override def getPartition(key: Any): Int = {
    val ret = key match {
      case clazz: String =>
        val city = clazz.substring(clazz.lastIndexOf("-") + 1)
        city match {
          case "wh" => 0
          case "bj" => 1
          case "sz" => 2
          case _ => 0 // unknown cities fall back to partition 0 instead of throwing a MatchError
        }
      case _ => 0
    }
    ret
  }
}
/**
 * For word counts, reduceByKey is preferable to groupByKey: reduceByKey combines
 * values on the map side before the shuffle, while groupByKey ships every pair.
 *
 * val wordCountsWithReduce = wordPairsRDD
 *   .reduceByKey(_ + _)
 *   .collect()
 *
 * val wordCountsWithGroup = wordPairsRDD
 *   .groupByKey()
 *   .map(t => (t._1, t._2.sum))
 *   .collect()
 */