Basic usage of Spark's groupByKey and reduceByKey operators

package com.meng.nan.day717

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

// Transformation operators
object TransformationClass1 {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project").setLevel(Level.WARN)

    // Using groupByKey
    val conf = new SparkConf()
      .setAppName("ActionClass1")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // gbk(sc)
    // rdOps2(sc)
    // distinctOps(sc)
    // cbk2Ops(sc)
    // gbk2Ops(sc)
    // Agbk2Ops(sc)
    agby2Ops(sc)
  }

  def gbk(sc: SparkContext): Unit = {
    val stu = List(
      "1 刘天天 22 bd-bj",
      "2 常龙 25 bd-bj",
      "3 张熹 24 bd-sz",
      "4 胡盼盼 18 bd-wh",
      "5 周安动 18 bd-wh",
      "4 张震 18 bd-sz"
    )
    // Group the students by class
    val stuRDD: RDD[String] = sc.parallelize(stu)
    val class2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      val cl = stuLine.substring(stuLine.indexOf("bd"))
      val info = stuLine.substring(0, stuLine.indexOf("bd")).trim
      (cl, info)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = class2Info.groupByKey()
    gbkRDD.foreach { case (cl, info) =>
      println(s"${cl}\t${info}")
    }

    println("=============")
    // ClassTag is the type-tag marker interface required by groupBy
    val gbRDD: RDD[(String, Iterable[String])] = stuRDD.groupBy(stLine => {
      stLine.substring(stLine.indexOf("bd"))
    })(
      ClassTag.Object.asInstanceOf[ClassTag[String]]
    )
    gbRDD.foreach { case (cl, info) =>
      println(s"${cl}\t${info}")
    }
  }

  /**
    * Expected output of gbk:
    *
    * bd-wh  CompactBuffer(4 胡盼盼 18, 5 周安动 18)
    * bd-sz  CompactBuffer(3 张熹 24, 4 张震 18)
    * bd-bj  CompactBuffer(1 刘天天 22, 2 常龙 25)
    * =============
    * bd-bj  CompactBuffer(1 刘天天 22 bd-bj, 2 常龙 25 bd-bj)
    * bd-sz  CompactBuffer(3 张熹 24 bd-sz, 4 张震 18 bd-sz)
    * bd-wh  CompactBuffer(4 胡盼盼 18 bd-wh, 5 周安动 18 bd-wh)
    */
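
  // Side note: groupBy obtains its ClassTag implicitly, so the explicit
  // ClassTag.Object.asInstanceOf[ClassTag[String]] cast in gbk above is not required; a plain
  //   stuRDD.groupBy(line => line.substring(line.indexOf("bd")))
  // yields the same RDD[(String, Iterable[String])].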

  def rdOps2(sc: SparkContext): Unit = {
    val list = List(
      "yan xin xin",
      "zhang zhao ping"
    )
    val listRDD: RDD[String] = sc.parallelize(list)
    val wordsRDD = listRDD.flatMap(line => line.split("\\s+"))
    val wordsMapRDD: RDD[(String, Int)] = wordsRDD.map((_, 1))
    val rbkRDD: RDD[(String, Int)] = wordsMapRDD.reduceByKey(_ + _)
    rbkRDD.foreach(println)
    /**
      * (xin,2)
      * (zhao,1)
      * (zhang,1)
      * (ping,1)
      * (yan,1)
      */
  }

  // distinct: deduplication
  def distinctOps(sc: SparkContext): Unit = {
    val list = List(
      "蔡金廷",
      "蔡金廷",
      "蔡金廷",
      "蔡金廷"
    )
    val listRDD: RDD[String] = sc.parallelize(list)
    // Specify how many partitions to use for the deduplication, i.e. how many tasks to run
    listRDD.distinct(4).foreach(println)
  }

  // Simulate reduceByKey and groupByKey with a hand-rolled combineByKey
  def cbk2Ops(sc: SparkContext): Unit = {
    val list = List(
      "hello you hello me",
      "hello you shit me",
      "hello you oh shit",
      "me you hello me"
    )
    val listRDD: RDD[String] = sc.parallelize(list)
    val pairsRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1))
    // The conventional reduceByKey
    pairsRDD.reduceByKey(_ + _).foreach(println)
    // Simulated implementation
    // pairsRDD.combineByKey(createCombiner, mergeValue, mergeCombiners).foreach(println)
    pairsRDD.combineByKey((num: Int) => num, (sum1: Int, num: Int) => sum1 + num, (sum: Int, sum1: Int) => sum + sum1)
      .foreach(println)
    sc.stop()
  }

  // Initialization: sets the type of the aggregated result
  def createCombiner(value: Int): Int = {
    value
  }

  // Aggregation across partitions
  def mergeCombiners(sum1: Int, sum2: Int): Int = {
    sum1 + sum2
  }

  // Aggregation within a partition
  def mergeValue(sum: Int, value: Int): Int = {
    sum + value
  }
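
  // A minimal sketch (illustrative helper, not called from main): the three named helpers
  // above can be passed to combineByKey directly, which is equivalent to the inline
  // lambdas used in cbk2Ops.
  def cbkWithNamedFunctions(pairsRDD: RDD[(String, Int)]): Unit = {
    pairsRDD.combineByKey(createCombiner _, mergeValue _, mergeCombiners _).foreach(println)
  }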

  // Simulate groupByKey with combineByKey
  def gbk2Ops(sc: SparkContext): Unit = {
    val stu = List(
      "1 李飞 22 bd-1901-bj",
      "2 常龙 25 bd-1901-bj",
      "8 汪锦奇 24 bd-1901-sz",
      "3 张熹 24 bd-1901-sz",
      "4 胡盼盼 18 bd-1901-wh",
      "6 陈抒 25 bd-1901-bj",
      "7 曹煜 25 bd-1901-bj",
      "5 周联动 18 bd-1901-wh",
      "4 张震 18 bd-1901-sz"
    )
    val stuRDD: RDD[String] = sc.parallelize(stu)
    val class2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      val id = stuLine.substring(stuLine.indexOf("bd"))
      (id, stuLine)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = class2Info.groupByKey()
    // gbkRDD.foreach { case (cl, info) =>
    //   println(s"${cl}\t${info}")
    // }

    // Simulated implementation
    class2Info.combineByKey(
      (info: String) => createCombiner1(info),
      (buffer: ArrayBuffer[String], info: String) => mergeValue1(buffer, info),
      (buffer1: ArrayBuffer[String], buffer2: ArrayBuffer[String]) => mergeCombiners1(buffer1, buffer2),
      // The partitioner here is not applied to the data flowing into combineByKey; the result
      // is partitioned after combineByKey finishes computing (see the verification sketch
      // after the helper functions below).
      new MyParationer(3)
    ).saveAsTextFile("file:///D:/a")
    sc.stop()
  }

  /*
   * Initialization: determines the result type of the aggregation.
   * For each distinct key within a partition it is called once, using one of that
   * key's elements to initialize the accumulator.
   */
  def createCombiner1(str: String): ArrayBuffer[String] = {
    val ab = ArrayBuffer[String]()
    ab.append(str)
    ab
  }

  /**
    * Aggregation of values with the same key within a partition
    */
  def mergeValue1(strings: ArrayBuffer[String], str: String): ArrayBuffer[String] = {
    strings.append(str)
    strings
  }

  /**
    * Aggregation of accumulators with the same key across partitions
    */
  def mergeCombiners1(strings: ArrayBuffer[String], strings1: ArrayBuffer[String]): ArrayBuffer[String] = {
    strings ++ strings1
  }
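
  // A small verification sketch (illustrative helper, not called from main): dump the partition
  // index next to each key with mapPartitionsWithIndex to confirm where MyParationer places
  // each class in the output of combineByKey.
  def showPartitions(rdd: RDD[(String, ArrayBuffer[String])]): Unit = {
    rdd.mapPartitionsWithIndex((index, iter) =>
      iter.map { case (clazz, infos) => s"partition ${index}\t${clazz}\t${infos.size} students" }
    ).foreach(println)
  }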

  // Simulate groupByKey with aggregateByKey
  def Agbk2Ops(sc: SparkContext): Unit = {
    val stu = List(
      "1 李飞 22 bd-1901-bj",
      "2 常龙 25 bd-1901-bj",
      "8 汪锦奇 24 bd-1901-sz",
      "3 张熹 24 bd-1901-sz",
      "4 胡盼盼 18 bd-1901-wh",
      "6 陈抒 25 bd-1901-bj",
      "7 曹煜 25 bd-1901-bj",
      "5 周联动 18 bd-1901-wh",
      "4 张震 18 bd-1901-sz"
    )
    val stuRDD: RDD[String] = sc.parallelize(stu)
    val clazz2Info: RDD[(String, String)] = stuRDD.map(stuLine => {
      val clazz = stuLine.substring(stuLine.indexOf("bd"))
      (clazz, stuLine)
    })
    val gbkRDD: RDD[(String, Iterable[String])] = clazz2Info.groupByKey()
    gbkRDD.foreach { case (clazz, infos) =>
      println(s"${clazz} ---> ${infos}")
    }

    println("======================")
    val abk = clazz2Info.aggregateByKey(ArrayBuffer[String]())(seqOP, combOp)
    abk.foreach(println)
  }

  // Aggregation within a partition
  def seqOP(ab: ArrayBuffer[String], info: String): ArrayBuffer[String] = {
    ab.append(info)
    ab
  }

  // Aggregation across partitions
  def combOp(ab1: ArrayBuffer[String], ab2: ArrayBuffer[String]): ArrayBuffer[String] = {
    ab1 ++ ab2
  }

  // Simulate reduceByKey with aggregateByKey
  def agby2Ops(sc: SparkContext): Unit = {
    val list = List(
      "hello you hello me",
      "hello you shit me",
      "hello you oh shit",
      "me you hello me"
    )
    val listRDD: RDD[String] = sc.parallelize(list)
    val pairsRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1))
    pairsRDD.reduceByKey(_ + _).foreach(println)
    println("=============")
    // Simulated implementation
    pairsRDD.aggregateByKey(0)(seqOp1, combOp1).foreach(println)
  }

  // Aggregation within a partition
  def seqOp1(sum: Int, a: Int): Int = {
    sum + a
  }

  // Aggregation across partitions
  def combOp1(sum1: Int, sum2: Int): Int = {
    sum1 + sum2
  }

}

// Custom partitioner
class MyParationer(partitions: Int) extends Partitioner {

  override def numPartitions: Int = partitions

  override def getPartition(key: Any): Int = {
    val ret = key match {
      case clazz: String =>
        val city = clazz.substring(clazz.lastIndexOf("-") + 1)
        city match {
          case "wh" => 0
          case "bj" => 1
          case "sz" => 2
          case _ => 0
        }
      case _ => 0
    }
    ret
  }
}

/**
  * val wordCountsWithReduce = wordPairsRDD
  *   .reduceByKey(_ + _)
  *   .collect()
  *
  * val wordCountsWithGroup = wordPairsRDD
  *   .groupByKey()
  *   .map(t => (t._1, t._2.sum))
  *   .collect()
  */
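
A closing note on the two word-count variants in the comment above: both compute the same counts, but reduceByKey combines values on each partition before the shuffle, while groupByKey ships every (word, 1) pair across the network, so reduceByKey is generally preferred on larger data sets. Below is a minimal, self-contained sketch (the object name and sample data are illustrative, not part of the original listing) that runs both variants and checks that they agree:

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative sketch: run both word-count variants on the same data and compare the results.
object ReduceVsGroupSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ReduceVsGroupSketch").setMaster("local[*]"))
    val wordPairsRDD = sc.parallelize(List("hello you hello me", "me you hello me"))
      .flatMap(_.split("\\s+"))
      .map((_, 1))
    val wordCountsWithReduce = wordPairsRDD.reduceByKey(_ + _).collect()
    val wordCountsWithGroup = wordPairsRDD.groupByKey().map(t => (t._1, t._2.sum)).collect()
    // Same counts either way; the difference is in how much data gets shuffled.
    assert(wordCountsWithReduce.toMap == wordCountsWithGroup.toMap)
    sc.stop()
  }
}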
