Spark: A Custom wordCount Accumulator

The three distributed data types in Spark Core:

  • Accumulator (write-only for executors)
  • RDD
  • Broadcast variable (read-only for executors; see the sketch after this list)
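As a quick contrast with the write-only accumulator, here is a minimal sketch of the read-only broadcast variable. It builds a local SparkContext inline instead of using the article's ScUtil.getSc helper, and the object name BroadcastSketch is made up for illustration.

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical local context; the article obtains one via ScUtil.getSc
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("broadcastDemo"))

    // ship a small lookup table to every executor exactly once
    val dict = sc.broadcast(Map("a" -> 1, "b" -> 2))

    // executors may read dict.value but must not modify it
    val codes = sc.makeRDD(List("a", "b", "a"))
      .map(w => dict.value.getOrElse(w, 0))
      .collect()

    println(codes.mkString(", ")) // 1, 2, 1
    sc.stop()
  }
}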

Accumulator updates are only applied when an action runs. If the same lineage is triggered by several actions, each job re-executes the transformations and re-applies the updates, so the value can be accumulated repeatedly; the sketch below demonstrates the pitfall.
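A minimal sketch of that double-counting pitfall, again assuming a local SparkContext (the object name AccumulatorPitfall is made up). Updating an accumulator inside a transformation and then calling two actions re-runs the map and doubles the count; caching the RDD before the first action is the usual remedy.

import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorPitfall {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("accPitfall"))

    val acc = sc.longAccumulator("count")
    // the update lives in a transformation, so it is replayed per action
    val rdd = sc.makeRDD(1 to 5).map { i => acc.add(1L); i }
    // rdd.cache() // uncomment to keep acc.value at 5 across both actions

    rdd.collect() // first job: acc.value is 5
    rdd.collect() // second job re-runs the map: acc.value becomes 10

    println(acc.value) // prints 10, not 5
    sc.stop()
  }
}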

package com.shufang.acc

import com.shufang.utils.ScUtil
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}

import scala.collection.mutable

object AccumulatorDemo001 {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = ScUtil.getSc

    // Accumulator updates are triggered by actions; multiple actions may double-count
    /*
    val longAcc: LongAccumulator = sc.longAccumulator("longAcc")

    sc.makeRDD(1 to 5)
      .foreach { i =>
        longAcc.add(1L)
        println(i)
      }

    println(longAcc.value)
    */

    // create the custom accumulator and register it with the SparkContext
    val myAcc = new AccumulatorDemo001
    sc.register(myAcc, "udfAcc")

    val rdd: RDD[String] = sc.makeRDD(List("a", "a", "b", "a", "c"))

    rdd.foreach { word =>
      myAcc.add(word)
      println(word)
    }


    println(myAcc.value)
    sc.stop()
  }
}


/**
 * A custom accumulator that implements wordCount.
 */
class AccumulatorDemo001 extends AccumulatorV2[String,mutable.Map[String,Int]]{

  private var map: mutable.Map[String, Int] = mutable.Map[String, Int]()
  // the accumulator is at its zero (initial) state when the map is empty
  override def isZero: Boolean = {
    map.isEmpty
  }

  // create a copy that carries the current contents of this accumulator
  override def copy(): AccumulatorV2[String, mutable.Map[String, Int]] = {
    val newAcc = new AccumulatorDemo001
    newAcc.map = this.map.clone()
    newAcc
  }

  // reset the accumulator to its zero state
  override def reset(): Unit = {
    map.clear()
  }

  // add one occurrence of a word to the local map
  override def add(k: String): Unit = {
    val newVal: Int = map.getOrElse(k, 0) + 1
    map.update(k, newVal)
  }

  // on the driver, merge the maps collected from the executors
  override def merge(other: AccumulatorV2[String, mutable.Map[String, Int]]): Unit = {
    val map1 = this.map
    val map2 = other.value

    // fold the counts from map2 into map1
    map2.foreach { case (word, count) =>
      map1.update(word, map1.getOrElse(word, 0) + count)
    }
  }

  // the current value of the accumulator, returned to the driver
  override def value: mutable.Map[String, Int] = this.map
}
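For the sample input List("a", "a", "b", "a", "c"), the driver-side println(myAcc.value) should report counts equivalent to Map(a -> 3, b -> 1, c -> 1); the entry order of the mutable map may vary between runs.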
