累加器是 Spark 提供的一种共享变量机制。在 Spark 中,每一个 task 会被分配到不同的节点上执行;如果需要在执行过程中把多台节点上的数据累加到同一个变量中,可以通过累加器实现该功能。
这里只介绍 Spark 2 的累加器,包括:长整数累加器、双精度浮点数累加器、集合累加器和自定义累加器。
数值累加器主要用于数字的累加,且只支持增加操作;若需要减少,可以通过累加一个负数来实现。
import org.apache.spark.{SparkConf, SparkContext}
object TestAccess {

  /**
   * Demonstrates a driver-registered long accumulator: each executor task
   * adds into its own local copy and Spark merges the copies back on the
   * driver, so the final value is visible via `accumulator.value`.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestAccess")
    val sparkContext = new SparkContext(sparkConf)

    // Seven (name, age) pairs spread over three partitions.
    val visitors = sparkContext.parallelize(
      Array(
        ("Bob", 15),
        ("Thomas", 28),
        ("Tom", 18),
        ("Galen", 35),
        ("Catalina", 12),
        ("Karen", 9),
        ("Boris", 20)),
      3)

    val adultCounter = sparkContext.longAccumulator("testAcc")

    // foreach is an action: the accumulation runs exactly once per element.
    visitors.foreach { case (_, age) =>
      if (age >= 18) {
        adultCounter.add(1)
      }
    }

    println(adultCounter.value)
  }
}
结果
4
为什么需要使用到累加器,而不是变量直接var count=0 count+=1的形式
,因为 foreach 操作是在 Worker 端的 Executor 进程中执行的,在 Driver 进程中定义的 count 变量并不会被累加;真正被累加的是 Executor 进程中各自的 count 变量副本,其结果不会传回 Driver。
错误示范
import org.apache.spark.{SparkConf, SparkContext}
object TestAccess {

  /**
   * Anti-example: the accumulator update lives inside a transformation
   * (`map`), so every action that re-evaluates the RDD lineage runs the
   * update again — the second `count()` doubles the accumulator.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestAccess")
    val sparkContext = new SparkContext(sparkConf)

    // Seven (name, age) pairs spread over three partitions.
    val visitors = sparkContext.parallelize(
      Array(
        ("Bob", 15),
        ("Thomas", 28),
        ("Tom", 18),
        ("Galen", 35),
        ("Catalina", 12),
        ("Karen", 9),
        ("Boris", 20)),
      3)

    val adultCounter = sparkContext.longAccumulator("testAcc")

    // map is lazy: nothing is accumulated until an action runs.
    val mapped = visitors.map { t =>
      if (t._2 >= 18) {
        adultCounter.add(1)
      }
    }

    mapped.count()                 // first job: accumulates once -> 4
    println(adultCounter.value)
    mapped.count()                 // second job re-runs the map -> 8
    println(adultCounter.value)
  }
}
结果
4
8
第一次 count 触发了 job,累加器进行了累加;第二次 count 又触发了一次 job,map 被重新执行,累加器再次被累加,因此结果为 8。避免这种重复累加的方法是对 RDD 进行缓存。
import org.apache.spark.{SparkConf, SparkContext}
object TestAccess {

  /**
   * Fixed version of the anti-example: caching the mapped RDD means the
   * second `count()` reads the cached partitions instead of re-running the
   * `map`, so the accumulator is only incremented by the first job.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestAccess")
    val sparkContext = new SparkContext(sparkConf)

    // Seven (name, age) pairs spread over three partitions.
    val visitors = sparkContext.parallelize(
      Array(
        ("Bob", 15),
        ("Thomas", 28),
        ("Tom", 18),
        ("Galen", 35),
        ("Catalina", 12),
        ("Karen", 9),
        ("Boris", 20)),
      3)

    val adultCounter = sparkContext.longAccumulator("testAcc")

    val mapped = visitors.map { t =>
      if (t._2 >= 18) {
        adultCounter.add(1)
      }
    }

    // Cache before the first action so later actions reuse the results.
    mapped.cache()
    mapped.count()                 // first job: accumulates -> 4
    println(adultCounter.value)
    mapped.count()                 // served from cache: stays 4
    println(adultCounter.value)
  }
}
结果
4
4
import org.apache.spark.{SparkConf, SparkContext}
object TestAccList {

  /**
   * Demonstrates a collection accumulator: every User whose phone number
   * ends in three identical digits is gathered into a driver-side list.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestAccList")
    val sparkContext = new SparkContext(sparkConf)

    val users = Array(
      User("Alice", "15837312345"),
      User("Bob", "13937312666"),
      User("Thomas", "13637312345"),
      User("Tom", "18537312777"),
      User("Boris", "13837312998"))
    val userRDD = sparkContext.parallelize(users, 2)

    val matchedUsers = sparkContext.collectionAccumulator[User]("集合累加器")

    userRDD.foreach { user =>
      // The last three digits are equal exactly when the first three
      // characters of the reversed number are equal.
      val reversed = user.phone.reverse
      if (reversed(0) == reversed(1) && reversed(1) == reversed(2)) {
        matchedUsers.add(user)
      }
    }

    println(matchedUsers)
  }
}
// Immutable record carried through the RDD and the collection accumulator.
case class User(name: String, phone: String)
结果
CollectionAccumulator(id: 0, name: Some(集合累加器), value: [User(Bob,13937312666), User(Tom,18537312777)])
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2
/**
 * Drives the custom accumulator: counts males, females, users under 12,
 * and users aged 12-17 across the RDD's partitions.
 */
object UserDefineAcc {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("UserDefineAcc")
    val sparkContext = new SparkContext(sparkConf)

    val users = Array(
      User("Alice", "Female", 11),
      User("Bob", "Male", 16),
      User("Thomas", "Male", 14),
      User("Catalina", "Female", 20),
      User("Boris", "Third", 12),
      User("Karen", "Female", 9),
      User("Tom", "Male", 7))
    val userRDD = sparkContext.parallelize(users, 2)

    // A custom accumulator must be registered with the context before use.
    lazy val statsAccumulator = new UserAccumulate[User]
    sparkContext.register(statsAccumulator, "自定义用户累加器")

    userRDD.foreach(statsAccumulator.add)
    println(statsAccumulator)
  }
}
// Record accumulated by UserAccumulate; sex is a free-form string
// ("Male" / "Female" / other values are possible, e.g. "Third").
case class User(name: String, sex: String, age: Int)
/**
 * Custom AccumulatorV2 keeping four counters:
 *   index 0 - females, index 1 - everyone else ("Male", "Third", ...),
 *   index 2 - age under 12, index 3 - age 12 to 17 inclusive.
 *
 * @tparam T type of the accumulated elements; must actually be [[User]]
 *           (add() downcasts), the parameter only satisfies AccumulatorV2.
 */
class UserAccumulate[T] extends AccumulatorV2[T, Array[Int]] {

  // Zero state: four counters, all 0.
  private val _array: Array[Int] = Array(0, 0, 0, 0)

  /**
   * True while no element has been added (all counters are zero).
   *
   * BUG FIX: the previous version concatenated the counters into a string
   * and parsed it as a Long — that allocates on every call and overflows
   * once the digits exceed Long range. Checking each slot is safe and O(4).
   */
  override def isZero: Boolean = _array.forall(_ == 0)

  /**
   * Returns an independent accumulator holding the same counter values.
   */
  override def copy(): AccumulatorV2[T, Array[Int]] = {
    val newAcc = new UserAccumulate[T]
    _array.copyToArray(newAcc._array)
    newAcc
  }

  /**
   * Resets all four counters back to the initial (zero) state.
   */
  override def reset(): Unit = {
    for (i <- _array.indices) {
      _array(i) = 0
    }
  }

  /**
   * Classifies one record into the sex bucket and (if applicable) an age bucket.
   *
   * BUG FIX: the sex comparison was against the misspelling "Famale", so
   * every female record fell into the else branch and was counted as male.
   *
   * @param v the element to accumulate; must be a User
   */
  override def add(v: T): Unit = {
    val user = v.asInstanceOf[User]
    if (user.sex == "Female") {
      _array(0) += 1
    } else {
      _array(1) += 1
    }
    if (user.age < 12) {
      _array(2) += 1
    } else if (user.age < 18) {
      _array(3) += 1
    }
  }

  /**
   * Merges a partition-local accumulator into this one, slot by slot.
   *
   * @param other the accumulator from another partition
   */
  override def merge(other: AccumulatorV2[T, Array[Int]]): Unit = {
    val o = other.asInstanceOf[UserAccumulate[T]]
    for (i <- _array.indices) {
      _array(i) += o._array(i)
    }
  }

  /**
   * Current counter values as a defensive copy, so callers cannot mutate
   * the accumulator's internal state through the returned array.
   */
  override def value: Array[Int] = _array.clone()

  // BUG FIX: interpolating an Array directly prints its JVM identity
  // (e.g. "[I@1f2a3b"); mkString renders the actual counts.
  override def toString(): String =
    getClass.getSimpleName +
      s"(id : $id , name $name ,value ${value.mkString("[", ", ", "]")})"
}