Dividing Spark Code between the Driver and the Executors

1. Distinguishing Driver-side and Executor-side code
package sparkStream

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DriverAndExecutorCode {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    val inputDstream: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88", 6666)

    //TODO Code 1 --> runs on the Driver, once
    inputDstream.transform {
      rdd => {
        //TODO Code 2 --> runs on the Driver (executed once per batch interval; e.g. a blacklist can be re-read from Redis here)
        rdd.map {
          case line => {
            //TODO Code 3 --> runs on the Executors (every Executor runs this code for its own partitions)
          }
        }
      }
    }

    inputDstream.map {
      case line => {
        //TODO Code 4 --> runs on the Executors (every Executor runs this code for its own partitions)
      }
    }

    /**
      * Where to create the database connection:
      * 1. By the closure rule, Driver-side objects must be serialized to be shipped
      *    over the network to the Executors; a connection cannot be serialized,
      *    so it cannot be created on the Driver.
      * 2. Creating it inside foreach opens one connection per record, which wastes resources.
      * 3. Use foreachPartition instead: the functions inside RDD operators run on the
      *    Executors, so creating one connection per partition is the better choice.
      **/
    inputDstream.foreachRDD {
      rdd => {
        //TODO runs on the Driver; a database connection cannot be serialized, so do not create it here
        rdd.foreachPartition {
          iter => {
            //TODO create the database connection here (Redis, MySQL, etc.), once per partition
            iter.foreach {
              x => {
                //TODO write the record to the database
              }
            }
            //TODO close the connection
          }
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
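
To make the pattern above concrete, here is a minimal sketch that fills in the TODOs with a JDBC (MySQL) connection. It assumes the MySQL JDBC driver is on the classpath; the URL, credentials, table and column names are made-up placeholders, not part of the original example.

import java.sql.DriverManager

inputDstream.foreachRDD { rdd =>
  rdd.foreachPartition { iter =>
    // one connection per partition, created on the Executor
    // (URL, credentials and table below are made-up placeholders)
    val conn = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/test?useSSL=false", "root", "123456")
    val stmt = conn.prepareStatement("INSERT INTO word_log(line) VALUES (?)")
    try {
      iter.foreach { line =>
        stmt.setString(1, line)
        stmt.executeUpdate()
      }
    } finally {
      stmt.close()
      conn.close() // release the connection once the partition is done
    }
  }
}

Opening the connection inside foreachPartition keeps it off the Driver (no serialization problem) and avoids opening one connection per record.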
2. Using Kryo serialization
package sparkCore

import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object KryoDemo {
  def main(args: Array[String]): Unit = {
    //        val spark = SparkSession
    //          .builder()
    //          .appName(this.getClass.getName)
    //          .master("local[2]")
    //          .config("spark.serializer",classOf[JavaSerializer].getName) //注册Kryo序列化器,可以不写,源码已默认设置
    //          .config("spark.kryo.classesToRegister","Fiction") //指定需要序列化的类,多个逗号分隔
    //          .getOrCreate()
    //
    //        val sc = spark.sparkContext

    val conf = new SparkConf()
      .setAppName(this.getClass.getName)
      .setMaster("local[2]")
      .set("spark.serializer", classOf[KryoSerializer].getName) 注册Kryo序列化器,可以不写,源码已默认设置
      .registerKryoClasses(Array(classOf[Author])) 指定需要序列化的类

    val sc = new SparkContext(conf)

    val lineRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\booksInfo")

    val author = new Author

    val fictionRDD = lineRDD.map {
      line =>
        val arr = line.split(" ")
        // closure: each task on the Executors deserializes its own copy of `author`;
        // with a broadcast variable, all tasks in one Executor would share a single instance (see the sketch after this listing)
        val info: mutable.Map[String, Int] = author.getAuthorInfo // closure: the captured outer variable must be serializable
        val age = info(arr(1))
        (arr(0),arr(1),age)
    }

    println(fictionRDD.collect.toBuffer)

  }
}

case class Fiction (name: String, author: String){

  override def toString: String = s"title: $name | author: $author"
}

class Author extends Serializable {
  import scala.collection.mutable
  private val authorInfo: mutable.Map[String,Int] = mutable.Map(("金庸",999),("古龙",888))
  def alterAuthorInfo(name: String, age: Int): Unit = {
    if (authorInfo.contains(name)) {
      authorInfo(name) = age
    } else {
      authorInfo += name -> age
    }
  }

  def getAuthorInfo: mutable.Map[String, Int] = authorInfo
}
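
As the comment in the map above notes, the closure ships a separate copy of `author` to every task, while a broadcast variable is sent to each Executor once and shared by all of its tasks. A minimal sketch of that change, assuming it is dropped into the same main method as above (variable names are illustrative):

val authorBC = sc.broadcast(new Author)

val fictionRDD2 = lineRDD.map { line =>
  val arr = line.split(" ")
  // all tasks on one Executor read the same broadcast copy
  val info = authorBC.value.getAuthorInfo
  (arr(0), arr(1), info(arr(1)))
}
println(fictionRDD2.collect.toBuffer)

For an input line such as "神雕侠侣 金庸" (a made-up sample: book name and author separated by a space), both the closure and the broadcast version produce ("神雕侠侣", "金庸", 999).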
