Integrating Structured Streaming with Kafka

Maven Dependency

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
    <version>3.3.1</version>
</dependency>
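
If the project is built with sbt rather than Maven, the equivalent coordinate would look like the line below. This is only a sketch and assumes scalaVersion in build.sbt is set to a 2.12.x release, so that %% resolves to the _2.12 artifact:

// build.sbt (sketch): %% appends the Scala binary version (_2.12) automatically
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "3.3.1"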

Real-Time Data ETL

[Figure 1: Real-time data ETL pipeline]

Preparing the Data

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

import scala.util.Random

/**
 * Simulates base-station call log data and continuously sends it to a Kafka topic.
 * Record fields:
 * station ID, caller number, callee number, call status, call time, call duration
 */
object MockStationLog {
  def main(args: Array[String]): Unit = {
    // Producer configuration for the Kafka topic
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val random = new Random()
    val allStatus = Array(
      "fail", "busy", "barring", "success", "success", "success",
      "success", "success", "success", "success", "success", "success"
    )

    while (true) {
      val callOut: String = "1860000%04d".format(random.nextInt(10000))
      val callIn: String = "1890000%04d".format(random.nextInt(10000))
      val callStatus: String = allStatus(random.nextInt(allStatus.length))
      val callDuration = if ("success".equals(callStatus)) (1 + random.nextInt(10)) * 1000L else 0L

      // Build one random station log record
      val stationLog: StationLog = StationLog(
        "station_" + random.nextInt(10),
        callOut,
        callIn,
        callStatus,
        System.currentTimeMillis(),
        callDuration
      )
      println(stationLog.toString)
      Thread.sleep(100 + random.nextInt(100))

      val record = new ProducerRecord[String, String]("stationTopic", stationLog.toString)
      producer.send(record)
    }

    producer.close() // close the connection (unreachable here, since the loop above never exits)
  }

  /**
   * Base-station call log record
   */
  case class StationLog(
                         stationId: String, // station ID
                         callOut: String, // caller number
                         callIn: String, // callee number
                         callStatus: String, // call status
                         callTime: Long, // call time
                         duration: Long // call duration
                       ) {
    override def toString: String = {
      s"$stationId,$callOut,$callIn,$callStatus,$callTime,$duration"
    }
  }

}

Preparing the Topics

# List topics
/export/server/kafka/bin/kafka-topics.sh --list --zookeeper hadoop102:2181
# Delete topics
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper hadoop102:2181 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper hadoop102:2181 --topic etlTopic

# Create topics
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper hadoop102:2181 --replication-factor 1 --partitions 3 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper hadoop102:2181 --replication-factor 1 --partitions 3 --topic etlTopic

# Console producer (for manual testing)
/export/server/kafka/bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic stationTopic
/export/server/kafka/bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic etlTopic

# Console consumer (for manual testing)
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic stationTopic --from-beginning
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic etlTopic --from-beginning

Code Implementation

Data flow: stationTopic -> Structured Streaming -> etlTopic

import org.apache.spark.sql.SparkSession

object ETLTest {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("ETLName").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // Load the stream from Kafka
    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("subscribe", "stationTopic")
      .load()
    val ds = df.selectExpr("CAST(value AS STRING)").as[String]

    // ETL: keep only records of successful calls
    val result = ds.filter(_.contains("success"))

    // Write the result back to Kafka
    result.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("topic", "etlTopic")
      .option("checkpointLocation", "./ckp")
      // start the query and wait for termination
      .start()
      .awaitTermination()

    spark.stop()
  }

}
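
To spot-check what actually landed in etlTopic without starting another streaming query, the Kafka source also supports batch reads. The following is only a minimal sketch: it assumes the same broker address and an existing SparkSession (the variable name etlCheck is introduced here purely for illustration).

// Sketch: batch read of the ETL output topic from the earliest offset
val etlCheck = spark.read
  .format("kafka")
  .option("kafka.bootstrap.servers", "hadoop102:9092")
  .option("subscribe", "etlTopic")
  .option("startingOffsets", "earliest")
  .load()
  .selectExpr("CAST(value AS STRING)")
etlCheck.show(10, truncate = false)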

Real-Time Analysis of Simulated IoT Device Data

[Figure 2: Real-time IoT device data analysis pipeline]

Preparing the Data

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.json4s.native.Json

import scala.util.Random

object MockIotDatas {
  def main(args: Array[String]): Unit = {
    // Producer configuration for the Kafka topic
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val deviceTypes = Array(
      "db", "bigdata", "kafka", "route", "bigdata", "db", "bigdata", "bigdata", "bigdata"
    )

    val random: Random = new Random()
    while (true) {
      val index: Int = random.nextInt(deviceTypes.length)
      val deviceId: String = s"device_${(index + 1) * 10 + random.nextInt(index + 1)}"
      val deviceType: String = deviceTypes(index)
      val deviceSignal: Int = 10 + random.nextInt(90)
      // Build one simulated device record
      val deviceData = DeviceData(deviceId, deviceType, deviceSignal, System.currentTimeMillis())
      // Serialize it to a JSON string
      val deviceJson: String = new Json(org.json4s.DefaultFormats).write(deviceData)
      println(deviceJson)
      Thread.sleep(100 + random.nextInt(500))

      val record = new ProducerRecord[String, String]("iotTopic", deviceJson)
      producer.send(record)
    }

    // Close the connection (unreachable here, since the loop above never exits)
    producer.close()
  }

  /**
   * Status data sent by an IoT device
   */
  case class DeviceData(
                         device: String, // device ID
                         deviceType: String, // device type, e.g. a server (mysql, redis, kafka) or a router (route)
                         signal: Double, // device signal strength
                         time: Long // time the data was sent
                       )

}

Code Implementation

Data flow: iotTopic -> Structured Streaming -> console

import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SS_Kf {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext

    sc.setLogLevel("WARN")
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Load the stream from Kafka
    val kafkaDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("subscribe", "iotTopic")
      .load()
    val valueDS: Dataset[String] = kafkaDF.selectExpr("CAST(value AS STRING)").as[String]

    // The data is in JSON format and needs to be parsed, e.g.:
    // {"device":"device_30","deviceType":"kafka","signal":77.0,"time":1610158709534}

    // Process the data
    val schemaDF: DataFrame = valueDS.filter(StringUtils.isNotBlank(_))
      .select(
        get_json_object($"value", "$.device").as("device_id"),
        get_json_object($"value", "$.deviceType").as("deviceType"),
        get_json_object($"value", "$.signal").cast(DoubleType).as("signal")
      )
    // SQL style
    schemaDF.createOrReplaceTempView("t_iot")
    val sql: String =
      """
        |select deviceType, count(*) as counts, avg(signal) as avgsignal
        |from t_iot
        |where signal > 30
        |group by deviceType
        |""".stripMargin
    val result1: DataFrame = spark.sql(sql)

    // DSL style
    val result2: DataFrame = schemaDF.filter('signal > 30)
      .groupBy('deviceType)
      .agg(
        count('device_id) as "counts",
        avg('signal) as "avgsignal"
      )

    // TODO 3. Output the results to the console
    result1.writeStream
      .format("console")
      .outputMode("complete")
      .start()

    // TODO 4. Start the second query and wait for termination
    result2.writeStream
      .format("console")
      .outputMode("complete")
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
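
get_json_object re-parses the JSON string for every extracted field. An alternative, shown here only as a sketch and not used in the original code, is to declare an explicit schema and parse each record once with from_json (the names iotSchema and parsedDF are introduced for illustration; valueDS is the Dataset defined above):

// Sketch: parse the whole JSON value in one pass with an explicit schema
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructType}

val iotSchema = new StructType()
  .add("device", StringType)
  .add("deviceType", StringType)
  .add("signal", DoubleType)
  .add("time", LongType)

val parsedDF = valueDS
  .select(from_json($"value", iotSchema).as("data"))
  .select($"data.device".as("device_id"), $"data.deviceType", $"data.signal")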

Window Computation

Compute the last 10 seconds of data every 5 seconds.

Data may arrive late; for example, data generated at 1:20 may not reach the system until 1:30.

Windowing on event time, combined with a watermark, handles late-arriving data: the watermark trails the maximum event time seen so far by a fixed threshold, windows older than the watermark are finalized and their state dropped, and records arriving later than that are discarded.

import spark.implicits._

val words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String }

// Group the data by window and word and compute the count of each group
val windowedCounts = words
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        window($"timestamp", "10 minutes", "5 minutes"),
        $"word")
    .count()
A complete, runnable example that reads timestamped words from a socket source:

import java.sql.Timestamp

import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.{DataFrame, SparkSession}

object WindowTest {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import org.apache.spark.sql.functions._
    import spark.implicits._

    // Load the data
    val socketDF: DataFrame = spark.readStream
      .format("socket")
      .option("host", "hadoop102")
      .option("port", 9999)
      .load()

    // Process the data
    val wordDF = socketDF
      .as[String]
      .filter(StringUtils.isNotBlank(_))
      // Split each line, e.g.: 2019-10-12 09:00:02,cat
      .map(line => {
        val arr = line.trim.split(",")
        val timestampStr: String = arr(0)
        val word: String = arr(1)
        (Timestamp.valueOf(timestampStr), word)
      })
      // Name the columns
      .toDF("timestamp", "word")

    // Compute the last 10 seconds of data every 5 seconds, with a 10-second watermark
    val resultDF = wordDF
      // withWatermark(event-time column, lateness threshold)
      .withWatermark("timestamp", "10 seconds")
      .groupBy(
        // Event-time window aggregation: WordCount
        // window(event-time column, window length, slide interval)
        window($"timestamp", "10 seconds", "5 seconds"),
        $"word")
      .count()

    // TODO 3. Output the results to the console
    resultDF.writeStream
      .outputMode(OutputMode.Update()) // only print rows that changed, for easier observation
      .format("console")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      // TODO 4. Start the query and wait for termination
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
