# Start ZooKeeper
bin/zkServer.sh start
# Start Kafka
bin/kafka-server-start.sh -daemon config/server.properties
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic stationTopic
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic etlTopic
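A quick way to confirm the topics were created (assuming the same ZooKeeper address as above):
bin/kafka-topics.sh --list --zookeeper localhost:2181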
Write mock base-station log data to the Kafka stationTopic topic:
package com.jiang.structed_streaming_kafka

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

import scala.util.Random

object MockStationLog {
  def main(args: Array[String]): Unit = {
    // Producer configuration for sending to the Kafka topic
    val props = new Properties()
    props.put("bootstrap.servers", "192.168.89.15:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val random = new Random()
    val allStatus = Array(
      "fail", "busy", "barring", "success", "success", "success",
      "success", "success", "success", "success", "success", "success"
    )

    while (true) {
      val callOut: String = "1860000%04d".format(random.nextInt(10000))
      val callIn: String = "1890000%04d".format(random.nextInt(10000))
      val callStatus: String = allStatus(random.nextInt(allStatus.length))
      val callDuration = if ("success".equals(callStatus)) (1 + random.nextInt(10)) * 1000L else 0L
      // Randomly generate one base-station log record
      val stationLog: StationLog = StationLog(
        "station_" + random.nextInt(10),
        callOut,
        callIn,
        callStatus,
        System.currentTimeMillis(),
        callDuration
      )
      println(stationLog.toString)

      Thread.sleep(100 + random.nextInt(100))
      val record = new ProducerRecord[String, String]("stationTopic", stationLog.toString)
      producer.send(record)
    }
    producer.close() // close the connection (never reached because of the infinite loop)
  }

  /**
   * Base-station call log record
   */
  case class StationLog(
    stationId: String,  // station identifier
    callOut: String,    // caller number
    callIn: String,     // callee number
    callStatus: String, // call status
    callTime: Long,     // call timestamp
    duration: Long      // call duration
  ) {
    override def toString: String = {
      s"$stationId,$callOut,$callIn,$callStatus,$callTime,$duration"
    }
  }
}
Get the success records from stationTopic:
package com.jiang.StructedStreaming

import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

/**
 * Structured Streaming integrated with Kafka:
 * consume from stationTopic ----> run ETL with Structured Streaming ---> write the ETL result to the etlTopic topic
 */
object Kafka_ETL {
  def main(args: Array[String]): Unit = {
    //TODO 0. Create the environment
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    //TODO 1. Load the data from Kafka's stationTopic topic
    val kafkaDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.89.10:9092")
      .option("subscribe", "stationTopic")
      .load()
    val ds: Dataset[String] = kafkaDF.selectExpr("CAST(value AS STRING)").as[String]

    //TODO 2. Process the data -- ETL --> keep only the "success" records
    val etlResult: Dataset[String] = ds.filter(_.contains("success"))

    //TODO 3. Write the result to Kafka's etlTopic topic
    //TODO 4. Start the query and wait for termination
    etlResult.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.89.10:9092")
      .option("topic", "etlTopic")
      .option("checkpointLocation", "./ckp")
      .start()
      .awaitTermination()

    //TODO 5. Release resources
    spark.stop()
  }
}
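To sanity-check the ETL output while the job is running, a console consumer on etlTopic can be used. This is a sketch that assumes the command is issued on the broker host; otherwise replace localhost:9092 with the broker address used in the code:
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic etlTopic --from-beginning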
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic iotTopic
(1) Mock data generator (the data is sent to the Kafka iotTopic topic)
package com.jiang.structed_streaming_kafka

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.json4s.jackson.Json

import scala.util.Random

object MockIotDatas {
  def main(args: Array[String]): Unit = {
    // Producer configuration for sending to the Kafka topic
    val props = new Properties()
    props.put("bootstrap.servers", "192.168.89.10:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val random = new Random()
    val deviceTypes = Array(
      "db", "bigdata", "kafka", "route", "bigdata", "db", "bigdata", "bigdata", "bigdata"
    )

    while (true) {
      val index: Int = random.nextInt(deviceTypes.length)
      val deviceId: String = s"device_${(index + 1) * 10 + random.nextInt(index + 1)}"
      val deviceType: String = deviceTypes(index)
      val deviceSignal: Int = 10 + random.nextInt(90)
      // Build one mock device record
      val deviceData = DeviceData(deviceId, deviceType, deviceSignal, System.currentTimeMillis())
      // Serialize it to a JSON string
      val deviceJson: String = new Json(org.json4s.DefaultFormats).write(deviceData)
      // {"device":"device_30","deviceType":"kafka","signal":77.0,"time":1610158709534}
      println(deviceJson)

      Thread.sleep(1000 + random.nextInt(500))
      val record = new ProducerRecord[String, String]("iotTopic", deviceJson)
      producer.send(record)
    }
    producer.close() // close the connection (never reached because of the infinite loop)
  }

  /**
   * Status data reported by an IoT device
   */
  case class DeviceData(
    device: String,     // device identifier
    deviceType: String, // device type, e.g. a mysql/redis/kafka server or a router (route)
    signal: Double,     // device signal strength
    time: Long          // timestamp when the data was sent
  )
}
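To verify that the JSON records are actually reaching Kafka before starting the Spark job, a console consumer on iotTopic can be used (same assumption that the command runs on the broker host):
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic iotTopic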
(2) Consume data from the Kafka iotTopic topic --> analyze it in real time with Structured Streaming --> write the results to the console
package com.jiang.structed_streaming_kafka

import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * Author Jiang
 * Desc Demonstrates Structured Streaming integrated with Kafka:
 *      consume from iotTopic --> analyze in real time with Structured Streaming --> write the results to the console
 */
object Kafka_IOT {
  def main(args: Array[String]): Unit = {
    //TODO 0. Create the environment
    // Structured Streaming is built on Spark SQL and uses DataFrame/Dataset as its abstraction, so a SparkSession is all that is needed
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "1") // keep the shuffle partition count small for this test; the default is 200 and can be tuned to the cluster size
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._
    import org.apache.spark.sql.functions._

    //TODO 1. Load the data (from the Kafka topic iotTopic)
    val kafkaDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.89.10:9092")
      .option("subscribe", "iotTopic")
      .load()
    val valueDS: Dataset[String] = kafkaDF.selectExpr("CAST(value AS STRING)").as[String]
    // {"device":"device_30","deviceType":"kafka","signal":77.0,"time":1610158709534}

    //TODO 2. Process the data
    // Requirement: for records with signal strength > 30, compute the count and average signal strength per device type
    // Parse the JSON (i.e. attach a schema: field names and types)
    // Option 1: a JSON library such as fastJson/Gson, used in later examples
    // Option 2: Spark SQL built-in functions, used here
    val schemaDF: DataFrame = valueDS.filter(StringUtils.isNotBlank(_))
      .select(
        get_json_object($"value", "$.device").as("device_id"),
        get_json_object($"value", "$.deviceType").as("deviceType"),
        get_json_object($"value", "$.signal").cast(DoubleType).as("signal")
      )

    //TODO ==== SQL
    schemaDF.createOrReplaceTempView("t_iot")
    val sql: String =
      """
        |select deviceType, count(*) as counts, avg(signal) as avgsignal
        |from t_iot
        |where signal > 30
        |group by deviceType
        |""".stripMargin
    val result1: DataFrame = spark.sql(sql)

    //TODO ==== DSL
    val result2: DataFrame = schemaDF.filter('signal > 30)
      .groupBy('deviceType)
      .agg(
        count('device_id) as "counts",
        avg('signal) as "avgsignal"
      )

    //TODO 3. Write the results to the console
    result1.writeStream
      .format("console")
      .outputMode("complete")
      //.option("truncate", false)
      .start()
      //.awaitTermination()

    //TODO 4. Start the second query and wait for termination
    // Awaiting the last query keeps the application alive; the first query keeps running in the background
    result2.writeStream
      .format("console")
      .outputMode("complete")
      //.trigger(Trigger.ProcessingTime(0))
      //.option("truncate", false)
      .start()
      .awaitTermination()

    //TODO 5. Release resources
    spark.stop()
  }
}