Maven dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
    <version>3.3.1</version>
</dependency>
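The mock producers below also call the Kafka Java client and json4s directly. Both are usually pulled in transitively (kafka-clients by spark-sql-kafka-0-10, json4s by spark-sql), but if they are missing from the project, the extra dependencies would look roughly like this (versions are only illustrative):
<!-- assumed extras for the mock producers; often already present transitively -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.8.1</version>
</dependency>
<dependency>
    <groupId>org.json4s</groupId>
    <artifactId>json4s-jackson_2.12</artifactId>
    <version>3.7.0-M11</version>
</dependency>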
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

import scala.util.Random

/**
 * Simulates base-station call log data and continuously sends it to a Kafka topic.
 * Record fields:
 * station ID, caller number, callee number, call status, call time, call duration
 */
object MockStationLog {
  def main(args: Array[String]): Unit = {
    // Kafka producer configuration
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val random = new Random()
    // Weighted call statuses: most calls succeed
    val allStatus = Array(
      "fail", "busy", "barring", "success", "success", "success",
      "success", "success", "success", "success", "success", "success"
    )
    while (true) {
      val callOut: String = "1860000%04d".format(random.nextInt(10000))
      val callIn: String = "1890000%04d".format(random.nextInt(10000))
      val callStatus: String = allStatus(random.nextInt(allStatus.length))
      val callDuration = if ("success".equals(callStatus)) (1 + random.nextInt(10)) * 1000L else 0L
      // Build one random station log record
      val stationLog: StationLog = StationLog(
        "station_" + random.nextInt(10),
        callOut,
        callIn,
        callStatus,
        System.currentTimeMillis(),
        callDuration
      )
      println(stationLog.toString)
      Thread.sleep(100 + random.nextInt(100))

      val record = new ProducerRecord[String, String]("stationTopic", stationLog.toString)
      producer.send(record)
    }
    producer.close() // close the producer (unreachable while the loop above runs forever)
  }

  /**
   * Base-station call log record
   */
  case class StationLog(
    stationId: String,  // station ID
    callOut: String,    // caller number
    callIn: String,     // callee number
    callStatus: String, // call status
    callTime: Long,     // call time
    duration: Long      // call duration
  ) {
    override def toString: String = {
      s"$stationId,$callOut,$callIn,$callStatus,$callTime,$duration"
    }
  }
}
# List topics
/export/server/kafka/bin/kafka-topics.sh --list --zookeeper hadoop102:2181
# Delete topics
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper hadoop102:2181 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper hadoop102:2181 --topic etlTopic
# Create topics
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper hadoop102:2181 --replication-factor 1 --partitions 3 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper hadoop102:2181 --replication-factor 1 --partitions 3 --topic etlTopic
# Console producer (for manual testing)
/export/server/kafka/bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic stationTopic
/export/server/kafka/bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic etlTopic
# Console consumer (for manual testing)
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic stationTopic --from-beginning
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic etlTopic --from-beginning
Data flow: stationTopic -> StructuredStreaming -> etlTopic
import org.apache.spark.sql.SparkSession

object ETLTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ETLName").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // Load data from Kafka
    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("subscribe", "stationTopic")
      .load()
    val ds = df.selectExpr("CAST(value AS STRING)").as[String]

    // ETL: keep only the records of successful calls
    val result = ds.filter(_.contains("success"))

    // Write the result back to Kafka
    result.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("topic", "etlTopic")
      .option("checkpointLocation", "./ckp")
      // start the query and block until it terminates
      .start()
      .awaitTermination()

    spark.stop()
  }
}
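Besides the console consumer command shown earlier, the ETL output can also be checked from Spark itself. The sketch below (the ETLVerify object name is made up for illustration) simply reads etlTopic back and prints the raw values to the console:

import org.apache.spark.sql.SparkSession

// Sketch: dump etlTopic to the console to verify the ETL output
object ETLVerify {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ETLVerify").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("subscribe", "etlTopic")
      .load()
      .selectExpr("CAST(value AS STRING)") // keep only the message value as a string
      .writeStream
      .format("console")
      .option("truncate", "false")
      .start()
      .awaitTermination()
  }
}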
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.json4s.jackson.Json

import scala.util.Random

object MockIotDatas {
  def main(args: Array[String]): Unit = {
    // Kafka producer configuration
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)

    val deviceTypes = Array(
      "db", "bigdata", "kafka", "route", "bigdata", "db", "bigdata", "bigdata", "bigdata"
    )
    val random: Random = new Random()
    while (true) {
      val index: Int = random.nextInt(deviceTypes.length)
      val deviceId: String = s"device_${(index + 1) * 10 + random.nextInt(index + 1)}"
      val deviceType: String = deviceTypes(index)
      val deviceSignal: Int = 10 + random.nextInt(90)
      // Build one simulated device record
      val deviceData = DeviceData(deviceId, deviceType, deviceSignal, System.currentTimeMillis())
      // Serialize it to a JSON string
      val deviceJson: String = new Json(org.json4s.DefaultFormats).write(deviceData)
      println(deviceJson)
      Thread.sleep(100 + random.nextInt(500))

      val record = new ProducerRecord[String, String]("iotTopic", deviceJson)
      producer.send(record)
    }
    // close the producer (unreachable while the loop above runs forever)
    producer.close()
  }

  /**
   * Status record sent by an IoT device
   */
  case class DeviceData(
    device: String,     // device ID
    deviceType: String, // device type, e.g. a server (mysql, redis, kafka) or a router (route)
    signal: Double,     // signal strength
    time: Long          // timestamp when the record was sent
  )
}
Data flow: iotTopic -> StructuredStreaming -> console
import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SS_Kf {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Load data from Kafka
    val kafkaDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "hadoop102:9092")
      .option("subscribe", "iotTopic")
      .load()
    val valueDS: Dataset[String] = kafkaDF.selectExpr("CAST(value AS STRING)").as[String]

    // The value is a JSON string and needs to be parsed, e.g.:
    // {"device":"device_30","deviceType":"kafka","signal":77.0,"time":1610158709534}
    // Process the data
    val schemaDF: DataFrame = valueDS.filter(StringUtils.isNotBlank(_))
      .select(
        get_json_object($"value", "$.device").as("device_id"),
        get_json_object($"value", "$.deviceType").as("deviceType"),
        get_json_object($"value", "$.signal").cast(DoubleType).as("signal")
      )

    // SQL style
    schemaDF.createOrReplaceTempView("t_iot")
    val sql: String =
      """
        |select deviceType, count(*) as counts, avg(signal) as avgsignal
        |from t_iot
        |where signal > 30
        |group by deviceType
        |""".stripMargin
    val result1: DataFrame = spark.sql(sql)

    // DSL style
    val result2: DataFrame = schemaDF.filter('signal > 30)
      .groupBy('deviceType)
      .agg(
        count('device_id) as "counts",
        avg('signal) as "avgsignal"
      )

    // TODO 3. Write the results to the console
    result1.writeStream
      .format("console")
      .outputMode("complete")
      .start()

    // TODO 4. Start the second query and wait for termination
    result2.writeStream
      .format("console")
      .outputMode("complete")
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
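Note that the code above starts two streaming queries but only calls awaitTermination() on the second one, so a failure of the first query would go unnoticed. When several queries share one SparkSession, the StreamingQueryManager can wait on all of them; a minimal sketch of that variant, reusing result1, result2 and spark from the object above:

// Sketch: start both queries, then block until any one of them terminates
val query1 = result1.writeStream.format("console").outputMode("complete").start()
val query2 = result2.writeStream.format("console").outputMode("complete").start()
spark.streams.awaitAnyTermination()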
Compute the last 10 seconds of data every 5 seconds.
Data may arrive late: for example, data produced at 1:20 may not arrive until 1:30.
Event-time window aggregation plus a watermark handles such late-arriving data.
import spark.implicits._

val words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String }

// Group the data by window and word and compute the count of each group
val windowedCounts = words
  .withWatermark("timestamp", "10 minutes")
  .groupBy(
    window($"timestamp", "10 minutes", "5 minutes"),
    $"word")
  .count()
import java.sql.Timestamp

import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.{DataFrame, SparkSession}

object WindowTest {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import org.apache.spark.sql.functions._
    import spark.implicits._

    // Load data from a socket
    val socketDF: DataFrame = spark.readStream
      .format("socket")
      .option("host", "hadoop102")
      .option("port", 9999)
      .load()

    // Process the data
    val wordDF = socketDF
      .as[String]
      .filter(StringUtils.isNotBlank(_))
      // Split each line, e.g.: 2019-10-12 09:00:02,cat
      .map(line => {
        val arr = line.trim.split(",")
        val timestampStr: String = arr(0)
        val word: String = arr(1)
        (Timestamp.valueOf(timestampStr), word)
      })
      // Name the columns
      .toDF("timestamp", "word")

    // Compute the last 10s of data every 5s, with the watermark set to 10s
    val resultDF = wordDF
      // withWatermark(event-time column, lateness threshold)
      .withWatermark("timestamp", "10 seconds")
      .groupBy(
        // Event-time window aggregation: WordCount
        // window(event-time column, window length, slide interval)
        window($"timestamp", "10 seconds", "5 seconds"),
        $"word")
      .count()

    // TODO 3. Write the result to the console
    resultDF.writeStream
      .outputMode(OutputMode.Update()) // only output rows that changed, to make the result easier to read
      .format("console")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      // TODO 4. Start the query and wait for termination
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
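To feed the socket source, assuming netcat is installed on hadoop102, start a listener before launching WindowTest and type lines in the timestamp,word format expected by the parser above:

# run on hadoop102 before starting WindowTest, then type lines such as:
#   2019-10-12 09:00:02,cat
#   2019-10-12 09:00:03,dog
nc -lk 9999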