We need to define a custom sink component. This is actually quite simple: just extend ForeachWriter and implement its open, process and close methods.
import org.apache.spark.sql.{ForeachWriter, Row}
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

class RedisSink extends ForeachWriter[Row] {

  var jedisPool: JedisPool = _
  var jedis: Jedis = _

  // open() is called once per partition for each epoch; set up the connection here
  override def open(partitionId: Long, version: Long): Boolean = {
    val config: JedisPoolConfig = new JedisPoolConfig()
    config.setMaxTotal(20)
    config.setMaxIdle(5)
    config.setMaxWaitMillis(1000)
    config.setMinIdle(2)
    config.setTestOnBorrow(false)
    val jedisIp = "172.16.1.204"
    val jedisPassword = "xxxxx"
    jedisPool = new JedisPool(config, jedisIp, 6379, 60000, jedisPassword)
    jedis = jedisPool.getResource
    true
  }

  // write each row to Redis
  override def process(value: Row): Unit = {
    jedis.rpush("streamingTest", value.get(0) + "," + value.get(1) + "," + value.get(2))
  }

  // close the connection
  override def close(errorOrNull: Throwable): Unit = {
    jedis.close()
  }
}
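Note that open() runs once per partition for every epoch, so the RedisSink above builds a fresh JedisPool each time. If that overhead matters, one common alternative is to keep a single lazily initialized pool per executor JVM and only borrow a connection in open(). Below is a minimal sketch under that assumption, reusing the placeholder host and password from above; the RedisConnection helper and the PooledRedisSink name are illustrative, not part of the original code.

import org.apache.spark.sql.{ForeachWriter, Row}
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

// Hypothetical helper: one pool per executor JVM, shared by all partitions
object RedisConnection {
  lazy val pool: JedisPool = {
    val config = new JedisPoolConfig()
    config.setMaxTotal(20)
    new JedisPool(config, "172.16.1.204", 6379, 60000, "xxxxx") // placeholder host/password
  }
}

class PooledRedisSink extends ForeachWriter[Row] {
  var jedis: Jedis = _

  override def open(partitionId: Long, version: Long): Boolean = {
    jedis = RedisConnection.pool.getResource // borrow from the shared pool
    true
  }

  override def process(value: Row): Unit = {
    jedis.rpush("streamingTest", value.get(0) + "," + value.get(1) + "," + value.get(2))
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (jedis != null) jedis.close() // returns the connection to the pool
  }
}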
The HbaseSink component:
import java.util.Collections

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Result, Table}
import org.apache.hadoop.hbase.client.coprocessor.Batch
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{ForeachWriter, Row}

class HbaseSink extends ForeachWriter[Row] {

  val table = "xxx"
  var connection: Connection = _
  var htable: Table = _

  // open() is called once per partition for each epoch; create the HBase connection here
  override def open(partitionId: Long, version: Long): Boolean = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("spark.executor.memory", "3000m")
    conf.set("hbase.zookeeper.quorum", "192.168.1.11:2181,192.168.1.12:2181")
    conf.set("zookeeper.znode.parent", "/hbase-unsecure")
    conf.set(TableInputFormat.INPUT_TABLE, table)
    connection = ConnectionFactory.createConnection(conf)
    htable = connection.getTable(TableName.valueOf(table))
    true
  }

  override def process(value: Row): Unit = {
    // use the device id as the rowkey
    val put = new Put(Bytes.toBytes(value.getString(0)))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("type"), Bytes.toBytes(value.getString(1)))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("score"), Bytes.toBytes(value.getDouble(2)))
    val results = new Array[AnyRef](1)
    htable.batchCallback(Collections.singletonList(put), results, new Batch.Callback[Result] {
      override def update(bytes: Array[Byte], bytes1: Array[Byte], r: Result): Unit = {
        println("Received callback for row [" + Bytes.toString(bytes1) + "]")
      }
    })
  }

  // close the table and the connection
  override def close(errorOrNull: Throwable): Unit = {
    if (htable != null) {
      htable.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
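Wiring the HBase sink into a query looks the same as for the Redis sink: pass an instance to foreach() on the writeStream builder. A small sketch, assuming the frame DataFrame built in the main program below; the checkpoint path is a placeholder.

val hbaseQuery = frame
  .writeStream
  .outputMode("append")
  .option("checkpointLocation", "/tmp/checkpoint/hbase") // placeholder path
  .foreach(new HbaseSink())
  .start()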
Writing the main program:
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

object KafkaStreaming {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(s"${this.getClass.getSimpleName}").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    val topic = "kafka"
    import spark.implicits._

    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.1.10:6667,192.168.1.11:6667")
      .option("subscribe", topic)
      .load()

    var batchId: Long = 0
    // add a listener whose callbacks asynchronously report the query's progress
    spark.streams.addListener(new StreamingQueryListener() {
      override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {}

      override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
        val progress: StreamingQueryProgress = event.progress
        batchId = progress.batchId
        val inputRowsPerSecond: Double = progress.inputRowsPerSecond
        val processRowsPerSecond: Double = progress.processedRowsPerSecond
        val numInputRows: Long = progress.numInputRows
        println("batchId=" + batchId + " numInputRows=" + numInputRows + " inputRowsPerSecond=" + inputRowsPerSecond +
          " processRowsPerSecond=" + processRowsPerSecond)
      }

      override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {}
    })

    // schema of the parsed device records (defined here but not used below)
    val struct =
      StructType(
        StructField("device", StringType, nullable = true) ::
          StructField("deviceType", StringType, nullable = false) ::
          StructField("signal", DoubleType, nullable = false) :: Nil)

    val kafkaDf: Dataset[(String, String)] = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").as[(String, String)]
    // keep only well-formed "device,deviceType,signal" records
    val value = kafkaDf.filter(_._2.split(",").length == 3)
    val deviceDf: DataFrame = value.map(line => {
      val arr = line._2.split(",")
      DeviceData(arr(0), arr(1), arr(2).toDouble)
    }).withColumnRenamed("device", "id")
      .withColumnRenamed("deviceType", "type")
      .withColumnRenamed("signal", "score")

    deviceDf.createOrReplaceTempView("test")
    val frame = spark.sql("select * from test").where("score>0.5")
    // val data = frame.groupBy("id").count() // an aggregation like this would need outputMode("complete") or "update"

    val query = frame
      .writeStream
      .outputMode("append")
      .foreach(new RedisSink())
      .start()
    query.awaitTermination()
  }
}
// Note: the following Kafka parameters must not be set, otherwise the Kafka source will throw an exception:
// group.id - the Kafka source automatically creates a unique group id for each query
// auto.offset.reset - Structured Streaming manages offsets internally while consuming, so that no data is lost when subscribing to dynamic topics; use startingOffsets instead. startingOffsets only applies to the first start of a query; afterwards the saved offsets are read automatically.
// key.deserializer, value.deserializer, key.serializer, value.serializer - keys and values are always handled as byte arrays (ByteArraySerializer/ByteArrayDeserializer)
// enable.auto.commit - the Kafka source does not commit any offsets
// interceptor.classes - the Kafka source always reads data as binary arrays, so no interceptors can be applied
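By contrast, options such as startingOffsets, maxOffsetsPerTrigger and failOnDataLoss can be set on the Kafka source. A small sketch reusing the spark session and broker addresses from the main program above; the values are illustrative, not recommendations.

// Options that are safe to tune on the Kafka source (values are illustrative)
val tunedDf = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "192.168.1.10:6667,192.168.1.11:6667")
  .option("subscribe", "kafka")
  .option("startingOffsets", "earliest")   // only honored on the very first start of a query
  .option("maxOffsetsPerTrigger", "10000") // cap on records pulled per micro-batch
  .option("failOnDataLoss", "false")       // do not fail the query when offsets have expired
  .load()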
case class DeviceData(device: String, deviceType: String, signal: Double)