Kafka+SparkStreaming已经发展为一个比较成熟的实时日志收集与计算架构。利用Kafka,既可以支持将用于离线分析的数据流到HDFS,又可以同时支撑多个消费者实时消费数据,包括SparkStreaming。然而,在SparkStreaming程序中如果有复杂业务逻辑的统计,使用Scala代码实现起来比较困难,也不易于别人理解。但如果在SparkStreaming中也使用SQL来做统计分析,是不是就简单得多呢?
本文介绍将SparkSQL与SparkStreaming结合起来,使用SQL完成实时的日志数据统计。SparkStreaming程序以yarn-cluster模式运行在YARN上,不单独部署Spark集群。
环境部署
Hadoop-2.6.0-cdh5.8.0(YARN)
spark-2.1.0-bin-hadoop2.6
kafka-0.10.2+kafka2.2.0
实时统计需求
以10秒为间隔,统计10秒内的各大区潜客的数量
pom
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${spark.artifact}</artifactId>
<version>${spark.version}</version>
<scope>${dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>${dependency.scope}</scope>
</dependency>
<dependency>
<groupId>com.oracle</groupId>
<artifactId>ojdbc6</artifactId>
<version>11.2.0.3</version>
<scope>${dependency.scope}</scope>
</dependency>
SparkStreaming程序代码
package com.chumi.dac.sp.stream.sparksqlcount
import com.chumi.dac.sp.stream.jdbc.DBCustomerStream
import com.chumi.dac.sp.stream.utils.DateUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.{ SparkConf, SparkContext}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.streaming.dstream. InputDStream
/**
* Created by LHX on 2018/8/24 14:37.
*/
object CustomerStreamRsscCount {
/**
 * Holds a single, replaceable `Broadcast[String]` for the whole driver JVM.
 *
 * `getInstance` creates the broadcast on first use (double-checked locking on
 * the `@volatile` field); `update` swaps it for a new value. Both mutate the
 * field under the same lock so that an update cannot race with initialisation.
 */
object BroadcastWrapper {
  // null means "not broadcast yet"; kept as a nullable @volatile field so the
  // unsynchronised fast-path read in getInstance is safe.
  @volatile private var instance: Broadcast[String] = null

  /**
   * Returns the current broadcast, creating it from `DateUtil.getPointTime()`
   * if none exists yet.
   *
   * @param sc context used to create the broadcast on first call
   * @return the shared broadcast variable
   */
  def getInstance(sc: SparkContext): Broadcast[String] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          // Only evaluated when the broadcast actually has to be created.
          val pointTime: String = DateUtil.getPointTime()
          instance = sc.broadcast(pointTime)
          println("==初始化全局变量==" + pointTime)
        }
      }
    }
    instance
  }

  /**
   * Replaces the broadcast value, unpersisting the previous one if present.
   *
   * @param sc       context used to create the new broadcast
   * @param blocking passed to `unpersist` of the previous broadcast
   * @param hv       the new value to broadcast
   * @return the newly created broadcast variable
   */
  def update(sc: SparkContext, blocking: Boolean = false, hv: String): Broadcast[String] =
    synchronized {
      if (instance != null) {
        instance.unpersist(blocking)
      }
      instance = sc.broadcast(hv)
      println("==更新==" + hv)
      instance
    }
}
/**
 * Lazily instantiated `SQLContext` shared by every batch.
 *
 * The instance is obtained from the `SparkSession` of the given context
 * instead of the `SQLContext` constructor, which is deprecated since Spark 2.0.
 */
object SQLContextSingleton {
  @transient private var instance: SQLContext = _

  /**
   * Returns the shared `SQLContext`, creating it on first call.
   *
   * @param sparkContext context whose configuration is used to get or create
   *                     the underlying `SparkSession`
   * @return the shared `SQLContext`
   */
  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      // Fully qualified so that this block needs no extra file-level import.
      instance = org.apache.spark.sql.SparkSession
        .builder()
        .config(sparkContext.getConf)
        .getOrCreate()
        .sqlContext
    }
    instance
  }
}
/**
 * One record of the incoming log stream, built from fields 0, 1 and 9 of the
 * comma-separated Kafka message value.
 *
 * The field names become the column names of the `TT_CUSTOMER` temp view, so
 * they must match the names used in the SQL (`ENTITY_CODE` is the join key
 * against `TM_SST.ENTITYCODE`).
 */
case class DapLog(CIM_ID:String, ENTITY_CODE:String, CARD_FOUND_TIME:String)
/** Checkpoint directory; the same path is used for writing checkpoints and for recovery. */
private val CheckpointDir = "C:/tmp/checkPointPath"

/** Connection options shared by all Oracle dimension tables. */
private val JdbcBaseOptions: Map[String, String] = Map(
  "url" -> "jdbc:oracle:thin:@//IP:1521/test",
  "user" -> "user",
  "password" -> "password",
  "driver" -> "oracle.jdbc.driver.OracleDriver")

/** Loads `table` over JDBC and registers it as a temp view with the same name. */
private def registerJdbcView(sqlContext: SQLContext, table: String): Unit =
  sqlContext.read
    .format("jdbc")
    .options(JdbcBaseOptions + ("dbtable" -> table))
    .load()
    .createOrReplaceTempView(table)

/** Parses one comma-separated record; returns None when it has fewer than 10 fields. */
private def parseLog(line: String): Option[DapLog] = {
  // limit = -1 keeps trailing empty fields so column positions stay stable.
  val fields = line.split(",", -1)
  if (fields.length > 9) Some(DapLog(fields(0), fields(1), fields(9))) else None
}

/**
 * Entry point: consumes `topic1` from Kafka in 10-second batches, joins every
 * batch with the Oracle tables `TM_SST` and `TM_RSSC` through Spark SQL, and
 * prints the running total per RSSC name.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // Builds the streaming graph; only invoked when no checkpoint can be recovered.
  def functionToCreateContext(): StreamingContext = {
    val conf = new SparkConf()
      .setAppName("CustomerStreamRsscCount")
      // Default to local mode only when spark-submit did not provide a master,
      // so the same jar can also run on YARN.
      .setIfMissing("spark.master", "local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))
    // updateStateByKey requires checkpointing to be enabled.
    ssc.checkpoint(CheckpointDir)

    val topics = "topic1" // stream_test01 topic1
    val topicsSet = topics.split(",").toSet
    val brokers = "IP:9095"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "auto.offset.reset" -> "latest",
      "sasl.kerberos.service.name" -> "kafka",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testgroup")

    val dStream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams))

    val sql = "select R.RSSC_ID,R.RSSC_NAME,COUNT(1) FROM TT_CUSTOMER T " +
      "join TM_SST S on T.ENTITY_CODE = S.ENTITYCODE " +
      "join TM_RSSC R ON S.RSSC_ID = R.RSSC_ID " +
      "GROUP BY R.RSSC_ID,R.RSSC_NAME"

    val value = dStream.transform { rdd =>
      // Resolve the context inside the closure and take the implicits from it,
      // so no driver-side context object is captured by the DStream function.
      val sqlC = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlC.implicits._

      // Null values and records with too few fields are skipped instead of
      // failing the whole batch.
      val logDataFrame = rdd
        .flatMap(record => Option(record.value()).flatMap(parseLog).toList)
        .toDF()
      // Expose the current batch to SQL.
      logDataFrame.createOrReplaceTempView("TT_CUSTOMER")

      // Row layout: (RSSC_ID, RSSC_NAME, COUNT) -> (name, count).
      sqlC.sql(sql).rdd.map(r => (r(1).toString, r(2).toString.toInt))
    }

    // Adds the counts of the latest batch to the total accumulated so far.
    val addFunction: (Seq[Int], Option[Int]) => Option[Int] =
      (batchCounts, previousTotal) => Some(batchCounts.sum + previousTotal.getOrElse(0))

    val total = value.updateStateByKey[Int](addFunction)
    // Print the running totals.
    total.print()
    ssc
  }

  // Recover the StreamingContext from the checkpoint if one exists, otherwise create it.
  val context = StreamingContext.getOrCreate(CheckpointDir, functionToCreateContext _)

  // Temp views live in the SQL session, not in the checkpoint, so they are
  // registered on every start, whether the context was created or recovered.
  val sqlContext = SQLContextSingleton.getInstance(context.sparkContext)
  registerJdbcView(sqlContext, "TM_SST")
  registerJdbcView(sqlContext, "TM_RSSC")

  context.start()
  context.awaitTermination()
}
}
总结
其中广播变量是后期根据时间筛选时使用的(本文的示例代码中尚未用到)。整体思路是:先读取Oracle数据并注册成临时表,再获取Kafka数据,通过dStream.transform()方法把数据转换成想要的结果,最后用updateStateByKey()方法把本批次的统计结果累加到之前的结果上。对于初学者来说,很多SparkStreaming的方法还不是很熟悉,写代码时往往想不到去使用;如果本文对大家有所帮助,记得点赞哦~