1. Test code: three ways to save Kafka offsets (unoptimized version): committing back to Kafka, storing offsets in Redis, and Spark Streaming checkpointing
package kafka.comsumer

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.PropUtil

/**
 * @author yanghb
 * @date 2019/7/25 10:03
 * @description Let Kafka itself maintain the offsets (commitAsync back to the consumer group); reads multiple topics
 */
object KafkaOffset {

  // load configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  val groupName: String = this.getClass.getName

  def main(args: Array[String]): Unit = {
    // create (or reuse) the SparkSession
    val spark = SparkSession.builder().appName(groupName).master("local[4]").getOrCreate()
    // val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    // set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
    // topics to read
    // val topics = Array("DC_HISTORY_STATUS_T2", "DC_HISTORY_STATUS_T")
    val topics = Array("testTopic2")
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // create the direct stream
    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    var offsetRanges = Array[OffsetRange]()
    // the code directly inside messages.foreachRDD runs on the driver
    messages.foreachRDD { kafkaRDD =>
      // skip empty batches
      if (!kafkaRDD.isEmpty()) {
        // only the KafkaRDD itself can be cast to HasOffsetRanges to obtain the offset ranges
        offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
        // extract the record values
        val kafkaData: RDD[String] = kafkaRDD.map(_.value())
        // TODO: debugging output, can be commented out
        for (o <- offsetRanges) {
          println(o)
        }
        // data processing: parse the JSON payload, convert timestamps
        kafkaData.foreachPartition(rdds => {
          // data output: batch-insert the records into Oracle
          rdds.foreach(x => {
            println(x)
          })
        })
        // commit the offsets back to the Kafka consumer group once the data has been processed
        messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
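commitAsync above is fire-and-forget: if a commit fails, nothing in this job notices. spark-streaming-kafka-0-10 also exposes an overload of commitAsync that takes an OffsetCommitCallback, which at least makes failed commits visible in the driver log. A minimal sketch of that variant, as a drop-in for the commitAsync call above:

import java.util
import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

// commit with a callback so failed commits are at least logged on the driver
messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
  override def onComplete(offsets: util.Map[TopicPartition, OffsetAndMetadata], exception: Exception): Unit = {
    if (exception != null) {
      println(s"offset commit failed: ${exception.getMessage}")
    }
  }
})

The commits are still queued and performed by the driver at a later batch, so output operations need to be idempotent if duplicates after a restart are a concern.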
package kafka.comsumer

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis
import utils.PropUtil

/**
 * @author yanghb
 * @date 2019/7/25 10:03
 * @description Offsets are stored in Redis as one "topic-partition" key per partition (topics are assumed to
 *              have 3 partitions by default). If partitions are changed or topics are added after going live,
 *              the saved Redis keys must be deleted and the data pulled again.
 */
object RedisOffset {

  // load configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  val groupName: String = this.getClass.getName

  def main(args: Array[String]): Unit = {
    // create (or reuse) the SparkSession
    val spark = SparkSession.builder().appName(groupName).master("local[4]").getOrCreate()
    // val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    // set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
    // topics to read
    // val topics = Array("DC_HISTORY_STATUS_T2", "DC_HISTORY_STATUS_T")
    val topics = Array("testTopic2", "DC_HISTORY_STATUS_T", "DC_HISTORY_STATUS_T_2")
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()
    // Redis connection; a JedisPool could be used instead
    val js = new Jedis("10.1.168.140")
    // try/finally so the Jedis connection is closed when the job stops
    try {
      // TODO: only partition 0 of the first topic is checked to decide whether saved offsets exist; better to check every topic
      if (js.exists(topics(0) + "-0")) {
        // loop over the topics
        for (i <- 0 until topics.length) {
          if (js.exists(topics(i) + "-0")) {
            // TODO: the partition count is hard-coded to 5 here and should be the actual (largest) partition count;
            // every existing "topic-partition" key is added to fromOffsets
            for (j <- 0 until 5) {
              // if the key exists, read its offset
              if (js.exists(topics(i) + "-" + j)) {
                val tp = new TopicPartition(topics(i), j)
                val offset: Long = js.get(topics(i) + "-" + j).toLong
                fromOffsets += (tp -> offset)
              }
            }
          }
        }
        // once the saved offsets of all topics are collected, create the stream from them (Assign strategy)
        kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets))
        println("Stream created from offsets stored in Redis")
      } else {
        // first run: no offsets in Redis yet, subscribe normally
        kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
        println("Stream created without saved offsets")
      }
      var offsetRanges = Array[OffsetRange]()
      // the code directly inside kafkaStream.foreachRDD runs on the driver
      kafkaStream.foreachRDD { kafkaRDD =>
        // skip empty batches
        if (!kafkaRDD.isEmpty()) {
          // only the KafkaRDD itself can be cast to HasOffsetRanges to obtain the offset ranges
          offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
          // extract the record values
          val kafkaData: RDD[String] = kafkaRDD.map(_.value())
          kafkaData.foreachPartition(da => {
            da.foreach(x => {
              // println(x)
            })
          })
          // save the offsets to Redis after the batch has been processed (this loop runs on the driver)
          for (o <- offsetRanges) {
            js.set(o.topic + "-" + o.partition, o.untilOffset.toString)
            println(o)
          }
        }
      }
      // start the streaming job while the Redis connection is still open
      ssc.start()
      ssc.awaitTermination()
    } catch {
      case ex: Exception => println(ex)
    } finally {
      // close the Redis connection once the streaming context terminates
      js.close()
    }
  }
}
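The TODOs above (hard-coded partition count, one string key per partition) can be tightened by keeping all offsets of a consumer group in a single Redis hash and restoring whatever fields exist, so new partitions or topics are picked up without guessing a partition count. A rough sketch of that layout, reusing the same Jedis client; the key name and the RedisOffsetStore helper are illustrative, not part of the original code:

import scala.collection.JavaConverters._
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Jedis

object RedisOffsetStore {
  // one hash per consumer group, field = "topic:partition", value = untilOffset
  private def key(group: String) = s"kafka:offsets:$group"

  // save the offsets of a finished batch
  def save(js: Jedis, group: String, ranges: Array[OffsetRange]): Unit =
    ranges.foreach(o => js.hset(key(group), s"${o.topic}:${o.partition}", o.untilOffset.toString))

  // restore whatever offsets exist; empty map on the first run
  def load(js: Jedis, group: String): Map[TopicPartition, Long] =
    js.hgetAll(key(group)).asScala.map { case (field, off) =>
      val Array(topic, part) = field.split(":")
      new TopicPartition(topic, part.toInt) -> off.toLong
    }.toMap
}

Usage would mirror the code above: call RedisOffsetStore.load at startup and use Subscribe when the map is empty, Assign otherwise; call RedisOffsetStore.save in place of the js.set loop.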
package kafka.comsumer

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.PropUtil

/**
 * @author yanghb
 * @date 2019/7/25 10:03
 * @description Checkpointing maintains the offsets of several topics at once and keeps data from being read
 *              again after a restart, but any code change means the checkpoint directory has to be cleared.
 */
object CheckpointOffset {

  // load configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  // val checkpointDir = prop.getProp("checkpointDir")
  val checkpointDir = "./CheckpointOffset"
  val groupName: String = this.getClass.getName

  def functionToCreateContext(): StreamingContext = {
    // create (or reuse) the SparkSession
    val spark = SparkSession.builder().appName(groupName).master("local[3]").getOrCreate()
    // val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    // set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(3))
    ssc.checkpoint(checkpointDir)
    // topics to read
    val topics = Array("testTopic2", "DC_HISTORY_STATUS_T")
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    kafkaStream.foreachRDD(kafkaRDD => {
      // TODO: debugging output, can be commented out
      val offsetRanges: Array[OffsetRange] = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
      for (o <- offsetRanges) {
        println(o)
      }
      // extract the record values
      val value: RDD[String] = kafkaRDD.map(x => {
        x.value()
      })
      // business logic
      value.foreachPartition(rdds => {
        rdds.foreach(x => {
          println(x)
        })
      })
    })
    ssc
  }

  def main(args: Array[String]): Unit = {
    // create the context, or recover it from the checkpoint directory if one exists
    val context = StreamingContext.getOrCreate(checkpointDir, functionToCreateContext _)
    // start the streaming computation
    context.start()
    context.awaitTermination()
  }
}
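Since any code change already forces the checkpoint directory to be cleared, it helps to make sure a planned restart at least does not cut a batch in half. Spark's spark.streaming.stopGracefullyOnShutdown setting lets the in-flight batch finish before the JVM exits. A minimal sketch of wiring it into the builder used above; the setting name is standard Spark configuration, the rest mirrors the existing code:

val spark = SparkSession.builder()
  .appName(groupName)
  .master("local[3]")
  // finish the in-flight batch before shutting down on SIGTERM
  .config("spark.streaming.stopGracefullyOnShutdown", "true")
  .getOrCreate()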
2. utils: configuration reader
package utils

import java.io.InputStream
import java.util.Properties

/**
 * Reads values from a properties file on the classpath.
 * @param file name of the properties file, e.g. "config.properties"
 */
class PropUtil(val file: String) {
  // load the file once instead of on every lookup, and use the file name that was passed in
  private val prop = new Properties()
  private val ipStream: InputStream = this.getClass.getResourceAsStream("/" + file)
  prop.load(ipStream)

  def getProp(key: String): String = {
    prop.getProperty(key)
  }
}
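For reference, the keys read above would live in a config.properties file on the classpath (src/main/resources). The values below are placeholders, not taken from the original post:

# config.properties (placeholder values)
ORACLE_URL=jdbc:oracle:thin:@//db-host:1521/orcl
ORACLE_USER=user
ORACLE_PASSWORD=password
KAFKA_BROKERS=broker1:9092,broker2:9092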
3. pom.xml (properties, dependencies, build plugins)
<properties>
    <scala.version>2.11.8</scala.version>
    <spark.version>2.2.0</spark.version>
    <hadoop.version>3.0.0</hadoop.version>
    <!-- 2.0.0 -->
    <ojdbc7>12.1.0.2</ojdbc7>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>1.0.0</version>
        <exclusions>
            <exclusion>
                <groupId>com.fasterxml.jackson.core</groupId>
                <artifactId>*</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-core</artifactId>
        <version>2.6.6</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>2.2.0</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <dependency>
        <groupId>com.github.noraui</groupId>
        <artifactId>ojdbc7</artifactId>
        <version>${ojdbc7}</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>