To use this integration, add the Maven artifact spark-streaming-kafka-0-8_2.11 to the project. The KafkaUtils object provided by the package can create a DStream from your Kafka messages on either a StreamingContext or a JavaStreamingContext.
Two core classes: KafkaUtils and KafkaCluster (the latter reads and writes the consumer offsets kept in ZooKeeper).
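Before the offset-managed version below, a minimal sketch of the simplest way to use KafkaUtils may help; the broker address and topic name are assumptions (reused from the example that follows), and no offsets are committed anywhere in this variant:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SimpleDirectStream {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SimpleDirectStream")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    // "bootstrap.servers" (or "metadata.broker.list") is the only Kafka parameter the direct stream requires
    val kafkaParams = Map("bootstrap.servers" -> "hadoop102:9092")
    // Each record arrives as a (key, value) pair; since nothing is saved, a restart
    // resumes from wherever auto.offset.reset points (the latest offset by default)
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("first"))
    stream.map(_._2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}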
1) Requirement: read data from Kafka with Spark Streaming, run a simple computation on it (a WordCount, for example), and print the result to the console.
(1) Import the dependency
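The pom.xml entry below is a sketch for the artifact named above; the version is an assumption and should match the Spark version the project builds against:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <!-- assumed: keep this in sync with your spark-core/spark-streaming version -->
    <version>${spark.version}</version>
</dependency>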
(2) Write the code
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
object KafkaStreaming {

  def main(args: Array[String]): Unit = {
    //Create the SparkConf object
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("KafkaStreaming")
    //Create the StreamingContext object
    val ssc: StreamingContext = new StreamingContext(sparkConf, Seconds(3))

    //Declare the Kafka connection parameters
    val brokers = "hadoop102:9092,hadoop103:9092,hadoop104:9092"
    val topic = "first"
    val group = "bigdata"
    //The consumer needs a deserializer class
    val deserialization = "org.apache.kafka.common.serialization.StringDeserializer"

    //Define the Kafka parameters
    val kafkaPara: Map[String, String] = Map[String, String](
      ConsumerConfig.GROUP_ID_CONFIG -> group,
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserialization,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserialization
    )

    //Create the KafkaCluster (used to maintain the offsets)
    val kafkaCluster = new KafkaCluster(kafkaPara)

    //Fetch the offsets saved in ZooKeeper
    val fromOffset: Map[TopicAndPartition, Long] = getOffsetFromZookeeper(kafkaCluster, group, Set(topic))

    //Create a DStream from the Kafka data, keeping only the message value
    val kafkaDStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](ssc,
      kafkaPara,
      fromOffset,
      (x: MessageAndMetadata[String, String]) => x.message())

    //Process the data (printed here for simplicity; plug in the WordCount logic as needed)
    kafkaDStream.print()

    //Commit the offsets
    offsetToZookeeper(kafkaDStream, kafkaCluster, group)

    ssc.start()
    ssc.awaitTermination()
  }
  //Read the offsets stored in ZooKeeper
  def getOffsetFromZookeeper(kafkaCluster: KafkaCluster, kafkaGroup: String, kafkaTopicSet: Set[String]): Map[TopicAndPartition, Long] = {
    // Create a Map to hold the offset of each topic-partition
    val topicPartitionOffsetMap = new mutable.HashMap[TopicAndPartition, Long]()

    // Get all partitions of the given topics
    // Either[Err, Set[TopicAndPartition]] : Left(Err) or Right(Set[TopicAndPartition])
    val topicAndPartitions: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(kafkaTopicSet)

    // If the partitions of the topics were fetched successfully
    if (topicAndPartitions.isRight) {
      // Extract the partition set
      // partitions: Set[TopicAndPartition]
      val partitions: Set[TopicAndPartition] = topicAndPartitions.right.get

      // Look up this consumer group's offsets for those partitions
      // offsetInfo: Either[Err, Map[TopicAndPartition, Long]]
      // Left(Err) or Right(Map[TopicAndPartition, Long])
      val offsetInfo: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(kafkaGroup, partitions)
      if (offsetInfo.isLeft) {
        // No offset information yet: start every partition from 0
        for (top <- partitions)
          topicPartitionOffsetMap += (top -> 0L)
      } else {
        // Otherwise store the offsets that were found
        // offsets: Map[TopicAndPartition, Long]
        val offsets: Map[TopicAndPartition, Long] = offsetInfo.right.get
        for ((top, offset) <- offsets)
          topicPartitionOffsetMap += (top -> offset)
      }
    }
    topicPartitionOffsetMap.toMap
  }
  //Commit the offsets back to ZooKeeper
  def offsetToZookeeper(kafkaDstream: InputDStream[String], kafkaCluster: KafkaCluster, kafka_group: String): Unit = {
    kafkaDstream.foreachRDD {
      rdd =>
        // Get the offset ranges carried by this RDD
        // offsetsList: Array[OffsetRange]
        // OffsetRange: topic, partition, fromOffset, untilOffset
        val offsetsList: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

        // For every OffsetRange, update the metadata stored in ZooKeeper
        for (offsets <- offsetsList) {
          val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
          // ack: Either[Err, Map[TopicAndPartition, Short]]
          // Left(Err) or Right(Map[TopicAndPartition, Short])
          val ack: Either[Err, Map[TopicAndPartition, Short]] = kafkaCluster.setConsumerOffsets(kafka_group, Map((topicAndPartition, offsets.untilOffset)))
          if (ack.isLeft) {
            println(s"Error updating the offset to Kafka cluster: ${ack.left.get}")
          } else {
            println(s"Successfully updated the offset to ${offsets.untilOffset}")
          }
        }
    }
  }
}
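(3) Test
With the job running locally, messages can be sent with the console producer that ships with Kafka (broker and topic names as assumed above):

bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic first

Each line typed into the producer should be printed by the driver within one 3-second batch, and the log lines from offsetToZookeeper confirm that the committed offsets advance after every batch.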