1. Querying Messages by Timestamp
(1) Consuming from a timestamp with the new Kafka consumer
Kafka added time-based index files in version 0.10.1.0, so messages can now be looked up by timestamp. Suppose, for example, that we want to start consuming from the offsets that were current half an hour ago. Example code:
package com.bonc.rdpe.kafka110.consumer;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;

public class TimestampConsumer {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "rdpecore4:9092,rdpecore5:9092,rdpecore6:9092");
        props.put("group.id", "dev3-yangyunhe-topic001-group001");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        String topic = "dev3-yangyunhe-topic001";
        try {
            // Fetch the partition metadata for the topic
            List<PartitionInfo> partitionInfos = consumer.partitionsFor(topic);
            List<TopicPartition> topicPartitions = new ArrayList<>();
            Map<TopicPartition, Long> timestampsToSearch = new HashMap<>();
            DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            Date now = new Date();
            long nowTime = now.getTime();
            System.out.println("Current time: " + df.format(now));
            // The timestamp of 30 minutes ago
            long fetchDataTime = nowTime - 1000 * 60 * 30;
            for (PartitionInfo partitionInfo : partitionInfos) {
                TopicPartition tp = new TopicPartition(partitionInfo.topic(), partitionInfo.partition());
                topicPartitions.add(tp);
                timestampsToSearch.put(tp, fetchDataTime);
            }
            consumer.assign(topicPartitions);
            // For each partition, look up the earliest offset whose timestamp is >= 30 minutes ago
            Map<TopicPartition, OffsetAndTimestamp> map = consumer.offsetsForTimes(timestampsToSearch);
            OffsetAndTimestamp offsetTimestamp = null;
            System.out.println("Start setting the initial offset of each partition...");
            for (Map.Entry<TopicPartition, OffsetAndTimestamp> entry : map.entrySet()) {
                // If the queried timestamp is later than the timestamp of the last
                // indexed record, the value is null
                offsetTimestamp = entry.getValue();
                if (offsetTimestamp != null) {
                    int partition = entry.getKey().partition();
                    long timestamp = offsetTimestamp.timestamp();
                    long offset = offsetTimestamp.offset();
                    System.out.println("partition = " + partition +
                            ", time = " + df.format(new Date(timestamp)) +
                            ", offset = " + offset);
                    // Seek to the offset from which consumption should start
                    consumer.seek(entry.getKey(), offset);
                }
            }
            System.out.println("Finished setting the initial offset of each partition...");
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println("partition = " + record.partition() + ", offset = " + record.offset());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            consumer.close();
        }
    }
}
Output:
Current time: 2018-07-16 10:15:09
Start setting the initial offset of each partition...
partition = 2, time = 2018-07-16 09:45:10, offset = 727
partition = 0, time = 2018-07-16 09:45:09, offset = 727
partition = 1, time = 2018-07-16 09:45:10, offset = 727
Finished setting the initial offset of each partition...
partition = 1, offset = 727
partition = 1, offset = 728
partition = 1, offset = 729
......
partition = 2, offset = 727
partition = 2, offset = 728
partition = 2, offset = 729
......
partition = 0, offset = 727
partition = 0, offset = 728
partition = 0, offset = 729
......
- Note: when querying messages by timestamp, the consumer must be subscribed to the topic with assign() rather than subscribe().
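If you still want consumer-group management via subscribe(), a common workaround is to do the timestamp lookup inside a ConsumerRebalanceListener and seek each partition as soon as it is assigned. The following is a minimal sketch rather than code from the original article; it reuses the brokers and topic above, and the group id dev3-yangyunhe-topic001-group002 is made up:

package com.bonc.rdpe.kafka110.consumer;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.TopicPartition;

public class TimestampSubscribeConsumer {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "rdpecore4:9092,rdpecore5:9092,rdpecore6:9092");
        props.put("group.id", "dev3-yangyunhe-topic001-group002");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Collections.singletonList("dev3-yangyunhe-topic001"), new ConsumerRebalanceListener() {
            @Override
            public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
                // Nothing to clean up in this sketch
            }
            @Override
            public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
                // Look up the offset of "30 minutes ago" only for the partitions this consumer now owns
                long fetchDataTime = System.currentTimeMillis() - 1000 * 60 * 30;
                Map<TopicPartition, Long> timestampsToSearch = new HashMap<>();
                for (TopicPartition tp : partitions) {
                    timestampsToSearch.put(tp, fetchDataTime);
                }
                for (Map.Entry<TopicPartition, OffsetAndTimestamp> entry
                        : consumer.offsetsForTimes(timestampsToSearch).entrySet()) {
                    // The value is null when the partition has no message at or after the timestamp
                    if (entry.getValue() != null) {
                        consumer.seek(entry.getKey(), entry.getValue().offset());
                    }
                }
            }
        });
        while (true) {
            // The seeks above take effect on the first poll after the partitions are assigned
            consumer.poll(1000);
        }
    }
}

Note that this listener runs on every rebalance, so the group will re-seek (and may re-read) whenever partitions move between consumers.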
(2) Reading a time range of Kafka data into a Spark RDD via the timestamp index
Below is a general-purpose method that reads the Kafka data lying between a point in the past and the moment the program runs, and loads it into an RDD:
package com.bonc.utils

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{KafkaUtils, OffsetRange}

import scala.collection.JavaConversions._

/**
  * Author: YangYunhe
  * Description:
  * Create: 2018-06-29 11:35
  */
object SparkKafkaUtils {

  /**
    * Load data from Kafka into an RDD
    * @param sc SparkContext
    * @param topic the Kafka topic
    * @param numDays how many days back to fetch; e.g. 3 fetches everything between
    *                this moment three days ago and now
    * @param kafkaParams Kafka configuration, used both to create the consumer and as a
    *                    parameter to KafkaUtils.createRDD
    * @return
    */
  def createKafkaRDDByTimeRange(sc: SparkContext, topic: String, numDays: Int, kafkaParams: java.util.HashMap[String, Object]): RDD[String] = {
    val startFetchTime = DateUtils.daysAgo(numDays)
    val startFetchTimeStr = DateUtils.parseLong2String(startFetchTime, DateUtils.DATE_TIME_FORMAT_STR)
    println(s"starting fetch data in kafka with time range [${startFetchTimeStr}——${DateUtils.nowStr()}]")
    val consumer = new KafkaConsumer[String, String](kafkaParams)
    val partitionInfos = consumer.partitionsFor(topic)
    val topicPartitions = scala.collection.mutable.ArrayBuffer[TopicPartition]()
    val timestampsToSearch = scala.collection.mutable.Map[TopicPartition, java.lang.Long]()
    val offsetRanges = scala.collection.mutable.ArrayBuffer[OffsetRange]()
    for (partitionInfo <- partitionInfos) {
      topicPartitions += new TopicPartition(partitionInfo.topic, partitionInfo.partition)
    }
    // The latest offset of each partition is the upper bound of the range to read
    val topicPartitionLongMap = consumer.endOffsets(topicPartitions)
    for (topicPartition <- topicPartitions) {
      timestampsToSearch(topicPartition) = startFetchTime
    }
    // The earliest offset whose timestamp is >= startFetchTime is the lower bound of the range
    val topicPartitionOffsetAndTimestampMap = consumer.offsetsForTimes(timestampsToSearch)
    // The value is null when a partition has no message at or after startFetchTime, so skip it
    for ((k, v) <- topicPartitionOffsetAndTimestampMap if v != null) {
      offsetRanges += OffsetRange.create(topic, k.partition(), v.offset(), topicPartitionLongMap.get(k))
    }
    consumer.close()
    KafkaUtils.createRDD[String, String](sc, kafkaParams, offsetRanges.toArray, PreferConsistent).map(_.value)
  }
}
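The DateUtils helper called by createKafkaRDDByTimeRange is not shown in the article. Here is a minimal sketch of what it might look like, with the method names and the format string inferred from the call sites above; it is written as a plain Java class in the same com.bonc.utils package, which the Scala code can call unchanged:

package com.bonc.utils;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class DateUtils {

    public static final String DATE_TIME_FORMAT_STR = "yyyy-MM-dd HH:mm:ss";

    /** Epoch-millisecond timestamp of this moment, numDays days ago. */
    public static long daysAgo(int numDays) {
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.DAY_OF_MONTH, -numDays);
        return calendar.getTimeInMillis();
    }

    /** Format an epoch-millisecond timestamp with the given pattern. */
    public static String parseLong2String(long timestamp, String pattern) {
        return new SimpleDateFormat(pattern).format(new Date(timestamp));
    }

    /** The current time, formatted with the default pattern. */
    public static String nowStr() {
        return parseLong2String(System.currentTimeMillis(), DATE_TIME_FORMAT_STR);
    }
}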
Usage (sc and bootstrapServers are assumed to be defined elsewhere):

import java.util.{HashMap => JHashMap}

def main(args: Array[String]): Unit = {
  val kafkaParams = new JHashMap[String, Object]()
  kafkaParams.put("bootstrap.servers", bootstrapServers)
  kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  // This loads the last 3 days of Kafka data into an RDD
  val rdd = SparkKafkaUtils.createKafkaRDDByTimeRange(sc, "topic", 3, kafkaParams)
  rdd.map(x => {
    // other processing
    ......
  })
}
2. Controlling the Consumption Rate
In some scenarios you may need to pause consumption from certain partitions and resume it only once some condition is met. The pause() method suspends fetching from the given partitions and resume() restarts it. Example code:
package com.bonc.rdpe.kafka110.consumer;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

/**
 * @author YangYunhe
 * @date 2018-07-16 15:13:11
 * @description: controlling the consumption rate
 */
public class PauseAndResumeConsumer {

    private static final DateFormat df = new SimpleDateFormat("HH");

    public static String getTimeRange() {
        // "HH" formats the hour of day as two digits; Integer.parseInt handles
        // the leading zero ("08" -> 8), so the string can be parsed directly
        int hour = Integer.parseInt(df.format(System.currentTimeMillis()));
        if (hour >= 0 && hour < 8) {
            return "00:00-08:00";
        } else if (hour >= 8 && hour < 16) {
            return "08:00-16:00";
        } else {
            return "16:00-00:00";
        }
    }

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "rdpecore4:9092,rdpecore5:9092,rdpecore6:9092");
        props.put("group.id", "dev3-yangyunhe-topic001-group003");
        props.put("auto.offset.reset", "earliest");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        TopicPartition partition0 = new TopicPartition("dev3-yangyunhe-topic001", 0);
        TopicPartition partition1 = new TopicPartition("dev3-yangyunhe-topic001", 1);
        TopicPartition partition2 = new TopicPartition("dev3-yangyunhe-topic001", 2);
        consumer.assign(Arrays.asList(partition0, partition1, partition2));
        try {
            while (true) {
                String timeRange = getTimeRange();
                // 00:00-08:00: read only from partition0
                if ("00:00-08:00".equals(timeRange)) {
                    consumer.pause(Arrays.asList(partition1, partition2));
                    consumer.resume(Collections.singletonList(partition0));
                // 08:00-16:00: read only from partition1
                } else if ("08:00-16:00".equals(timeRange)) {
                    consumer.pause(Arrays.asList(partition0, partition2));
                    consumer.resume(Collections.singletonList(partition1));
                // 16:00-00:00: read only from partition2
                } else {
                    consumer.pause(Arrays.asList(partition0, partition1));
                    consumer.resume(Collections.singletonList(partition2));
                }
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println("topic = " + record.topic() + ", partition = " + record.partition());
                    System.out.println("offset = " + record.offset());
                }
            }
        } finally {
            consumer.close();
        }
    }
}
Result (the program was run at 18:27, so only messages in partition2 are consumed):
topic = dev3-yangyunhe-topic001, partition = 2
offset = 0
topic = dev3-yangyunhe-topic001, partition = 2
offset = 1
topic = dev3-yangyunhe-topic001, partition = 2
offset = 2
......
- Note: to pause or resume consumption of individual partitions, the consumer must be subscribed to the topic with assign().
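As a small supplement (not part of the original article; the class and method names here are made up for illustration), the three-branch if/else above can be generalized into a helper that pauses every assigned partition except the one that should stay active:

package com.bonc.rdpe.kafka110.consumer;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

public class PartitionGate {

    /**
     * Pause every partition currently assigned to the consumer except the
     * given one, then resume that one. Equivalent to the if/else chain in the
     * example above, but independent of the number of partitions.
     */
    public static void activateOnly(KafkaConsumer<?, ?> consumer, TopicPartition active) {
        List<TopicPartition> toPause = new ArrayList<>(consumer.assignment());
        toPause.remove(active);
        consumer.pause(toPause);
        consumer.resume(Collections.singletonList(active));
    }
}

With this helper, the 00:00-08:00 branch in the loop reduces to PartitionGate.activateOnly(consumer, partition0); and adding a fourth partition no longer requires touching the pause/resume logic.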