import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.{JavaConversions, mutable}
object Demo6_SparkStreaming_Kafka_Zookeeper extends LoggerTrait{
// Zookeeper client (Curator), shared by the offset read/write helpers below
val client = {
val client = CuratorFrameworkFactory.builder()
.connectString("hbase1,hbase2,hbase3")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.namespace("kafka/consumers/offsets")
.build()
client.start()
client
}
def main(args: Array[String]): Unit = {
//1. Entry point
val ssc = SparkUtils.getLocalStreamingContext("Demo6_SparkStreaming_Kafka_Zookeeper", 2)
val kafkaParams = CommonUtils.toMap("demo6.properties")
val topics = "bjbigdata1909-1".split(",").toSet
//2. Load the data from Kafka
val messages:InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
//3. Process each batch of messages
messages.foreachRDD((rdd, btime) => {
if (!rdd.isEmpty()) {
//3.1 Print the batch time and record count
println("-"*100)
println(s"bTime = ${btime}")
println("#"*50 + " " + rdd.count())
//3.2 Save the latest offsets to Zookeeper
storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Read the manually saved offsets from Zookeeper, then consume from Kafka starting at those offsets;
* if no offset information is found, consume from the beginning.
*/
def createMsg(ssc:StreamingContext, kafkaParams:Map[String, String], topics:Set[String]):InputDStream[(String, String)] = {
//1. Read the offset information from Zookeeper
val fromOffsets:Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
//2. Create the Kafka input stream
var messages:InputDStream[(String, String)] = null
//2.1 Decide which overload to use
if (fromOffsets.isEmpty) { // no offsets found: nothing was saved before, so consume from the beginning
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
} else { // offsets were found: consume from the saved positions
//2.2 Create the messageHandler
val messageHandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
//2.3 Read the data starting at the saved offsets
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
}
messages
}
/**
* Get the saved offsets for the given topics and consumer group.
* The offsets live in Zookeeper under the Curator namespace kafka/consumers/offsets, at <topic>/<group>/<partition>.
* The data stored in each partition znode is the offset itself.
*/
def getFromOffsets(topics:Set[String], group:String) : Map[TopicAndPartition, Long] = {
//1. Mutable map that collects partition -> offset
val offsets = mutable.Map[TopicAndPartition, Long]()
//2. Iterate over the topics
for(topic <- topics) {
//2.1 Path of this topic/group's offsets inside the Curator namespace
val path = s"${topic}/${group}"
//2.2 Make sure the path exists in Zookeeper
isExists(path)
//2.3 Iterate over the partitions; the Java list has to be converted to a Scala collection
for(partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
//2.3.1 This znode stores the offset of one partition
val fullPath = s"${path}/${partition}"
//2.3.2 Read the offset
val offset = new String(client.getData.forPath(fullPath)).toLong
//2.3.3 Put it into the offsets map
offsets.put(TopicAndPartition(topic, partition.toInt), offset)
}
}
offsets.toMap
}
/**
* Check whether the znode exists; create it (with parent nodes) if it does not.
*/
def isExists(path:String):Unit = {
if (client.checkExists().forPath(path) == null) { // the path does not exist yet
client.create().creatingParentsIfNeeded().forPath(path)
}
}
/**
* Save the offsets back to Zookeeper.
*/
def storeOffsets(offsetRanges:Array[OffsetRange], group:String) = {
//1. Iterate over the array of offset ranges
for(offsetRange <- offsetRanges) {
//2. Extract the topic, partition and until-offset
val topic = offsetRange.topic
val partition = offsetRange.partition
val untilOffset = offsetRange.untilOffset
//3. Build the Zookeeper path for this partition and make sure it exists
val path = s"${topic}/${group}/${partition}"
isExists(path)
//4. Write the offset into the partition znode
client.setData().forPath(path, untilOffset.toString.getBytes())
}
}
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.junit.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
/**
* Client utility class that pools HBase connections.
*/
public class HBaseUtils {
// pool holds the HBase Connection objects (a simple connection pool)
private static LinkedList<Connection> pool = new LinkedList<>();
// Initialize the pool with 5 HBase connections
static {
try {
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.rootdir", "hdfs://hbase1:9000");
conf.set("hbase.cluster.distributed", "true");
conf.set("hbase.zookeeper.quorum", "hbase1,hbase2,hbase3");
conf.set("hbase.regionserver.wal.codec", "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec");
for (int i = 0;i < 5;i++) {
pool.push(ConnectionFactory.createConnection(conf));
}
}catch (Exception e) {
e.printStackTrace();
}
}
// Borrow a connection from the pool, waiting while the pool is empty
public static Connection getConnection() {
while (pool.isEmpty()) {
try {
System.out.println("connection pool is null, please wait for a moment~~~");
Thread.sleep(1000);
}catch (Exception e) {
e.printStackTrace();
}
}
return pool.poll();
}
// Release a connection, i.e. return it to the pool
public static void release(Connection connection) {
pool.push(connection);
}
// Read the columns of the given row key and return them as a partition -> offset map
public static Map<Integer, Long> getColValue(Connection connection, TableName tableName, byte[] rk, byte[] cf) {
//1. Map that holds the final result
Map<Integer, Long> partition2Offset = new HashMap<>();
try {
//2. Get the Table object
Table table = connection.getTable(tableName);
Scan scan = new Scan();
//3. Filter: only the row whose key equals rk
Filter filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(rk));
scan.setFilter(filter);
//4. Create the scanner
ResultScanner scanner = table.getScanner(scan);
//5. Iterate over the matching rows
for (Result result : scanner) {
List<Cell> cells = result.listCells(); // all cells (qualifier/value pairs) of this row
for (Cell cell : cells) {
//col
byte[] column = CellUtil.cloneQualifier(cell);
//value
byte[] values = CellUtil.cloneValue(cell);
int partition = Integer.valueOf(new String(column));
long offset = Long.valueOf(new String(values));
partition2Offset.put(partition, offset);
}
}
return partition2Offset;
}catch (Exception e) {
e.printStackTrace();
}
return null;
}
// Write a single column/value pair into HBase
public static void set(Connection connection, TableName tableName, byte[] rk, byte[] cf, byte[] col, byte[] value) {
try {
Table table = connection.getTable(tableName);
Put put = new Put(rk);
put.addColumn(cf, col, value);
table.put(put);
table.close();
}catch (Exception e) {
e.printStackTrace();
}
}
}
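/*
 * A minimal usage sketch of HBaseUtils (not part of the original code). It assumes the
 * 'spark-topic-offset' table with column family 'cf' already exists and that HBaseUtils lives
 * in the cn.qphone.hbase.utils package; the row key, partition and offset values below are
 * illustrative only.
 */
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import java.util.Map;
import cn.qphone.hbase.utils.HBaseUtils;
public class HBaseUtilsUsageSketch {
    public static void main(String[] args) {
        Connection connection = HBaseUtils.getConnection();
        try {
            TableName tableName = TableName.valueOf("spark-topic-offset");
            byte[] rk = "bjbigdata1909-1-group1".getBytes(); // rowkey: topic-group (group name is illustrative)
            byte[] cf = "cf".getBytes();
            // write "partition 0 -> offset 42", then read the whole row back
            HBaseUtils.set(connection, tableName, rk, cf, "0".getBytes(), "42".getBytes());
            Map<Integer, Long> partition2Offset = HBaseUtils.getColValue(connection, tableName, rk, cf);
            System.out.println(partition2Offset);
        } finally {
            // always return the connection to the pool
            HBaseUtils.release(connection);
        }
    }
}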
import cn.qphone.hbase.utils.HBaseUtils
import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.{JavaConversions, mutable}
/**
* 1. Use HBase to manage the offsets manually, so that the data is consumed in order,
*    picking up where the last run left off:
*    - offsets found: start consuming from the saved positions
*    - no offsets found: start consuming from offset 0
* 2. Pull data from Kafka starting at those offsets
* 3. Process the pulled data (business logic)
* 4. Write the new offsets back to HBase
*
* create 'spark-topic-offset', 'cf'
*
* rowkey: topic-group
* column: partition => offset
*/
object Demo7_SparkStreaming_Kafka_HBase extends LoggerTrait{
def main(args: Array[String]): Unit = {
//1. Entry point
val ssc = SparkUtils.getLocalStreamingContext("Demo6_SparkStreaming_Kafka_Zookeeper", 2)
val kafkaParams = CommonUtils.toMap("demo6.properties")
val topics = "bjbigdata1909-1".split(",").toSet
//2. Load the data from Kafka
val messages:InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
//3. Process each batch of messages
messages.foreachRDD((rdd, btime) => {
if (!rdd.isEmpty()) {
//3.1 Print the batch time and record count
println("-"*100)
println(s"bTime = ${btime}")
println("#"*50 + " " + rdd.count())
//3.2 Save the latest offsets to HBase
storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Read the manually saved offsets from HBase, then consume from Kafka starting at those offsets;
* if no offset information is found, consume from the beginning.
*/
def createMsg(ssc:StreamingContext, kafkaParams:Map[String, String], topics:Set[String]):InputDStream[(String, String)] = {
//1. Read the offset information from HBase
val fromOffsets:Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
//2. Create the Kafka input stream
var messages:InputDStream[(String, String)] = null
//2.1 Decide which overload to use
if (fromOffsets.isEmpty) { // no offsets found: nothing was saved before, so consume from the beginning
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
} else { // offsets were found: consume from the saved positions
//2.2 Create the messageHandler
val messageHandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
//2.3 Read the data starting at the saved offsets
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
}
messages
}
/**
* Get the saved offsets for the given topics and consumer group.
* The offsets are stored in the HBase table 'spark-topic-offset':
* rowkey = topic-group, column qualifier = partition, cell value = offset.
*/
def getFromOffsets(topics:Set[String], group:String) : Map[TopicAndPartition, Long] = {
//1. Mutable map that collects partition -> offset
val offsets = mutable.Map[TopicAndPartition, Long]()
//1.1 Get an HBase connection from the pool
val connection = HBaseUtils.getConnection
val tableName = TableName.valueOf("spark-topic-offset")
val cf = Bytes.toBytes("cf")
//2. Iterate over the topics
for(topic <- topics) {
//2.1 Row key: topic-group
val rk = s"${topic}-${group}".getBytes()
//2.2 Read this row's partitions and their offsets
val partition2Offsets = HBaseUtils.getColValue(connection, tableName, rk, cf)
val partition2Offsets2 = JavaConversions.mapAsScalaMap(partition2Offsets)
//2.3 Iterate over the partitions; the Java map has to be converted to a Scala map
for ((k, v) <- partition2Offsets2) {
offsets.put(TopicAndPartition(topic, (k+"").toInt), v)
}
}
HBaseUtils.release(connection)
offsets.toMap
}
/**
* Save the offsets back to HBase.
*/
def storeOffsets(offsetRanges:Array[OffsetRange], group:String) = {
//0. Get an HBase connection from the pool and the table metadata
val connection = HBaseUtils.getConnection
val tableName = TableName.valueOf("spark-topic-offset")
val cf = Bytes.toBytes("cf")
//1. Iterate over the offset ranges
for(offsetRange <- offsetRanges) {
//2. Build the row key and extract the partition and until-offset
val rk = s"${offsetRange.topic}-${group}".getBytes()
val partition = offsetRange.partition
val untilOffset = offsetRange.untilOffset
//3. Write the offset to HBase
HBaseUtils.set(connection, tableName, rk, cf, (partition+"").getBytes(), (untilOffset+"").getBytes())
}
// Return the connection to the pool so it is not leaked across batches
HBaseUtils.release(connection)
}
}
package sparkStrreaming.day3
import java.util
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis
import utils.CommonUtils
import scala.collection.{JavaConversions, mutable}
object Demo8_SparkStreaming_Kafka_Redis {
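// Redis client used to store and read the Kafka offsets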
val jedis = new Jedis("mini1",6379)
def main(args: Array[String]): Unit = {
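// Entry point: local StreamingContext with a 2-second batch interval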
val ssc = new StreamingContext(new SparkConf().setMaster("local[*]").setAppName("redis"),Seconds(2))
val kafkaParams: Map[String, String] = CommonUtils.toMap("demo6.properties")
val topics="test1".split("\\s+").toSet
val messages: InputDStream[(String, String)] = createMsg(ssc,kafkaParams,topics)
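// For each non-empty batch: print some batch info, then write the latest offsets back to Redis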
messages.foreachRDD((rdd,btime)=>{
if(!rdd.isEmpty()){
println("-"*100)
println(s"btime=${btime}")
println("#"*50)
println(rdd.count())
updateOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges,kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
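/**
 * Read the manually saved offsets from Redis, then consume from Kafka starting at those offsets;
 * if no offsets are found, consume from the beginning.
 */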
def createMsg(ssc:StreamingContext,kafkaParams :Map[String,String],topics:Set[String]):InputDStream[(String,String)]={
val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics,kafkaParams("group.id"))
var messages:InputDStream[(String,String)]=null
if(fromOffsets.isEmpty){
messages = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaParams,topics)
}else{
val msgHandler=(msgHandler:MessageAndMetadata[String,String])=>(msgHandler.key(),msgHandler.message())
messages=KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder,(String,String)](ssc,kafkaParams,fromOffsets,msgHandler)
}
messages
}
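/**
 * Get the saved offsets for the given topics and consumer group.
 * They are stored in a Redis hash whose key is "topic_group"; the fields are partitions and the values are offsets.
 */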
def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition,Long]={
var offsets=mutable.Map[TopicAndPartition,Long]()
for(topic<-topics){
val key=s"${topic}_${group}"
// val str: String = jedis.get(key).toString
val string: util.Map[String, String] = jedis.hgetAll(key)
for( partition<-JavaConversions.mapAsScalaMap(string)){
offsets.put(TopicAndPartition(topic,partition._1.toInt),partition._2.toLong)
}
}
offsets.toMap
}
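/**
 * Write the latest offsets back to the Redis hash "topic_group" (field = partition, value = untilOffset).
 */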
def updateOffsets(offsetRanges: Array[OffsetRange], group: String) = {
for(offsetRange<-offsetRanges){
val topic=offsetRange.topic
val partition=offsetRange.partition
val offset=offsetRange.untilOffset
jedis.hset(s"${topic}_${group}",partition.toString,offset.toString)
}
}
}