Spark Streaming currently comes in two flavors: DStreams, which you program against RDDs, and Structured Streaming, which you program against DataFrames/Datasets. Structured Streaming is the officially recommended API these days, so pick whichever fits your needs. This post covers how to connect PySpark Streaming to Kafka and how to work around the fact that you cannot use a group id.
Versions used: Spark 2.4.3, Scala 2.11, local Kafka 2.1.0, and production Kafka 0.10.
test.py:
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    sc = SparkContext(appName="test")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 20)
    kafka_params = {"metadata.broker.list": "xxxxx:9092,xxxxx:9092,xxxxx:9092"}
    kafkaStream = KafkaUtils.createDirectStream(ssc, ["mytopic"], kafka_params,
                                                valueDecoder=lambda x: json.loads(x.decode("utf-8")))
    kafkaStream.map(lambda x: (x[1].get("userId"), len(x[1].get("lifeIds")), x[1].get("createDate"))).pprint()
    ssc.start()
    ssc.awaitTermination()
Run:
spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.3 test.py
The KafkaUtils used above has been deprecated since Spark 2.3.0, but it still works. Note that it requires the org.apache.spark:spark-streaming-kafka-0-8_2.11 package; you cannot use org.apache.spark:spark-streaming-kafka-0-10_2.11, because the Python API of Spark Streaming does not support the Kafka 0.10 integration. If you want Kafka 0.10, you have to use Structured Streaming.
test.py:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "test") \
    .load()
# df = df.rdd.map(lambda x: x.split(" ")).toDF()
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
df = df.withColumn("s", F.split(df['value'], " "))
df = df.withColumn('e', F.explode(df['s']))
q = df.writeStream \
    .format("console") \
    .trigger(processingTime='30 seconds') \
    .start()
q.awaitTermination()
Run:
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 test.py
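As an aside (this is a built-in option of the Kafka source, not something from the setup above): the Structured Streaming reader accepts a startingOffsets option that controls where a query with no prior state begins reading. It can make the first run start from the earliest available data instead of "now", but it still gives you no consumer group whose progress you can track. A minimal variant of the reader above:

# Hypothetical tweak to the reader shown above: start from the earliest offsets
# instead of the default "latest". This only affects a query with no prior state;
# it does not record which offsets were actually consumed.
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "test") \
    .option("startingOffsets", "earliest") \
    .load()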
The biggest problem with both approaches above is that you cannot set a group id, so there is no way to track the offsets you have already consumed. In my tests, every run of the script started consuming from the current time, which means data produced earlier might never be consumed. On top of that, PySpark DStreams does not support our production Kafka 0.10. The workaround I ended up with is to set and persist the offsets manually.
The idea behind managing offsets manually:
(1) Save the Kafka offsets to HBase. Create a table in HBase to hold the offset data, laid out as follows:
# DDL:
create 'stream_kafka_offsets', {NAME=>'offsets', TTL=>2592000}
# ROW LAYOUT:
row:           <topic_name>_<group_id>
column family: offsets
qualifier:     <partition_id>
value:         <offset>
The TTL is set to 30 days, the row key is the topic name plus the group id, the column family is offsets, the qualifier is the partition id, and the value is the offset.
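The HappyBaseUtil used in the snippets below is a small in-house wrapper around happybase and is not shown in this post; here is a minimal sketch of what such a wrapper might look like (the Thrift host, port, and method bodies are my assumptions, not the original code):

import happybase

class HappyBaseUtil:
    """Rough sketch of a happybase wrapper; connection details are placeholders."""

    def __init__(self, host="xxxxx", port=9090):
        # Connect to the HBase Thrift server
        self.connection = happybase.Connection(host, port=port)

    def put(self, table_name, row_key, data):
        # data maps "family:qualifier" to a value, e.g. {"offsets:0": "12345"}
        self.connection.table(table_name).put(row_key, data)

    def get_row(self, table_name, row):
        # Returns a dict like {b"offsets:0": b"12345"} for the given row key
        return self.connection.table(table_name).row(row)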
(2) For every batch consumed from Kafka, save the latest offsets to HBase:
def save_offsets(topic_name, group_id, offset_ranges, hbase_table_name):
    happybase_util = HappyBaseUtil()
    for offset in offset_ranges:
        happybase_util.put(hbase_table_name,
                           topic_name + "_" + group_id,
                           {"offsets:" + str(offset.partition): str(offset.untilOffset)})
This step is straightforward: it just writes the results to HBase.
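For reference, a hypothetical standalone call (the offsets below are made up) showing what save_offsets expects; in the real job the ranges come from rdd.offsetRanges(), as in step (4):

from pyspark.streaming.kafka import OffsetRange

# Pretend the last batch covered partitions 0 and 1 of "mytopic" up to these offsets
ranges = [
    OffsetRange("mytopic", 0, fromOffset=100, untilOffset=150),
    OffsetRange("mytopic", 1, fromOffset=200, untilOffset=230),
]
save_offsets("mytopic", "test-group-2", ranges, "stream_kafka_offsets")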
(3) Fetch the most recently committed offsets.
This needs to handle a few cases: the first run, where nothing has been stored in HBase yet and the starting offsets have to be fetched from Kafka; the case where new partitions have been added to the topic, so HBase knows about fewer partitions than Kafka does; and the normal case, where the offsets are read straight from HBase.
def get_last_committed_offsets(topic_name, group_id, hbase_table_name):
    # client = SimpleClient('localhost:9092')
    client = SimpleClient(["xxxxx:9092", "xxxxx:9092", "xxxxx:9092"])
    # Partition ids of the topic as reported by Kafka
    topic_partition_ids = client.get_partition_ids_for_topic(topic_name)
    happybase_util = HappyBaseUtil()
    # Partition offsets previously stored in HBase for this topic/group
    partition_offset_values = happybase_util.get_row(hbase_table_name, row=topic_name + "_" + group_id)
    if len(partition_offset_values) == 0:
        # First run: nothing in HBase yet, so ask Kafka for the latest offsets
        partitions = client.topic_partitions[topic_name]
        offset_requests = [OffsetRequestPayload(topic_name, p, -1, 1) for p in partitions.keys()]
        offsets_responses = client.send_offset_request(offset_requests)
        offsets = dict((TopicAndPartition(topic_name, r.partition), r.offsets[0]) for r in offsets_responses)
    elif len(partition_offset_values) < len(topic_partition_ids):
        # HBase has fewer partitions recorded than Kafka: new partitions were added,
        # so start the new ones from offset 0
        offsets = dict((TopicAndPartition(topic_name, int(k.decode("utf-8").split(":")[1])), int(v))
                       for k, v in partition_offset_values.items())
        extra_partitions = dict((TopicAndPartition(topic_name, i), 0)
                                for i in range(len(partition_offset_values), len(topic_partition_ids)))
        offsets.update(extra_partitions)
    else:
        # Normal case: resume from the offsets stored in HBase
        offsets = dict((TopicAndPartition(topic_name, int(k.decode("utf-8").split(":")[1])), int(v))
                       for k, v in partition_offset_values.items())
    return offsets
(4) Finally, the processing itself: pass the offsets returned above as fromOffsets when creating the stream, and after handling each batch save its offset ranges back to HBase.
if __name__ == "__main__":
    sc = SparkContext(appName="test")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 5)
    # kafka_params = {"metadata.broker.list": "localhost:9092"}
    kafka_params = {"metadata.broker.list": "xxxxx:9092,xxxxx:9092,xxxxx:9092"}
    # fromOffset = get_last_committed_offsets("test", "test-id", "stream_kafka_offsets")
    fromOffset = get_last_committed_offsets("mytopic", "test-group-2", "stream_kafka_offsets")
    # kafkaStream = KafkaUtils.createDirectStream(ssc, ["test"], kafka_params, fromOffsets=fromOffset)
    kafkaStream = KafkaUtils.createDirectStream(ssc, ["mytopic"], kafka_params, fromOffsets=fromOffset)

    def inner_func(rdd):
        rdd.foreach(lambda x: print(x))
        save_offsets("mytopic", "test-group-2", rdd.offsetRanges(), "stream_kafka_offsets")

    kafkaStream.foreachRDD(inner_func)
    ssc.start()
    ssc.awaitTermination()
Then run:
spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.3 test.py
Full code:
test.py:
from kafka import SimpleClient
from kafka.structs import OffsetRequestPayload
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

from algo_core.utils.hbase_util import HappyBaseUtil  # my own wrapper around happybase


def save_offsets(topic_name, group_id, offset_ranges, hbase_table_name):
    happybase_util = HappyBaseUtil()
    for offset in offset_ranges:
        happybase_util.put(hbase_table_name,
                           topic_name + "_" + group_id,
                           {"offsets:" + str(offset.partition): str(offset.untilOffset)})


def get_last_committed_offsets(topic_name, group_id, hbase_table_name):
    # client = SimpleClient('localhost:9092')
    client = SimpleClient(["xxxxx:9092", "xxxxx:9092", "xxxxx:9092"])
    # Partition ids of the topic as reported by Kafka
    topic_partition_ids = client.get_partition_ids_for_topic(topic_name)
    happybase_util = HappyBaseUtil()
    # Partition offsets previously stored in HBase for this topic/group
    partition_offset_values = happybase_util.get_row(hbase_table_name, row=topic_name + "_" + group_id)
    if len(partition_offset_values) == 0:
        # First run: nothing in HBase yet, so ask Kafka for the latest offsets
        partitions = client.topic_partitions[topic_name]
        offset_requests = [OffsetRequestPayload(topic_name, p, -1, 1) for p in partitions.keys()]
        offsets_responses = client.send_offset_request(offset_requests)
        offsets = dict((TopicAndPartition(topic_name, r.partition), r.offsets[0]) for r in offsets_responses)
    elif len(partition_offset_values) < len(topic_partition_ids):
        # HBase has fewer partitions recorded than Kafka: new partitions were added,
        # so start the new ones from offset 0
        offsets = dict((TopicAndPartition(topic_name, int(k.decode("utf-8").split(":")[1])), int(v))
                       for k, v in partition_offset_values.items())
        extra_partitions = dict((TopicAndPartition(topic_name, i), 0)
                                for i in range(len(partition_offset_values), len(topic_partition_ids)))
        offsets.update(extra_partitions)
    else:
        # Normal case: resume from the offsets stored in HBase
        offsets = dict((TopicAndPartition(topic_name, int(k.decode("utf-8").split(":")[1])), int(v))
                       for k, v in partition_offset_values.items())
    return offsets


if __name__ == "__main__":
    sc = SparkContext(appName="test")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 5)
    # kafka_params = {"metadata.broker.list": "localhost:9092"}
    kafka_params = {"metadata.broker.list": "xxxxx:9092,xxxxx:9092,xxxxx:9092"}
    # fromOffset = get_last_committed_offsets("test", "test-id", "stream_kafka_offsets")
    fromOffset = get_last_committed_offsets("mytopic", "test-group-2", "stream_kafka_offsets")
    # kafkaStream = KafkaUtils.createDirectStream(ssc, ["test"], kafka_params, fromOffsets=fromOffset)
    kafkaStream = KafkaUtils.createDirectStream(ssc, ["mytopic"], kafka_params, fromOffsets=fromOffset)

    def inner_func(rdd):
        rdd.foreach(lambda x: print(x))
        save_offsets("mytopic", "test-group-2", rdd.offsetRanges(), "stream_kafka_offsets")

    kafkaStream.foreachRDD(inner_func)
    ssc.start()
    ssc.awaitTermination()