I. Spark Streaming
1 Introduction
References
2 A first Spark Streaming example
2.1 Import the dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.2.0</version>
</dependency>
2.2 The first Spark Streaming program: word count
package cn.qphone.spark.streaming.day1
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Demo1_Streaming {
def main(args: Array[String]): Unit = {
//1. Validate the arguments
if (args == null || args.length != 2) {
println(
"""
|Usage: Demo1_Streaming <hostname> <port>
|""".stripMargin)
System.exit(-1)
}
val Array(hostname, port) = args
//2. Get the core entry class: a StreamingContext with a 5-second batch interval
val streamContext: StreamingContext = new StreamingContext(new SparkConf()
.setAppName("Demo1_Streaming").setMaster("local[*]"), Seconds(5))
//3. Business logic: word count
val lines:ReceiverInputDStream[String] = streamContext.socketTextStream(hostname, port.toInt)
val retDStream:DStream[(String, Int)] = lines.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_+_)
retDStream.print
//4. The streaming computation only runs after start() is called
streamContext.start()
//5. awaitTermination() keeps the program alive after start(); it returns when stop() is called or the job fails with an exception
streamContext.awaitTermination()
}
}
2.3 Configuration
2.4 Install netcat (a simple socket server for testing)
yum -y install nc
nc -lk qphone01 4444
With nc listening, run Demo1_Streaming with the program arguments qphone01 4444 and type words into the nc session; each 5-second batch prints its word counts.
2.5 Receiver
A Receiver, as the name suggests, is the component that receives data. With the receiver-based approach the application's resources are split in two: one part receives the data and the other part processes it. The data a Receiver collects is grouped into batches, each of which becomes an RDD stored in executor memory; the Receiver itself runs inside an executor and takes up part of it.
Not every streaming job needs a Receiver.
Execution flow of a Receiver-based program (diagram omitted from these notes): the Receiver runs continuously inside an executor and stores incoming records as blocks in executor memory; at every batch interval the blocks collected so far form an RDD, and the driver schedules jobs to process it.
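Receiver-based inputs such as socketTextStream are built on the org.apache.spark.streaming.receiver.Receiver abstraction, and a custom source can be plugged in the same way. A minimal sketch (the class name and the constant data it emits are made up for illustration):
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// A toy receiver that emits one line per second. A receiver occupies one core of an executor,
// which is why receiver-based jobs need at least two cores: one to receive, one to process.
class ConstantReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2) {
  override def onStart(): Unit = {
    new Thread("constant-receiver") {
      override def run(): Unit = {
        while (!isStopped()) {
          store("hello spark streaming") // hand one record to Spark; records are grouped into blocks and then batches
          Thread.sleep(1000)
        }
      }
    }.start()
  }
  override def onStop(): Unit = {} // the loop in onStart exits once isStopped() returns true
}
It would be used like the built-in sources: val lines = ssc.receiverStream(new ConstantReceiver).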
2.6 Spark Streaming and HDFS
package cn.qphone.spark.streaming.day1
import cn.qphone.spark.utils.LoggerTrait
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
object Demo2_HDFS extends LoggerTrait{
def main(args: Array[String]): Unit = {
//2. Get the StreamingContext
val streamContext: StreamingContext = new StreamingContext(new SparkConf()
.setAppName("Demo2_HDFS").setMaster("local[*]"), Seconds(5))
//3. Business logic: word count over files that appear in the monitored HDFS directory
val lines: DStream[String] = streamContext.textFileStream("hdfs://192.168.49.111:9000/data")
val retDStream:DStream[(String, Int)] = lines.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_+_)
retDStream.print
//4. The streaming computation only runs after start() is called
streamContext.start()
//5. awaitTermination() keeps the program alive after start(); it returns when stop() is called or the job fails with an exception
streamContext.awaitTermination()
}
}
2.7 Spark Streaming and Kafka
2.7.1 Import the dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.2.0</version>
</dependency>
2.7.2 Differences between the two integration approaches
2.7.3 ReceiverStream
2.7.3.1 SparkUtils
package cn.qphone.spark.utils
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object SparkUtils {
val DEFAULT_STREAMING_INTERVAL:Int = 5
/*
* spark core
* */
def getContext(appName:String):SparkContext = getContext(appName, "local[*]")
def getContext(appName:String, masterUrl:String):SparkContext = {
val sc = new SparkContext(
new SparkConf().setMaster(masterUrl).setAppName(appName)
)
sc
}
/*
* spark sql
* */
def getDefaultSession(appName:String):SparkSession = getSessionWithNotHiveSupport(appName, "local[*]")
def getSession(appName:String, isSupportHive:Boolean):SparkSession = {
if (isSupportHive) getSessionWithHiveSupport(appName, "local[*]")
else getSessionWithNotHiveSupport(appName, "local[*]")
}
def getSessionWithHiveSupport(appName:String, masterUrl:String):SparkSession = SparkSession.builder()
.appName(appName).master(masterUrl).enableHiveSupport().getOrCreate() // with Hive support enabled
def getSessionWithNotHiveSupport(appName:String, masterUrl:String):SparkSession = SparkSession.builder().appName(appName).master(masterUrl).getOrCreate()
/*
* spark streaming
* */
def getDefaultStreamingContext(appName:String):StreamingContext = getStreamingContext(appName, "local[*]", DEFAULT_STREAMING_INTERVAL)
def getStreamingContext(appName:String, master:String, interval:Int):StreamingContext = new StreamingContext(new SparkConf().setAppName(appName).setMaster(master), Seconds(interval))
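// Overload assumed by Demo5_sparksql_streaming in section 2.12, which builds the StreamingContext
// on top of an already created SparkContext; it is an addition to the original listing.
def getStreamingContext(sc:SparkContext, interval:Int):StreamingContext = new StreamingContext(sc, Seconds(interval))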
def stop(sparkContext: SparkContext) = {
if (null != sparkContext && !sparkContext.isStopped) {
sparkContext.stop()
}
}
def stop(sparkSession: SparkSession) = {
if (null != sparkSession && !sparkSession.sparkContext.isStopped) {
sparkSession.stop()
}
}
def stop(streamingContext:StreamingContext) = {
if (null != streamingContext && !streamingContext.sparkContext.isStopped) {
streamingContext.stop()
}
}
}
2.7.3.2 Demo3_Receiver
package cn.qphone.spark.streaming.day1
import cn.qphone.spark.utils.{LoggerTrait, SparkUtils}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
object Demo3_Kafka_Receiver extends LoggerTrait{
def main(args: Array[String]): Unit = {
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo3_Kafka_Receiver")
//2. Prepare the parameters
val zkQuorum:String = "qphone01,qphone02,qphone03/kafka"
val groupId = "streaming_receiver_hz2002"
val topics = Map(
"receiver_topic" -> 1
)
//3. Create the Kafka discretized stream (DStream) from these parameters
val msgDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics, StorageLevel.MEMORY_AND_DISK_SER_2)
//4. Business logic
msgDStream.print
//5. Start the computation
ssc.start()
//6. Wait for termination
ssc.awaitTermination()
}
}
2.7.3.3 Remaining steps
##1. Start Kafka
##2. Create the topic (receiver_topic); the kafka-topics.sh syntax is shown in section 2.9.2
##3. Run the code above
##4. Produce test data with a console producer (kafka-console-producer.sh)
2.7.3.4 Why the receiver-based approach is deprecated
##1.
This approach uses a Receiver to fetch the data. The Receiver is implemented with Kafka's high-level consumer API. The data the receiver pulls from Kafka is stored in Spark executor memory, and the jobs launched by Spark Streaming then process that data.
However, with the default configuration this approach can lose data when the underlying receiver fails.
To get a zero-data-loss guarantee you must enable Spark Streaming's write-ahead log (WAL). It synchronously writes the received Kafka data to a write-ahead log on a distributed file system such as HDFS, so even if a node fails the data can be recovered from the log.
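A minimal sketch of enabling the WAL for a receiver-based job (the checkpoint path is only an example; spark.streaming.receiver.writeAheadLog.enable is the standard switch):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("ReceiverWithWAL")
  .setMaster("local[*]")
  .set("spark.streaming.receiver.writeAheadLog.enable", "true") // write received blocks to the WAL
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("hdfs://192.168.49.111:9000/chk/wal") // the WAL lives under the checkpoint directory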
##2.
Things to keep in mind
1. Kafka topic partitions have no relationship to the RDD partitions Spark Streaming generates. Increasing the partition count in KafkaUtils.createStream only increases the number of consumer threads inside a single receiver; it does not increase Spark's processing parallelism.
2. You can create multiple Kafka input DStreams with different groups and topics and receive data in parallel through multiple receivers (see the sketch after this list).
3. If a fault-tolerant storage system such as HDFS is used and the write-ahead log is enabled, the received data is already replicated in the log, so the input stream's storage level can be set to StorageLevel.MEMORY_AND_DISK_SER (i.e. KafkaUtils.createStream(..., StorageLevel.MEMORY_AND_DISK_SER)) instead of a replicated level.
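A minimal sketch of point 2, reusing the ssc, zkQuorum, groupId and topics values from Demo3_Kafka_Receiver above (the number of receivers is an arbitrary example):
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils

val numReceivers = 3 // example value
// Start several receiver-based streams for the same group/topics and union them,
// so that data is pulled by numReceivers receivers in parallel.
val streams = (1 to numReceivers).map { _ =>
  KafkaUtils.createStream(ssc, zkQuorum, groupId, topics, StorageLevel.MEMORY_AND_DISK_SER)
}
val unioned = ssc.union(streams)
unioned.print()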
##3. Why data can be lost
2.7.4 DirectStream
package cn.qphone.spark.streaming.day2
import java.util.Properties
import cn.qphone.spark.utils.{CommonScalaUtils, CommonUtils, SparkUtils}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import scala.collection.JavaConversions
object Demo1_Kafka_Direct{
def main(args: Array[String]): Unit = {
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo4_Kafka_Direct")
//2. Prepare the parameters
val properties = new Properties()
properties.load(Demo1_Kafka_Direct.getClass.getClassLoader.getResourceAsStream("kafka.properties"))
// val kafkaParams = JavaConversions.mapAsScalaMap(CommonUtils.toMap(properties)).toMap
val kafkaParams = CommonScalaUtils.toMap(properties)
val topics = Set(
"hzbigdata2002"
)
//3. Create the Kafka discretized stream from these parameters, using the direct (receiver-less) API
val msgDStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
//4. Business logic: print the messages and log the offset range of every non-empty batch
msgDStream.print
msgDStream.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
val offsetRDD = rdd.asInstanceOf[HasOffsetRanges]
val offsetRanges = offsetRDD.offsetRanges
for(offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val fromOffset = offsetRange.fromOffset
val untilOffset = offsetRange.untilOffset
val rddcount = rdd.count()
println(s"topic:${topic}\tpartition:${partition}\tstart:${fromOffset}\tend:${untilOffset}\tcount:${rddcount}")
}
rdd.count()
}
})
//5. Start the computation
ssc.start()
//6. Wait for termination
ssc.awaitTermination()
}
}
# kafka.properties
bootstrap.servers=qphone01:9092,qphone02:9092,qphone03:9092
group.id=a_streaming
auto.offset.reset=smallest
package cn.qphone.spark.utils;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
public class CommonUtils {
public static void main(String[] args) throws IOException {
Properties properties = new Properties();
properties.load(CommonUtils.class.getClassLoader().getResourceAsStream("kafka.properties"));
Map<String, String> map = toMap(properties);
System.out.println(map);
}
public static Map<String, String> toMap(Properties properties) {
Map<String, String> map = new HashMap<>();
Set<Map.Entry<Object, Object>> entries = properties.entrySet();
for (Map.Entry<Object, Object> entry : entries) {
map.put((String) entry.getKey(), (String) entry.getValue());
}
return map;
}
}
package cn.qphone.spark.utils
import java.util
import java.util.Properties
import scala.collection.{JavaConversions, immutable, mutable}
object CommonScalaUtils {
def toMap(properties: Properties):immutable.Map[String, String] = { // scala.collection.immutable.Map
val entries: util.Set[util.Map.Entry[AnyRef, AnyRef]] = properties.entrySet() // the Properties entries live in a java.util.Set
val set: mutable.Set[util.Map.Entry[AnyRef, AnyRef]] = JavaConversions.asScalaSet(entries) // convert the java.util.Set to a scala.collection.mutable.Set
var map = mutable.Map[String, String]() // scala.collection.mutable.Map
set.foreach(entry => map.put(entry.getKey.asInstanceOf[String], entry.getValue.asInstanceOf[String]))
map.toMap
}
}
2.8 Saving offsets
2.8.1 Saving offsets in ZooKeeper
package cn.qphone.spark.utils;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
public class ZKCuratorUtils {
private static final String ZK_CONNECTIONS = "qphone01,qphone02,qphone03";
private static final int BASE_SLEEP_TIME_MS = 1000;
private static final int MAX_RETRIES = 3;
public static CuratorFramework getClient() {
CuratorFramework client = CuratorFrameworkFactory
.newClient(ZK_CONNECTIONS, new ExponentialBackoffRetry(BASE_SLEEP_TIME_MS, MAX_RETRIES));
client.start();
return client;
}
}
package cn.qphone.spark.streaming.day2
import java.util.Properties
import cn.qphone.spark.utils.{CommonScalaUtils, LoggerTrait, SparkUtils, ZKCuratorUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.zookeeper.data.Stat
import scala.collection._
import scala.collection.JavaConversions
object Demo2_Offset_ZK extends LoggerTrait{
var client = ZKCuratorUtils.getClient
def main(args: Array[String]): Unit = {
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo2_Offset_ZK")
//2. Prepare the parameters
val properties = new Properties()
properties.load(Demo2_Offset_ZK.getClass.getClassLoader.getResourceAsStream("kafka.properties"))
val kafkaParams: Predef.Map[String, String] = CommonScalaUtils.toMap(properties)
val topics: Predef.Set[String] = "zk_offset".split(",").toSet
//3. Create the Kafka discretized stream from these parameters
val messageStream: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
//4. Iterate over the batches and store the offsets of every non-empty batch
messageStream.foreachRDD((rdd, btime) => {
if (!rdd.isEmpty()) {
val offsetRDD = rdd.asInstanceOf[HasOffsetRanges]
val offsetRanges: Array[OffsetRange] = offsetRDD.offsetRanges
storeOffset(offsetRanges, kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Create the message stream.
* Offsets are kept in ZooKeeper.
* If this is the first read, consume from the beginning and create the directory in ZooKeeper.
* Otherwise read the stored offsets from the ZooKeeper directory and start from them.
*/
def createMsg(ssc:StreamingContext, kafkaParams:Predef.Map[String, String], topics:Predef.Set[String]):InputDStream[(String, String)] = {
//1. Read the offsets (TopicAndPartition -> offset)
val offsets:Predef.Map[TopicAndPartition, Long] = getOffset(topics, kafkaParams("group.id"))
//2. No stored offsets means this is the first read
var msgDStream: InputDStream[(String, String)] = null
if(offsets.isEmpty) {
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
}else { // otherwise this is not the first read: start from the stored offsets
val messagehandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, offsets, messagehandler)
}
msgDStream
}
/**
* Get the offset of every partition of the given topics.
*/
def getOffset(topics:Set[String], groupId:String):Predef.Map[TopicAndPartition, Long] = {
//0. Create a mutable map
val map = mutable.Map[TopicAndPartition, Long]()
//1. Iterate over the topics
topics.foreach(topic => {
//2. Check in ZooKeeper whether the directory for this topic/group exists; if not this is the first run, so create it
val path = s"/${topic}/$groupId"
checkExist(path)
//3. List the children of this path; each child directory holds the offset of one partition
JavaConversions.asScalaBuffer(client.getChildren.forPath(path)).foreach(partition => {
//4. Read the offset
val fullPath = s"$path/$partition"
val offset = new String(client.getData.forPath(fullPath)).toLong
val tap:TopicAndPartition = new TopicAndPartition(topic, partition.toInt)
map.put(tap, offset)
})
})
map.toMap
}
/**
* Check whether the path exists; if it does not, create it.
*/
def checkExist(path:String):Unit = {
//1. Check whether the path exists
val stat: Stat = client.checkExists().forPath(path)
if (stat == null) { // 2. it does not exist
//3. Create the path (including any missing parents)
client.create().creatingParentsIfNeeded().forPath(path)
}
}
/**
* Store the offsets.
*/
def storeOffset(offsetRanges: Array[OffsetRange], group_id:String) = {
for(offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val fromOffset = offsetRange.fromOffset
val untilOffset = offsetRange.untilOffset
val path = s"/$topic/$group_id/$partition"
checkExist(path) // make sure the partition directory exists
client.setData().forPath(path, untilOffset.toString.getBytes())
// the offset is stored; log the range that was processed
println(s"topic:${topic}\tpartition:${partition}\tstart:${fromOffset}\tend:${untilOffset}")
}
}
}
2.8.2 Saving offsets in Redis
package cn.qphone.spark.utils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
public class JedisUtils {
private static final String DEFAULT_HOST = "192.168.49.111";
private static final int DEFAULT_PORT = 6379;
private static JedisPool jedisPool = new JedisPool(new JedisPoolConfig(), DEFAULT_HOST, DEFAULT_PORT);
public static JedisPool getDefaultPool() {
return jedisPool;
}
public static JedisPool getPool(String host, int port) {
jedisPool = new JedisPool(new JedisPoolConfig(), host, port);
return jedisPool;
}
public static Jedis getJedisWithDefaultPool() {
Jedis jedis = jedisPool.getResource();
jedis.auth("123456");
return jedis;
}
public static Jedis getJedisWithPool(JedisPool pool) {
if (pool != null) return pool.getResource();
return null;
}
public static void returnJedisWithDefaultPool(Jedis jedis) {
if(jedis != null) jedisPool.returnResource(jedis);
}
public static void returnJedisWithPool(Jedis jedis, JedisPool pool) {
if (jedis != null && pool != null) pool.returnResource(jedis);
}
}
package cn.qphone.spark.streaming.day2
import java.util
import java.util.Properties
import cn.qphone.spark.utils.{CommonScalaUtils, JedisUtils, LoggerTrait, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.{JavaConversions, Set, mutable}
object Demo3_Offset_Redis extends LoggerTrait{
def main(args: Array[String]): Unit = {
//Part 1: prepare the basic parameters: StreamingContext, kafkaParams, topics
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo3_Offset_Redis")
//2. Prepare the parameters
val properties = new Properties()
properties.load(Demo3_Offset_Redis.getClass.getClassLoader.getResourceAsStream("kafka.properties"))
val kafkaParams: Predef.Map[String, String] = CommonScalaUtils.toMap(properties)
val topics: Predef.Set[String] = "redis_offset".split(",").toSet
//Part 2: create the discretized stream, handling both the "no stored offset" and the "stored offset" cases
//3. Create the Kafka discretized stream from these parameters
val messageStream: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
//4. Iterate over the batches and store the offsets of every non-empty batch
messageStream.foreachRDD((rdd, btime) => {
if (!rdd.isEmpty()) {
val offsetRDD = rdd.asInstanceOf[HasOffsetRanges]
val offsetRanges: Array[OffsetRange] = offsetRDD.offsetRanges
storeOffset(offsetRanges, kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Create the message stream.
* Offsets are kept in Redis.
* If this is the first read, consume from the beginning and create the key in Redis.
* Otherwise read the stored offsets from the Redis key and start from them.
*/
def createMsg(ssc:StreamingContext, kafkaParams:Predef.Map[String, String], topics:Predef.Set[String]):InputDStream[(String, String)] = {
//1. Read the offsets (TopicAndPartition -> offset)
val offsets:Predef.Map[TopicAndPartition, Long] = getOffset(topics, kafkaParams("group.id"))
//2. No stored offsets means this is the first read
var msgDStream: InputDStream[(String, String)] = null
if(offsets.isEmpty) {
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
}else { // otherwise this is not the first read: start from the stored offsets
val messagehandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, offsets, messagehandler)
}
msgDStream
}
/**
* Get the offset of every partition of the given topics.
* Redis layout: one hash per topic/group
*   key = topic_groupid, field = partition, value = offset
*   e.g. 127.0.0.1:6379> HSET topic_groupid 0 1
*/
def getOffset(topics:Set[String], groupId:String):Predef.Map[TopicAndPartition, Long] = {
//0. Create a mutable map
val map = mutable.Map[TopicAndPartition, Long]()
//1. Iterate over the topics
topics.foreach(topic => {
//2. Look the hash up in Redis; if it has no fields yet, this is the first run and there is nothing to read
val jedis = JedisUtils.getJedisWithDefaultPool
val key:String = s"${topic}_${groupId}"
val partitionSet: util.Set[String] = jedis.hkeys(key) // the hash fields are the partition ids (hkeys, not keys)
val fields: Predef.Set[String] = JavaConversions.asScalaSet(partitionSet).toSet
//3. Read the offset of every partition field found in the hash
if(!fields.isEmpty) {
fields.foreach(partition => {
val offset: Long = jedis.hget(key, partition).toLong
val top:TopicAndPartition = new TopicAndPartition(topic, partition.toInt)
map.put(top, offset)
})
}
JedisUtils.returnJedisWithDefaultPool(jedis)
})
map.toMap
}
/**
* Store the offsets.
*/
def storeOffset(offsetRanges: Array[OffsetRange], group_id:String) = {
for(offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val fromOffset = offsetRange.fromOffset
val untilOffset = offsetRange.untilOffset
val jedis = JedisUtils.getJedisWithDefaultPool
val key:String = s"${topic}_${group_id}"
jedis.hset(key, partition.toString, untilOffset.toString)
// the offset is stored; log the range that was processed
println(s"topic:${topic}\tpartition:${partition}\tstart:${fromOffset}\tend:${untilOffset}")
JedisUtils.returnJedisWithDefaultPool(jedis)
}
}
}
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.1.0</version>
</dependency>
2.8.3 Saving offsets to HBase
Left as an exercise.
2.9 Idempotence
2.9.1 Introduction
To achieve exactly-once output semantics, the operation that writes the results to the external store must either be idempotent, or save the results and the offsets in a single atomic transaction.
Idempotence (idempotent, idempotence) is a concept from mathematics and computer science, commonly seen in abstract algebra.
In programming, an idempotent operation is one whose effect is the same no matter how many times it is executed. An idempotent function or method can be called repeatedly with the same arguments and always produces the same result; repeated calls do not change the system state any further. For example, a setTrue() function is idempotent: however many times it runs, the outcome is the same. For more complex operations, idempotence is usually guaranteed with a unique transaction id (serial number).
f(f(x)) = f(x)
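A tiny Scala illustration (both functions are toy examples):
// Idempotent: applying it twice gives the same result as applying it once, f(f(x)) == f(x).
def setTrue(flag: Boolean): Boolean = true
// Not idempotent: every extra application changes the result.
def increment(x: Int): Int = x + 1

assert(setTrue(setTrue(false)) == setTrue(false)) // holds
assert(increment(increment(0)) != increment(0))   // 2 != 1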
2.9.2 Test
## Requirement: save the Kafka messages into a MySQL database and save the offsets in ZooKeeper
package cn.qphone.spark.utils
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import org.apache.zookeeper.data.Stat
import scala.collection.{JavaConversions, _}
object KafkaTools {
var client = ZKCuratorUtils.getClient
/**
* Create the message stream.
* Offsets are kept in ZooKeeper.
* If this is the first read, consume from the beginning and create the directory in ZooKeeper.
* Otherwise read the stored offsets from the ZooKeeper directory and start from them.
*/
def createMsg(ssc:StreamingContext, kafkaParams:Predef.Map[String, String], topics:Predef.Set[String]):InputDStream[(String, String)] = {
//1. Read the offsets (TopicAndPartition -> offset)
val offsets:Predef.Map[TopicAndPartition, Long] = getOffset(topics, kafkaParams("group.id"))
//2. No stored offsets means this is the first read
var msgDStream: InputDStream[(String, String)] = null
if(offsets.isEmpty) {
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
}else { // otherwise this is not the first read: start from the stored offsets
val messagehandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
msgDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, offsets, messagehandler)
}
msgDStream
}
/**
* Get the offset of every partition of the given topics.
*/
def getOffset(topics:Set[String], groupId:String):Predef.Map[TopicAndPartition, Long] = {
//0. Create a mutable map
val map = mutable.Map[TopicAndPartition, Long]()
//1. Iterate over the topics
topics.foreach(topic => {
//2. Check in ZooKeeper whether the directory for this topic/group exists; if not this is the first run, so create it
val path = s"/${topic}/$groupId"
checkExist(path)
//3. List the children of this path; each child directory holds the offset of one partition
JavaConversions.asScalaBuffer(client.getChildren.forPath(path)).foreach(partition => {
//4. Read the offset
val fullPath = s"$path/$partition"
val offset = new String(client.getData.forPath(fullPath)).toLong
val tap:TopicAndPartition = new TopicAndPartition(topic, partition.toInt)
map.put(tap, offset)
})
})
map.toMap
}
/**
* Check whether the path exists; if it does not, create it.
*/
def checkExist(path:String):Unit = {
//1. Check whether the path exists
val stat: Stat = client.checkExists().forPath(path)
if (stat == null) { // 2. it does not exist
//3. Create the path (including any missing parents)
client.create().creatingParentsIfNeeded().forPath(path)
}
}
/**
* Store the offsets.
*/
def storeOffset(offsetRanges: Array[OffsetRange], group_id:String) = {
for(offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val fromOffset = offsetRange.fromOffset
val untilOffset = offsetRange.untilOffset
val path = s"/$topic/$group_id/$partition"
checkExist(path) // make sure the partition directory exists
client.setData().forPath(path, untilOffset.toString.getBytes())
// the offset is stored; log the range that was processed
println(s"topic:${topic}\tpartition:${partition}\tstart:${fromOffset}\tend:${untilOffset}")
}
}
}
kafka-console-producer.sh \
--topic mytopic1 \
--broker-list qphone01:9092,qphone02:9092,qphone03:9092
Tip: sample messages to type into the producer (format: name,id), one message per line:
lixi,1
rocklee,2
lee,3
package cn.qphone.spark.streaming.day2
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.util.Properties
import cn.qphone.spark.utils.{CommonScalaUtils, KafkaTools, SparkUtils}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}
/**
* Setup steps:
* 1. Create the test MySQL database:
* create database test;
* 2. Create the table:
* create table myorders(name varchar(20), orderid varchar(100) primary key);
* 3. Create the topic mytopic1:
* kafka-topics.sh --zookeeper qphone01,qphone02,qphone03/kafka --create --topic mytopic1 --partitions 3 --replication-factor 1
* 4. Send data to mytopic1 in the format "string,number", e.g. abc,3
**/
object Demo4_Idempotent {
def main(args: Array[String]): Unit = {
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo4_Idempotent")
//2. Prepare the parameters
val properties = new Properties()
properties.load(Demo2_Offset_ZK.getClass.getClassLoader.getResourceAsStream("kafka.properties"))
val kafkaParams: Predef.Map[String, String] = CommonScalaUtils.toMap(properties)
val topics: Predef.Set[String] = "mytopic1".split(",").toSet
//3. Create the Kafka discretized stream from these parameters
val messageStream: InputDStream[(String, String)] = KafkaTools.createMsg(ssc, kafkaParams, topics)
//4. Iterate over the batches
messageStream.foreachRDD((rdd, btime) => {
if (!rdd.isEmpty()) {
val offsetRDD = rdd.asInstanceOf[HasOffsetRanges]
val offsetRanges: Array[OffsetRange] = offsetRDD.offsetRanges
//4. Save the data: the INSERT ... ON DUPLICATE KEY UPDATE below makes re-processing the same messages idempotent
rdd.map(kv => {
println(kv + "---->")
println(kv._2 + "=====>")
kv._2
}).foreachPartition(partition =>{
val connection: Connection = DriverManager.getConnection("jdbc:mysql://192.168.49.111:3306/test", "root", "123456")
val sql = "insert into myorders(orderid, name) values (?, ?) ON DUPLICATE KEY UPDATE orderid=?"
val statement: PreparedStatement = connection.prepareStatement(sql)
partition.foreach(msg => {
val name:String = msg.split(",")(0)
val id:String = msg.split(",")(1)
statement.setString(1, id)
statement.setString(2, name)
statement.setString(3, id)
statement.execute()
})
connection.close()
})
//5. Save the offsets
KafkaTools.storeOffset(offsetRanges, kafkaParams("group.id"))
}
})
ssc.start()
ssc.awaitTermination()
}
}
2.10 Atomic operations
2.10.1 Setup and DDL
1. Create the test MySQL database:
create database test;
2. Create the topic mytopic1:
kafka-topics.sh --zookeeper qphone01,qphone02,qphone03/kafka --create --topic mytopic1 --partitions 3 --replication-factor 1
3. Create the tables:
create table `test`.mytopic(topic varchar(200), groupid varchar(20), partid int, offset bigint);
create table `test`.mydata(data varchar(200), id int);
Initialize the offset table:
insert into mytopic(topic, groupid, partid, offset) values('mytopic1','hzbigdata2002',0,0);
insert into mytopic(topic, groupid, partid, offset) values('mytopic1','hzbigdata2002',1,0);
insert into mytopic(topic, groupid, partid, offset) values('mytopic1','hzbigdata2002',2,0);
4. Send data to mytopic1 in the format "string,number", e.g. abc,3
5. Add the dependency to the pom file:
<dependency>
    <groupId>org.scalikejdbc</groupId>
    <artifactId>scalikejdbc_2.11</artifactId>
    <version>3.2.0</version>
</dependency>
2.10.2 Code
package cn.qphone.spark.streaming.day3
import java.util.Properties
import cn.qphone.spark.streaming.day2.Demo2_Offset_ZK
import cn.qphone.spark.utils.{CommonScalaUtils, LoggerTrait, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.TaskContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scalikejdbc.{ConnectionPool, DB}
import scalikejdbc._
object Demo1_Atometic extends LoggerTrait{
def main(args: Array[String]): Unit = {
//1. Get the StreamingContext
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo1_Atometic")
//2. Prepare the parameters
val properties = new Properties()
properties.load(Demo2_Offset_ZK.getClass.getClassLoader.getResourceAsStream("kafka.properties"))
val kafkaParams: Predef.Map[String, String] = CommonScalaUtils.toMap(properties)
val topics: Predef.Set[String] = "mytopic1".split(",").toSet
//3. First read the stored partitions and offsets of the topic from MySQL over JDBC
val driver = "com.mysql.jdbc.Driver"
val jdbcUrl = "jdbc:mysql://192.168.49.111:3306/test"
val jdbcUser = "root"
val jdbcPassword = "123456"
val group_id = "hzbigdata2002"
// Register the JDBC driver
Class.forName(driver)
// Configure the connection pool; nothing needs to be returned because scalikejdbc creates the session implicitly
ConnectionPool.singleton(jdbcUrl, jdbcUser, jdbcPassword)
// Query the stored offsets with scalikejdbc
val fromOffsets = DB.readOnly {
implicit session => sql"select topic, partid, offset from mytopic"
.map { r =>
TopicAndPartition(r.string(1), r.int(2)) -> r.long(3)
}.list.apply().toMap
}
//4. Create the discretized stream starting from the stored offsets
val messageHandler = (mmd : MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
//5. Write: store the data and the offsets in one transaction
messages.foreachRDD((rdd, btime) => {
if(!rdd.isEmpty()) {
val offsetRDD = rdd.asInstanceOf[HasOffsetRanges]
val offsetRanges: Array[OffsetRange] = offsetRDD.offsetRanges
rdd.foreachPartition(partition => {
// Get the offset range of this partition
val pOffsetRange: OffsetRange = offsetRanges(TaskContext.getPartitionId())
//6. Open a local transaction: the inserts and the offset update below commit or roll back together
DB.localTx {implicit session => {
partition.foreach(msg => { // (key, value)
//1. Insert the data
val data = msg._2.split(",")(0)
val id =msg._2.split(",")(1)
val dataResult =
sql"""
insert into mydata(data,id) values (${data},${id})
""".execute().apply()
})
// println(1/0) // uncomment to force an exception and verify that the whole transaction rolls back
val offsetResult =
sql"""
update
mytopic
set
offset = ${pOffsetRange.untilOffset}
where
topic = ${pOffsetRange.topic}
and
partid = ${pOffsetRange.partition}
and
groupid = ${group_id }
""".update.apply()
}}
})
}
})
ssc.start()
ssc.awaitTermination()
}
}
2.11 Common Spark Streaming transformation operators
| Function name | Description |
| --- | --- |
| map(func) | Applies func to each element of the DStream and returns a new DStream. |
| flatMap(func) | Like map, but each input item can be mapped to zero or more output items. |
| filter(func) | Returns a new DStream containing only the elements for which func returns true. |
| repartition(numPartitions) | Increases or decreases the number of partitions in the DStream, changing its parallelism. |
| union(otherDStream) | Merges the source DStream with another DStream and returns a new DStream. |
| count() | Counts the elements in each RDD of the source DStream and returns a new DStream of single-element RDDs. |
| reduce(func) | Aggregates the elements of each RDD of the source DStream with func and returns a new DStream of single-element RDDs. |
| countByValue() | For a DStream of elements of type K, returns a new DStream of (K, Long) pairs where the Long is the number of occurrences of each key in each RDD of the source DStream. |
| reduceByKey(func, [numTasks]) | Aggregates the values of each key in the source DStream with func and returns a new DStream of (K, V) pairs. |
| join(otherDStream) | On two DStreams of (K, V) and (K, W) pairs, returns a new DStream of (K, (V, W)) pairs. |
| cogroup(otherDStream) | On two DStreams of (K, V) and (K, W) pairs, returns a new DStream of (K, (Iterable[V], Iterable[W])) tuples. |
| transform(func) | Applies an arbitrary RDD-to-RDD function to every RDD of the DStream (see 2.11.1). |
| updateStateByKey(func) | Maintains per-key state across batches by applying func to the new values and the previous state (see 2.11.2). |
| window | Windowed computations over a sliding window of batches (see 2.11.3). |
2.11.1 transform
transform gives you access to the underlying RDDs so you can transform them directly.
transform is a transformation operator.
All of the DStream operations listed above are DStream-to-DStream operations. A DStream is essentially a sequence of RDDs, so RDD-to-RDD operations are clearly needed as well, and for this the official API provides the transform operator.
Its most classic uses are joining a DStream with an RDD, and repartitioning a DStream (reducing the number of partitions with coalesce).
In other words, transform is mainly used to implement operations that the official API does not provide out of the box.
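For instance, a minimal sketch of the repartitioning use case (here lines stands for any DStream[String] and the target partition count is arbitrary); the blacklist join use case is the full example below:
import org.apache.spark.streaming.dstream.DStream

// Coalesce every batch RDD of the stream down to 2 partitions without a shuffle.
val coalesced: DStream[String] = lines.transform(rdd => rdd.coalesce(2))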
package cn.qphone.spark.streaming.day3
import cn.qphone.spark.utils.{LoggerTrait, SparkUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
object Demo2_BlackList extends LoggerTrait{
def main(args: Array[String]): Unit = {
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo2_BlackList")
//Blacklist RDD
/**
* Sample input lines (fields separated by ##):
* 110.52.250.126##2016-05-30 17:38:20##GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1##200##1292
*
* 110.52.250.127##2016-05-30 17:38:20##GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1##200##1292
*/
val blacklistRDD:RDD[(String, Boolean)] = ssc.sparkContext.parallelize(List(
("27.19.74.143", true),
("110.52.250.126", true)
))
//Attach the external data stream
val lines:DStream[String] = ssc.socketTextStream("192.168.49.111", 9999)
val ip2OtherDStream:DStream[(String, String)] = lines.map(line => {
val index = line.indexOf("##")
val ip = line.substring(0, index)
val other = line.substring(index + 2)
(ip, other)
})
val filteredDStream: DStream[(String, String)] = ip2OtherDStream.transform(rdd => {
val join = rdd.leftOuterJoin(blacklistRDD)
join.filter {case (ip, (left, right)) => {
!right.isDefined // keep only records with no match on the right side, i.e. IPs not in the blacklist
}}.map{case (ip, (left, right)) => {
(ip, left)
}}
})
filteredDStream.print()
ssc.start()
ssc.awaitTermination()
}
}
2.11.2 updateStateByKey
updateStateByKey(func) updates each key based on the key's previous state and its new values, returning a DStream of the new states.
In plain terms: it keeps a running, global state per key up to the current batch.
From this it is clear that the operation needs two inputs per key: the key's previous state and the key's new values from the current batch. The historical data (the previous state) has to be persisted to disk; it should not live only in memory.
Also note that a key may or may not already have a previous state.
package cn.qphone.spark.streaming.day3
import cn.qphone.spark.utils.{LoggerTrait, SparkUtils}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
object Demo3_WordCount extends LoggerTrait{
def main(args: Array[String]): Unit = {
val ssc: StreamingContext = SparkUtils.getDefaultStreamingContext("Demo3_WordCount")
val lines:DStream[String] = ssc.socketTextStream("192.168.49.111", 9999)
ssc.checkpoint("file:/d:/data/chk") // required: updateStateByKey needs a checkpoint directory to persist the state
val retDStream:DStream[(String, Int)] =
lines.flatMap(_.split("\\s+")).map((_, 1)).updateStateByKey(updateFunc _, 1)
retDStream.print
ssc.start()
ssc.awaitTermination()
}
/**
*
* @param seq the new values of this key in the current batch
* @param option the previous state (running total) of this key, if any
*/
def updateFunc(seq:Seq[Int], option:Option[Int]):Option[Int] = {
println(s"new values : ${seq.mkString(",")}")
println(s"previous state : ${option.getOrElse("empty")}")
Option(seq.sum + option.getOrElse(0))
}
}
2.11.3 The window operators
The window operations are Spark Streaming's sliding-window support: they let you run a computation over the data that falls inside a sliding window. The RDDs that fall into the window are aggregated and processed together, and the result becomes an RDD of the windowed DStream. In the usual illustration (figure omitted), the window covers the last 3 seconds of data (3 RDDs) and the computation slides every 2 seconds, so 2 seconds later the most recent 3 seconds are processed again. Every window operation therefore takes two parameters, the window length and the slide interval, and both must be integer multiples of the batch interval.
package cn.qphone.spark.streaming.day3
import cn.qphone.spark.utils.{LoggerTrait, SparkUtils}
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream
object Demo4_Window extends LoggerTrait{
def main(args: Array[String]): Unit = {
//1. Entry point: use a 2-second batch interval so that the window (6s) and slide (4s) below are exact multiples of it
val batchInterval = 2
val ssc = SparkUtils.getStreamingContext("Demo4_Window", "local[*]", batchInterval)
//2. Read the data
val lines:DStream[String] = ssc.socketTextStream("qphone01", 9999)
//3. Split into (word, 1) pairs
val pairs:DStream[(String, Int)] = lines.flatMap(_.split("\\s+")).map((_, 1))
//4. Windowed reduceByKey: window length = 3 batches, slide = 2 batches
val ret = pairs.reduceByKeyAndWindow(_+_,
windowDuration = Seconds(batchInterval * 3),
slideDuration = Seconds(batchInterval * 2)
)
ret.print()
ssc.start()
ssc.awaitTermination()
}
}
2.12 Integrating Spark SQL with Spark Streaming
package cn.qphone.spark.streaming.day3
import cn.qphone.spark.utils.SparkUtils
import org.apache.spark.streaming.dstream.DStream
object Demo5_sparksql_streaming {
def main(args: Array[String]): Unit = {
//1. Entry points
//1.1 Spark SQL
val spark = SparkUtils.getDefaultSession("Demo5_sparksql_streaming")
//1.2 Spark Streaming, built on the same SparkContext (see the getStreamingContext(sc, interval) overload in SparkUtils)
val ssc = SparkUtils.getStreamingContext(spark.sparkContext, 2)
ssc.checkpoint("file:///d:/data/out") // required by updateStateByKey
//2. Read the data; each line looks like: 001 mi mobile
val lines:DStream[String] = ssc.socketTextStream("qphone01", 9999)
//3. Split and filter the data
val pairs:DStream[(String, Int)] = lines.map(line => {
val fields = line.split("\\s+")
if(fields == null || fields.length != 3) {
("", -1)
}else {
val brand = fields(1)
val category = fields(2)
(s"${category}_${brand}", 1)
}
}).filter(t => t._2 != -1)
//e.g. (mobile_mi,1)(mobile_mi,1)(mobile_huawei,1)
//4. Aggregate: keep a running count per category_brand key
val usb:DStream[(String, Int)] = pairs.updateStateByKey(updateFunc)
//e.g. (mobile_mi,2)(mobile_huawei,1)
//5. Process each batch
usb.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
//5.1 Reshape the data and turn it into a DataFrame
import spark.implicits._
val df = rdd.map { case (cb, cnt) => {
val category = cb.substring(0, cb.indexOf("_"))
val brand = cb.substring(cb.indexOf("_") + 1)
(category, brand, cnt)
}}.toDF("category", "brand", "sales")
//5.2 Register a temporary view and query it: top 3 brands per category by sales
df.createOrReplaceTempView("tmp_category_brand_sales")
val sql =
"""
|select
|t.category,
|t.brand,
|t.sales,
|t.rank
|from
|(
|select
|category,
|brand,
|sales,
|row_number() over(partition by category order by sales desc) rank
|from
|tmp_category_brand_sales
|) t
|where
|rank < 4
|""".stripMargin
spark.sql(sql).show()
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* seq : the new values of this key in the current batch
* option : the previously accumulated state of this key, if any
*/
def updateFunc(seq:Seq[Int], option:Option[Int]):Option[Int] = Option(seq.sum + option.getOrElse(0))
}