<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
package com.spark.streaming.day01.kafka
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Author
* Date 2020/1/13 16:40
*/
object WordCount1 {
def main(args: Array[String]): Unit = {
// Create the StreamingContext (ssc)
val conf: SparkConf = new SparkConf().setAppName("a").setMaster("local[2]")
val ssc: StreamingContext = new StreamingContext(conf, Seconds(3))
// Kafka parameters
val params: Map[String, String] = Map[String, String](
"group.id" -> "8888",
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")
// Read from Kafka with the direct approach: pass the parameters and the topic set. The type parameters are the key/value types read from Kafka and their decoders.
val sourceStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc,
params,
Set("s0830"))
sourceStream
.map {
case (_, v) => v
}
.flatMap(_.split("\\W+"))
.map((_, 1))
.reduceByKey(_ + _)
.print(1000)
ssc.start()
ssc.awaitTermination()
}
}
package com.spark.streaming.day02.kafka
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WordCount {
// Called to build a brand-new ssc (used when no checkpoint exists yet)
def createSSC(): StreamingContext = {
val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("wd")
val ssc = new StreamingContext(conf,Seconds(3))
// Set the checkpoint directory (the extra step compared with the first approach)
ssc.checkpoint("./ck1")
val params: Map[String, String] = Map[String, String](
"group.id" -> "0830",
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, Set("s0830"))
.flatMap {
case (_, v) => v.split(" ").map((_, 1))
}.reduceByKey(_ + _)
.print()
ssc
}
def main(args: Array[String]): Unit = {
// Recover the ssc from the given checkpoint path; if the directory does not exist, build a new ssc via createSSC
val ssc = StreamingContext.getActiveOrCreate("./ck1", createSSC)
ssc.start()
ssc.awaitTermination()
}
}
package com.spark.streaming.day02.kafka
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WordCount2 {
// Kafka parameters
val params: Map[String, String] = Map[String, String](
"group.id" -> "0830",
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")
val topics = Set("s0830")
// Consumer group id, needed both for reading and for saving offsets
val groupId = "0830"
// Client used to read and commit offsets against the Kafka cluster
private val cluster = new KafkaCluster(params)
// Reads the offsets stored for this consumer group
def readOffsets(): Map[TopicAndPartition, Long] = {
// Get the partitions of the given topics. The result is an Either: Right holds the partitions, Left holds an error (e.g. the topic does not exist).
val topicAndPartitionEither: Either[Err, Set[TopicAndPartition]] = cluster.getPartitions(topics)
var resultMap = Map[TopicAndPartition,Long]()
topicAndPartitionEither match {
case Right(topicAndPartitionSet) =>
// If the topics exist, fetch each partition's committed offset. Right means offsets were found; Left means none exist yet (first time this group consumes).
val topicAndPartitonAndOffsetsEither: Either[Err, Map[TopicAndPartition, Long]] = cluster.getConsumerOffsets(groupId, topicAndPartitionSet)
topicAndPartitonAndOffsetsEither match {
// Not the first time consuming: reuse the stored offsets
case Right(map) =>
resultMap ++= map
// First time consuming: start every partition at offset 0
case _ =>
topicAndPartitionSet.foreach(topicAndPartition => {
resultMap += topicAndPartition -> 0L
})
}
case _ =>
}
resultMap
}
// Saves each partition's offset after a batch has been processed
def saveOffsets(sourceStream: InputDStream[String]): Unit = {
// For every batch, read the offset ranges carried by the direct stream
sourceStream.foreachRDD(rdd => {
// The underlying KafkaRDD exposes its offsets through HasOffsetRanges
val hasOffsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
// The start and end offsets of this batch, per partition
val offsetRanges: Array[OffsetRange] = hasOffsetRanges.offsetRanges
var map = Map[TopicAndPartition,Long]()
offsetRanges.foreach(offsetRange => {
// The topic and partition this range belongs to
val key: TopicAndPartition = offsetRange.topicAndPartition()
// The end offset of that partition for this batch
val value: Long = offsetRange.untilOffset
// Add it to the map
map += key->value
})
// Commit the offsets back to Kafka
cluster.setConsumerOffsets(groupId,map)
})
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("WordCount2").setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(3))
ssc.checkpoint("./ck2")
// Create the DStream with the Kafka direct approach
val sourceStream: InputDStream[String]=KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
ssc,
params,
// Pass in the starting offsets, of type Map[TopicAndPartition, Long]
readOffsets(),
(handler: MessageAndMetadata[String, String]) => handler.message()
)
// The low-level API lets us maintain the offsets manually
sourceStream
.flatMap(_.split("\\W+"))
.map((_, 1))
.reduceByKey(_ + _)
.print(10000)
// Save the offsets (Map[TopicAndPartition, Long]) only after the data has been processed. Note this only guarantees at-least-once semantics: if the job dies after processing but before the offsets are saved, the batch is pulled and processed again. Exactly-once would require committing the results and the offsets in one transaction.
saveOffsets(sourceStream)
ssc.start()
ssc.awaitTermination()
}
}
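The comment above points out that saving the offsets after processing only gives at-least-once semantics; exactly-once needs the results and the offsets to land in a single transaction. A minimal sketch of that idea, intended to replace the print and saveOffsets calls in WordCount2's main. The JDBC URL, credentials and the word_count / stream_offset tables are illustrative assumptions, not part of the original:
import java.sql.DriverManager // would sit with the other imports
sourceStream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // Collect this batch's word counts to the driver (only sensible for small aggregates)
  val counts = rdd.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _).collect()
  // Hypothetical JDBC endpoint and table names, purely for illustration
  val conn = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/streaming", "root", "password")
  try {
    conn.setAutoCommit(false) // results and offsets commit, or roll back, together
    val upsert = conn.prepareStatement(
      "insert into word_count(word, cnt) values(?, ?) on duplicate key update cnt = cnt + ?")
    counts.foreach { case (word, cnt) =>
      upsert.setString(1, word); upsert.setInt(2, cnt); upsert.setInt(3, cnt)
      upsert.executeUpdate()
    }
    val saveOffset = conn.prepareStatement(
      "replace into stream_offset(groupid, topic, `partition`, untilOffset) values(?, ?, ?, ?)")
    offsetRanges.foreach { or =>
      saveOffset.setString(1, groupId); saveOffset.setString(2, or.topic)
      saveOffset.setInt(3, or.partition); saveOffset.setLong(4, or.untilOffset)
      saveOffset.executeUpdate()
    }
    conn.commit()
  } catch {
    case e: Exception => conn.rollback(); throw e
  } finally {
    conn.close()
  }
}
If the batch fails before commit(), both the counts and the offsets roll back, so the replayed batch does not double-count.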
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.4.0</version>
</dependency>
package com.atguigu.realtime.util
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
/**
* Author 8888
* Date 2020/1/15 16:36
*/
object MyKafkaUtil {
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "bigdata",
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (true: java.lang.Boolean)
)
def getKafkaStream(ssc: StreamingContext, topic:String, otherTopics: String*): DStream[String] = {
KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](otherTopics :+ topic, kafkaParams)
).map(record => record.value())
}
}
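A minimal usage sketch of the helper above; the application object RealtimeApp and the topic name "startup_log" are illustrative assumptions, not taken from the original:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import com.atguigu.realtime.util.MyKafkaUtil

object RealtimeApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("RealtimeApp")
    val ssc = new StreamingContext(conf, Seconds(3))
    // Subscribe to a single (illustrative) topic; extra topics could be passed as further arguments
    val sourceStream: DStream[String] = MyKafkaUtil.getKafkaStream(ssc, "startup_log")
    sourceStream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}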
package com.qzpoint.streaming.aa
import java.lang
import java.sql.ResultSet
import com.atguigu.qzpoint.util.{DataSourceUtil, QueryCallback, SqlProxy}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object RegisterStreaming {
private val groupid = "register_group_test"
def main(args: Array[String]): Unit = {
System.setProperty("HADOOP_USER_NAME", "atguigu")
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
// Maximum number of records read per second from each Kafka partition
.set("spark.streaming.kafka.maxRatePerPartition", "100")
.setMaster("local[*]")
// 1. Create the Spark Streaming entry point, StreamingContext; the second argument is the batch interval
// A SparkContext is created internally
val ssc = new StreamingContext(conf, Seconds(3))
// Get the SparkContext from the ssc
val sparkContext: SparkContext = ssc.sparkContext
// Topics to consume (there can be several); needed for the direct connection
val topics = Array("register_topic")
// Kafka parameters, needed for the direct connection
val kafkaMap: Map[String, Object] = Map[String, Object](
// Kafka brokers
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
// Key/value deserializers
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
// If this group consumes for the first time, start from the earliest offset
"auto.offset.reset" -> "earliest",
// If true, this consumer's offsets are committed automatically in the background, so a crash can lose data that was committed but not yet processed
// If false, the Kafka offsets have to be maintained manually
"enable.auto.commit" -> (false: lang.Boolean)
)
// Stateful operations in Spark Streaming need a checkpoint directory; the state is saved there
ssc.checkpoint("/user/atguigu/sparkstreaming/checkpoint")
// Check whether MySQL already stores offsets for this group
val sqlProxy = new SqlProxy()
// Offsets keyed by topic-partition; needed for the direct connection
val offsetMap = new mutable.HashMap[TopicPartition, Long]()
// Get a JDBC connection from the Druid connection pool
val client = DataSourceUtil.getConnection
// Run the query and load its result into offsetMap
try {
// Callback invoked with the query result
sqlProxy.executeQuery(client, "select * from `offset_manager` where groupid=?", Array(groupid), new QueryCallback {
override def process(rs: ResultSet): Unit = {
while (rs.next()) {
val model = new TopicPartition(rs.getString(2), rs.getInt(3))
val offset = rs.getLong(4)
offsetMap.put(model, offset)
}
rs.close() // close the result set
}
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
// Close the statements, prepared statements and the connection
sqlProxy.shutdown(client)
}
// Decide how to start consuming: if MySQL already holds offsets, resume from them; otherwise start fresh (per auto.offset.reset)
val stream: InputDStream[ConsumerRecord[String, String]] = if (offsetMap.isEmpty) {
// No stored offsets: consume according to auto.offset.reset (earliest)
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap))
} else {
// Stored offsets found: resume consuming from them
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap, offsetMap))
}
// After the business logic has finished (the code runs top to bottom), manually commit the offsets to MySQL. stream is the DStream read from Kafka, and its underlying RDDs carry the offset ranges we need.
stream.foreachRDD(rdd => {
val sqlProxy = new SqlProxy()
val client = DataSourceUtil.getConnection
try {
// Cast the RDD to HasOffsetRanges to expose the offset ranges
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// Upsert each range's topic, partition and end offset
for (or <- offsetRanges) {
sqlProxy.executeUpdate(client, "replace into `offset_manager` (groupid,topic,`partition`,untilOffset) values(?,?,?,?)",
Array(groupid, or.topic, or.partition.toString, or.untilOffset))
}
} catch {
case e: Exception => e.printStackTrace()
} finally {
// Close the statements, prepared statements and the connection
sqlProxy.shutdown(client)
}
})
// Start the streaming job
ssc.start()
// Block until the job is terminated
ssc.awaitTermination()
}
}
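For comparison, when the offsets do not have to live in an external store, the 0-10 integration can also commit the processed ranges back to Kafka itself (keeping enable.auto.commit = false). A minimal sketch against the stream created above, with the business logic elided:
stream.foreachRDD { rdd =>
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Asynchronously commit the consumed ranges back to Kafka. This is still at-least-once:
  // a crash between processing and the commit replays the batch.
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}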
There is no difficulty that cannot be overcome, only a heart that fears it. Life shines because hardship and glory arrive together. So do not fear a temporary setback; even if no one applauds, give it your all and persist with grace. Believe this: however steep the mountain, it always leaves a path for the brave. As long as you keep stepping forward, the road will stretch out beneath your feet.