import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
2017/04/05
cdp
Always starts consuming from Kafka's latest offset on every launch,
so data can be lost if the Spark job exits unexpectedly.
*/
object DealFlowBills1 {
def main(args: Array[String]): Unit = {
// Input arguments
val Array(output, topic, kafkaid, group, sec) = args
// Spark setup
val conf = new SparkConf().setAppName("DealFlowBills1")
val ssc = new StreamingContext(conf, Seconds(sec.toInt))
// Kafka parameters
val topics = Array(topic)
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> kafkaid,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// Create the DStream
val lines = KafkaUtils
.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
.map(_.value())
// Each element of the stream is a ConsumerRecord; print the number of records received per batch
lines.count().print()
// Transform the DStream
val result = lines
.filter(_.split(",").length == 21)
.map {
mlines =>
val line = mlines.split(",")
(line(3), s"${line(4)},${line(2)}")
}
.groupByKey()
.map {
case (k, v) =>
val result = v
.flatMap {
fmlines =>
fmlines.split(",").toList.zipWithIndex
}
.groupBy(_._2)
.map {
case (v1, v2) =>
v2.map(_._1)
}
(k, result)
}
// Save the results to HDFS
result.saveAsTextFiles(output + s"/output/" + "010")
ssc.start()
ssc.awaitTermination()
}
}
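For reference, before moving to the ZooKeeper approach below: spark-streaming-kafka-0-10 can also persist progress by committing the consumed offsets back to Kafka itself through its CanCommitOffsets API. The following is only a minimal sketch using the same command-line arguments as DealFlowBills1; the object name and the omitted per-batch processing are placeholders, not part of the original post.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DealFlowBillsCommitSketch {
  def main(args: Array[String]): Unit = {
    val Array(output, topic, kafkaid, group, sec) = args
    val ssc = new StreamingContext(new SparkConf().setAppName("DealFlowBillsCommitSketch"), Seconds(sec.toInt))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> kafkaid,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Keep a handle on the raw direct stream: HasOffsetRanges/CanCommitOffsets are only
    // available on the stream returned by createDirectStream, before any transformation.
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Array(topic), kafkaParams))
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // ... process rdd.map(_.value()) and save results under `output` here ...
      // Once the batch's work is done, commit its offsets back to Kafka under this group.id,
      // so a restart resumes from the last committed position instead of "latest".
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    ssc.start()
    ssc.awaitTermination()
  }
}

The trade-off versus the zk approach below is that the offsets live in Kafka's __consumer_offsets topic rather than in znodes you manage yourself.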
package com.cdp
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext, TaskContext}
/**
2017/04/05
cdp
Manually control the Kafka offsets that Spark consumes,
so that no data is lost when the program is restarted after an unexpected exit.
*/
object DealFlowBills2 {
/** ***************************************************************************************************************
* Instantiate the ZooKeeper helper, used for the zk operations below
*/
val zk = ZkWork
def main(args: Array[String]): Unit = {
/** ***************************************************************************************************************
* Input arguments
*/
val Array(output, topic, broker, group, sec) = args
/** ***************************************************************************************************************
* Standard Spark setup
*/
val conf = new SparkConf().setAppName("DealFlowBills2")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(sec.toInt))
/** ***************************************************************************************************************
* Prepare Kafka parameters
*/
val topics = Array(topic)
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> broker,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
/** ***************************************************************************************************************
* Check whether an offset for this job has already been saved in zk.
* If not, start without an explicit offset and save one after the batch completes.
* The key point is the KafkaUtils.createDirectStream call:
* the default form is KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)), with no offsets argument.
* After digging through the source, it turns out Subscribe also accepts a map of starting offsets.
*/
val stream = if (zk.znodeIsExists(s"${topic}offset")) {
val nor = zk.znodeDataGet(s"${topic}offset")
val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong) // map keyed by (topic, partition), value = offset
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] topic ${nor(0).toString}")
println(s"[ DealFlowBills2 ] Partition ${nor(1).toInt}")
println(s"[ DealFlowBills2 ] offset ${nor(2).toLong}")
println(s"[ DealFlowBills2 ] Kafka offset read from zk ★★★ $newOffset")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
} else {
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] First run: no offset znode found in zk")
println(s"[ DealFlowBills2 ] Creating the offset znode ${topic}offset manually, starting from offset 0")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
zk.znodeCreate(s"${topic}offset", s"$topic,$group,0")
val nor = zk.znodeDataGet(s"${topic}offset")
val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong)
KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
}
/** ***************************************************************************************************************
* The computation
*/
val lines = stream.map(_.value())
val result = lines
.filter(_.split(",").length == 21)
.map {
mlines =>
val line = mlines.split(",")
(line(3), s"${line(4)},${line(2)}")
}
.groupByKey()
.map {
case (k, v) =>
val result = v
.flatMap {
fmlines =>
fmlines.split(",").toList.zipWithIndex
}
.groupBy(_._2)
.map {
case (v1, v2) =>
v2.map(_._1)
}
(k, result)
}
/** ***************************************************************************************************************
* Save the offsets.
* (If a batch fails, the next run resumes from the previous offset; the new offset is not saved.)
* The offset is saved only after the batch has been processed successfully.
*/
stream.foreachRDD {
rdd =>
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.foreachPartition {
iter =>
val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] topic: ${o.topic}")
println(s"[ DealFlowBills2 ] partition: ${o.partition}")
println(s"[ DealFlowBills2 ] fromOffset (start offset): ${o.fromOffset}")
println(s"[ DealFlowBills2 ] untilOffset (end offset): ${o.untilOffset} -- the offset saved for the next run ★★★")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
// Write the offset to ZooKeeper
zk.offsetWork(s"${o.topic}offset", s"${o.topic},${o.partition},${o.untilOffset}")
// Alternatively, write to the local file system:
// val fw = new FileWriter(new File("/home/hadoop1/testjar/test.log"), true)
// fw.write(offsetsRangerStr)
// fw.close()
}
}
/** ***************************************************************************************************************
* Save the final result to HDFS
*/
result.saveAsTextFiles(output + s"/output/" + "010")
/** ***************************************************************************************************************
* Start Spark Streaming
*/
ssc.start()
ssc.awaitTermination()
}
}
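The ZkWork object used above is not included in the post. For completeness, here is a hypothetical sketch of a helper exposing the same method names (znodeIsExists, znodeDataGet, znodeCreate, offsetWork), written against Apache Curator; the connect string, the /DealFlowBills/ znode prefix, and all method bodies are assumptions rather than the original implementation.

package com.cdp

import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry

object ZkWork {
  // Assumed ZooKeeper connect string and znode prefix -- adjust to your environment.
  private val connectString = "localhost:2181"
  private val prefix = "/DealFlowBills/"

  private lazy val client = {
    val c = CuratorFrameworkFactory.newClient(connectString, new ExponentialBackoffRetry(1000, 3))
    c.start()
    c
  }

  private def path(name: String): String = prefix + name

  /** True if an offset znode has already been created for this name. */
  def znodeIsExists(name: String): Boolean =
    client.checkExists().forPath(path(name)) != null

  /** Read the znode payload ("topic,partition,offset") and split it into its fields. */
  def znodeDataGet(name: String): Array[String] =
    new String(client.getData().forPath(path(name)), "UTF-8").split(",")

  /** Create the znode with an initial payload. */
  def znodeCreate(name: String, data: String): Unit =
    client.create().creatingParentsIfNeeded().forPath(path(name), data.getBytes("UTF-8"))

  /** Create or update the znode with the latest offset string. */
  def offsetWork(name: String, data: String): Unit =
    if (znodeIsExists(name)) client.setData().forPath(path(name), data.getBytes("UTF-8"))
    else znodeCreate(name, data)
}

Because offsetWork is called inside foreachPartition, a Scala object like this is initialised once per executor JVM, with the lazy client connecting on first use there.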