package com.manulife.mbps.behavior.data.stream
import java.util.Properties
import com.cloudera.com.amazonaws.util.json.JSONObject
import com.manulife.mbps.behavior.common.utils.{IpUtil, ValidateUtil}
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.{ZkClient, ZkConnection}
import org.I0Itec.zkclient.serialize.ZkSerializer
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.{Assign, Subscribe}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by nickliu on 7/5/2017.
 */
object SparkStream_kafka {
  def main(args: Array[String]): Unit = {
    /** The topic and group names below match the ZooKeeper offset path noted further down;
      * the ZooKeeper and broker addresses are placeholders -- replace them for your environment. */
    val topic = "ICPRecommend"
    val topics = Set(topic)
    val group = "example"
    val zkHosts = "localhost:2181"
    val brokers = "localhost:9092"

    val sparkConf = new SparkConf().setAppName("Test-SparkDemo-kafka").setMaster("local[3]")
    /** Limit the number of messages consumed per second from each partition of the topic */
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    /** kafka config */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "zookeeper.set.acl" -> "false",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
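    /**
     * Offsets are committed to ZooKeeper manually (see foreachRDD below), so the consumer's
     * automatic commit is disabled and "auto.offset.reset" only applies when no saved offset
     * can be recovered.
     */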
    /** Read the offsets previously saved in ZooKeeper */
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    /** consumerOffsetDir: the topic's offset path in ZooKeeper, here /consumers/example/offsets/ICPRecommend */
    val consumerOffsetDir = topicDirs.consumerOffsetDir

    /** zookeeper connection */
    val zkClient = new ZkClient(zkHosts, 10000, 10000, new ZkSerializer {
      override def serialize(data: scala.Any): Array[Byte] = data.asInstanceOf[String].getBytes("UTF-8")

      override def deserialize(bytes: Array[Byte]): AnyRef = if (bytes == null) null else new String(bytes, "UTF-8")
    })
    val zkUtils = new ZkUtils(zkClient, new ZkConnection(zkHosts), false)
    val children = zkClient.countChildren(consumerOffsetDir)

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()
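    /**
     * If this group already has offset nodes in ZooKeeper, resume from those offsets by assigning
     * the partitions explicitly; otherwise fall back to a plain subscription on the topic.
     */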
    if (children > 0) {
      /** get partition leader begin */
      val topicList = List(topic)
      /** Fetch metadata for this topic, e.g. broker and partition distribution */
      val req = new TopicMetadataRequest(topicList, 0)
      /** low level api interface */
      val getLeaderConsumer = new SimpleConsumer("master", 9092, 10000, 10000, "OffsetLookup")
      val res = getLeaderConsumer.send(req)
      val topicMetaOption = res.topicsMetadata.headOption
      /** Map each partition id to the host of its leader broker */
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
        case None =>
          Map[Int, String]()
      }
      /** get partition leader end */

      /** For each partition, read the saved offset and clamp it to the earliest offset Kafka still retains */
      for (i <- 0 until children) {
        val zkPartitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/${i}")
        val tp = new TopicPartition(topic, i)
        val tap = new TopicAndPartition(topic, i)
        /** additional begin */
        val requestMin = OffsetRequest(Map(tap -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
        val getMinOffsetConsumer = new SimpleConsumer(partitions(i), 9092, 10000, 10000, "getMinOffset")
        val kafkaCurMinOffset = getMinOffsetConsumer.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tap).offsets
        var nextOffset = zkPartitionOffset.toLong
        if (kafkaCurMinOffset.length > 0 && nextOffset < kafkaCurMinOffset.head) {
          /** The offset saved in ZooKeeper is older than the earliest offset Kafka still retains
            * (those messages have been deleted), so start from the earliest available offset instead */
          nextOffset = kafkaCurMinOffset.head
        }
        /** additional end */
        fromOffsets += (tp -> nextOffset)
      }
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
      )
    } else {
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )
    }
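
    /**
     * For every batch, write the batch's starting offsets to ZooKeeper before processing its records.
     * On restart the job resumes from the start of the last batch it saw, so records may be
     * reprocessed (at-least-once delivery); downstream handling should tolerate duplicates.
     */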
    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      /** Store each partition id and the batch's starting offset in ZooKeeper */
      for (offset <- offsetRanges) {
        val zkPath = s"${topicDirs.consumerOffsetDir}/${offset.partition}"
        zkUtils.updatePersistentPath(zkPath, offset.fromOffset.toString)
      }
      rdd.foreachPartition(
        messages => {
          while (messages.hasNext) {
            val msg = messages.next()
            println("message: " + msg)
            sendData(msg)
          }
        })
    }

    ssc.start()
    ssc.awaitTermination()
  }
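
  /**
   * Parses one consumed record as JSON, normalizes the ip field and looks up its geolocation
   * (state/territory), then republishes the enriched event to another Kafka topic via the
   * old Scala producer API.
   */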
  def sendData(value: ConsumerRecord[String, String]): Unit = {
    val events = value.value()
    val js = new JSONObject(events)
    val channel = js.getString("channel")
    var ip = js.getString("ip")
    val user_id = js.getString("user_id")
    val user_id_md5 = js.getString("user_id_md5")
    val plan_code = js.getString("plan_code")
    val oper_type = js.getString("oper_type")
    val category = js.getString("category")
    val time = js.getString("time")
    val mid = js.getString("mid")

    /** Normalize the ip and resolve it to a geolocation; fall back to "-" when the lookup fails */
    ip = ValidateUtil.ipCheck(ip)
    val ipObject = IpUtil.ipToBean(IpUtil.ipToLong(ip))
    var stat: String = null
    var territory: String = null
    if (ipObject == null) {
      stat = "-"
      territory = "-"
    } else {
      stat = ipObject.getState
      territory = ipObject.getTerritory
    }

    /** Output topic and broker list are placeholders -- replace them for your environment */
    val topic1 = "output-topic"
    val brokers = "localhost:9092"
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    /** Build the enriched event and publish it once; the producer is closed before returning */
    val event = new JSONObject()
    event.put("user_id", user_id)
      .put("channel", channel)
      .put("ip", ip)
      .put("user_id_md5", user_id_md5)
      .put("plan_code", plan_code)
      .put("oper_type", oper_type)
      .put("category", category)
      .put("time", time)
      .put("stat", stat)
      .put("territory", territory)
      .put("mid", mid)
    producer.send(new KeyedMessage[String, String](topic1, event.toString))
    println("Message sent: " + event)
    producer.close()
  }
}