Purpose: read Kafka data into HBase.
Steps:
1. Fetch the offsets recorded in ZooKeeper for the topic.
2. Repeatedly read the topic's latest offsets from Kafka and compare them with the offsets stored in ZooKeeper.
3. Prepare a Scala case class that the data will be mapped onto.
4. Convert the extracted data to a DataFrame.
5. Write the DataFrame to HBase through Phoenix.
Note: this post uses ZooKeeper to record the Kafka offsets. For a deeper understanding, please read the post below first:
http://blog.csdn.net/silentwolfyh/article/details/52985171
"Zookeeper Curator framework usage and common commands" (ZookeeperCurator框架应用和常用命令)
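Before diving into the full program, here is a minimal, self-contained Curator sketch of the offset bookkeeping this post relies on: storing and reading a single partition offset under a ZooKeeper path. The connect string and offset value are illustrative only; the real helper (ZookeeperHelper, shown at the end of the post) takes its connect string from WebConfig.
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry

object OffsetPathDemo {
  def main(args: Array[String]): Unit = {
    // Illustrative connect string; replace with your ZooKeeper ensemble
    val client = CuratorFrameworkFactory.builder()
      .connectString("localhost:2181")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("webstatistic_test")
      .build()
    client.start()

    // One znode per topic/partition, e.g. /kafkaOffsets/<topic>/<partition>
    val path = "/kafkaOffsets/donews_website/0"
    if (client.checkExists().forPath(path) == null) {
      client.create().creatingParentsIfNeeded().forPath(path)
    }
    // Offsets are stored as the string form of a Long, matching ZookeeperHelper below
    client.setData().forPath(path, String.valueOf(12345L).getBytes)
    val stored = java.lang.Long.valueOf(new String(client.getData.forPath(path))).toLong
    println(s"offset stored at $path = $stored")
    client.close()
  }
}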
package com.donews.kafka2hbase
import com.donews.utils.{KafkaClusterHelper, ZookeeperHelper,WebLog}
import kafka.common.TopicAndPartition
import kafka.serializer.StringDecoder
import org.apache.commons.cli.{GnuParser, HelpFormatter, Options}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
/**
* Created by yuhui on 16-09-23.
*
* Purpose: read Kafka data into HBase.
* Steps: 1. Fetch the offsets recorded in ZooKeeper for the topic.
*        2. Repeatedly read the topic's latest offsets from Kafka and compare them with the offsets stored in ZooKeeper.
*        3. Prepare a Scala case class that the data will be mapped onto.
*        4. Convert the extracted data to a DataFrame.
*        5. Write the DataFrame to HBase through Phoenix.
* Phoenix DDL for the HBase table: CREATE TABLE WEB_LOG(appkey VARCHAR NOT NULL, day VARCHAR, timestamp VARCHAR NOT NULL, cookie VARCHAR, short_cookie VARCHAR, request_method VARCHAR, status INTEGER, http_referer VARCHAR, http_user_agent VARCHAR, http_x_forwarded_for VARCHAR, http_url VARCHAR, to_target VARCHAR, duration INTEGER, event VARCHAR, is_new INTEGER, page_id VARCHAR, CONSTRAINT pk PRIMARY KEY (appkey, day, timestamp, cookie, short_cookie)) SALT_BUCKETS = 20;
*/
object StreamingHbase {
val LOG = LoggerFactory.getLogger(StreamingHbase.getClass)
def main(args: Array[String]): Unit = {
val options = new Options()
.addOption("l", "local", false, "配置为本地模式运行模式")
.addOption("s", "source", true, "数据源,Kafka的Topic名称")
.addOption("h", "help", false, "打印帮助信息")
val parser = new GnuParser()
val cmdLine = parser.parse(options, args)
val processorName = cmdLine.getArgs.headOption.getOrElse("druid")
val isLocal = cmdLine.hasOption("local")
val source = cmdLine.getOptionValue("source")
if (cmdLine.hasOption("h")) {
val formatter = new HelpFormatter()
formatter.printHelp("StreamingHbase", options)
System.exit(0)
}
val topicsSet = source
.split(",")
.filterNot(_ == null)
.map(_.trim)
.filterNot(_.isEmpty)
.toSet
LOG.info("topicSet:{}===》 "+ topicsSet)
val kafkaParams = Map[String, String](
"metadata.broker.list" -> "slave01:9092,slave02:9092,slave03:9092",
"auto.offset.reset" -> "smallest"
)
val kafkaHelper = new KafkaClusterHelper(kafkaParams)
var conf = new SparkConf
if (isLocal) {
conf = conf.setAppName("StreamingHbase")
.setMaster("local[4]")
.set("spark.local.dir", "/data/tmp/")
}
val sc = new SparkContext(conf)
val blockSize = 1024 * 1024 * 128 // 128MB
sc.hadoopConfiguration.setInt("dfs.blocksize", blockSize)
sc.hadoopConfiguration.setInt("parquet.block.size", blockSize)
val ctx = new SQLContext(sc)
import ctx.implicits._
ctx.setConf("spark.sql.parquet.mergeSchema", "true")
//Once inside the while loop, the program reads the offsets from ZooKeeper every hour and consumes the corresponding Kafka data
while (true) {
var hasMore = false
do {
try {
hasMore = false
//Load the offsets recorded in ZooKeeper; on the first run, fall back to the offsets implied by auto.offset.reset ("smallest" here)
val offsets = ZookeeperHelper.loadOffsets(topicsSet, kafkaHelper.getFromOffsets(kafkaParams, topicsSet))
// LOG.info("offsets.keySet===》"+offsets.toString())
//Get the latest offsets from Kafka
val latestOffsets = KafkaClusterHelper.checkErrors(kafkaHelper.getLatestLeaderOffsets(offsets.keySet))
// LOG.info("latestOffsets:"+latestOffsets.toString())
val offsetRanges = offsets.keys.map { tp =>
val fromOffset = offsets(tp)
val latestOffset = latestOffsets(tp).offset
if (latestOffset - fromOffset > 1024 * 1024) {
hasMore = true
}
// LOG.info("fromOffset:"+fromOffset+"===>latestOffset:"+latestOffset)
val chaOffset = latestOffset - fromOffset
// LOG.info("latestOffset - fromOffset:===>"+chaOffset)
OffsetRange(tp, fromOffset, Math.min(fromOffset + 1024 * 1024, latestOffset)) //cap each batch at 1024 * 1024 messages per partition (roughly 500 MB in the author's setup)
}.toArray
val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](sc, kafkaParams, offsetRanges)
LOG.info("rdd===================》"+rdd.count()+"条记录")
//Parse the JSON payloads and register them as a temporary table
rdd.map { case (k, v) => v }
.map(WebLog.fromJson)
.filter(message => message != null)
.toDF().registerTempTable("kafkaTable")
//Project the fields into the result DataFrame (column names match the Phoenix schema)
val resultDF = ctx.sql("select appkey AS APPKEY ,day AS DAY , timestamp AS TIMESTAMP,cookie AS COOKIE,short_cookie AS SHORT_COOKIE,request_method AS REQUEST_METHOD," +
"status AS STATUS,http_referer AS HTTP_REFERER,http_user_agent AS HTTP_USER_AGENT,http_x_forwarded_for AS HTTP_X_FORWARDED_FOR," +
"http_url AS HTTP_URL,to_target AS TO_TARGET ,duration AS DURATION,event AS EVENT,is_new AS IS_NEW,page_id AS PAGE_ID from kafkaTable")
resultDF.write.mode(SaveMode.Overwrite).options(
Map("table" -> "WEB_LOG", "zkUrl" -> "slave01:2181;slave02:2181;slave03:2181")
).format("org.apache.phoenix.spark").save()
LOG.info("插入===================》"+resultDF.count()+"条记录")
val nextOffsets = offsetRanges.map(x => (TopicAndPartition(x.topic, x.partition), x.untilOffset)).toMap
//Persist the offsets to ZooKeeper; the ZooKeeper path can be deleted to force the data to be re-read, so no data is lost
ZookeeperHelper.storeOffsets(nextOffsets)
LOG.info("nextOffsets:"+nextOffsets.toString())
//Run once per hour
Thread.sleep(1000L * 60 * 60)
} catch {
//On error, log it, back off briefly, and retry
case e: Exception => LOG.error(e.getMessage, e)
LOG.info("sleep on error for 5000ms")
Thread.sleep(1000L * 5)
hasMore = true
}
} while (hasMore)
}
}
}
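As a quick way to verify the write path, phoenix-spark can also load the WEB_LOG table back as a DataFrame using the same table and zkUrl options that StreamingHbase uses for the write. The sketch below is illustrative only and is not part of the original job; it assumes the same cluster addresses as above.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object PhoenixReadCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PhoenixReadCheck").setMaster("local[2]"))
    val ctx = new SQLContext(sc)
    // Same "table" and "zkUrl" options as the write in StreamingHbase
    val df = ctx.read
      .format("org.apache.phoenix.spark")
      .options(Map("table" -> "WEB_LOG", "zkUrl" -> "slave01:2181;slave02:2181;slave03:2181"))
      .load()
    df.printSchema()
    println(s"WEB_LOG currently holds ${df.count()} rows")
    sc.stop()
  }
}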
package com.donews.utils
/**
* Created by yuhui on 16-6-29.
* copy from spark-kafka source
*/
import java.util.Properties
import kafka.api._
import kafka.common.{ErrorMapping, OffsetAndMetadata, OffsetMetadataAndError, TopicAndPartition}
import kafka.consumer.{ConsumerConfig, SimpleConsumer}
import org.apache.spark.SparkException
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.control.NonFatal
/**
* Convenience methods for interacting with a Kafka cluster.
*
* @param kafkaParams Kafka
* configuration parameters.
* Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
* NOT zookeeper servers, specified in host1:port1,host2:port2 form
*/
class KafkaClusterHelper(val kafkaParams: Map[String, String]) extends Serializable {
import KafkaClusterHelper.{Err, LeaderOffset, SimpleConsumerConfig}
// ConsumerConfig isn't serializable
@transient private var _config: SimpleConsumerConfig = null
def config: SimpleConsumerConfig = this.synchronized {
if (_config == null) {
_config = SimpleConsumerConfig(kafkaParams)
}
_config
}
def connect(host: String, port: Int): SimpleConsumer =
new SimpleConsumer(host, port, config.socketTimeoutMs,
config.socketReceiveBufferBytes, config.clientId)
def connectLeader(topic: String, partition: Int): Either[Err, SimpleConsumer] =
findLeader(topic, partition).right.map(hp => connect(hp._1, hp._2))
// Metadata api
// scalastyle:off
// https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-MetadataAPI
// scalastyle:on
def findLeader(topic: String, partition: Int): Either[Err, (String, Int)] = {
val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion,
0, config.clientId, Seq(topic))
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp: TopicMetadataResponse = consumer.send(req)
resp.topicsMetadata.find(_.topic == topic).flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.find(_.partitionId == partition)
}.foreach { pm: PartitionMetadata =>
pm.leader.foreach { leader =>
return Right((leader.host, leader.port))
}
}
}
Left(errs)
}
def findLeaders(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, (String, Int)]] = {
val topics = topicAndPartitions.map(_.topic)
val response = getPartitionMetadata(topics).right
val answer = response.flatMap { tms: Set[TopicMetadata] =>
val leaderMap = tms.flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.flatMap { pm: PartitionMetadata =>
val tp = TopicAndPartition(tm.topic, pm.partitionId)
if (topicAndPartitions(tp)) {
pm.leader.map { l =>
tp -> (l.host -> l.port)
}
} else {
None
}
}
}.toMap
if (leaderMap.keys.size == topicAndPartitions.size) {
Right(leaderMap)
} else {
val missing = topicAndPartitions.diff(leaderMap.keySet)
val err = new Err
err.append(new SparkException(s"Couldn't find leaders for ${missing}"))
Left(err)
}
}
answer
}
def getPartitions(topics: Set[String]): Either[Err, Set[TopicAndPartition]] = {
getPartitionMetadata(topics).right.map { r =>
r.flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.map { pm: PartitionMetadata =>
TopicAndPartition(tm.topic, pm.partitionId)
}
}
}
}
def getPartitionMetadata(topics: Set[String]): Either[Err, Set[TopicMetadata]] = {
val req = TopicMetadataRequest(
TopicMetadataRequest.CurrentVersion, 0, config.clientId, topics.toSeq)
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp: TopicMetadataResponse = consumer.send(req)
val respErrs = resp.topicsMetadata.filter(m => m.errorCode != ErrorMapping.NoError)
if (respErrs.isEmpty) {
return Right(resp.topicsMetadata.toSet)
} else {
respErrs.foreach { m =>
val cause = ErrorMapping.exceptionFor(m.errorCode)
val msg = s"Error getting partition metadata for '${m.topic}'. Does the topic exist?"
errs.append(new SparkException(msg, cause))
}
}
}
Left(errs)
}
//Get the latest leader offsets from Kafka
def getLatestLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
getLeaderOffsets(topicAndPartitions, OffsetRequest.LatestTime)
def getEarliestLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
getLeaderOffsets(topicAndPartitions, OffsetRequest.EarliestTime)
def getLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition],
before: Long
): Either[Err, Map[TopicAndPartition, LeaderOffset]] = {
getLeaderOffsets(topicAndPartitions, before, 1).right.map { r =>
r.map { kv =>
// mapValues isn't serializable, see SI-7005
kv._1 -> kv._2.head
}
}
}
private def flip[K, V](m: Map[K, V]): Map[V, Seq[K]] =
m.groupBy(_._2).map { kv =>
kv._1 -> kv._2.keys.toSeq
}
def getLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition],
before: Long,
maxNumOffsets: Int
): Either[Err, Map[TopicAndPartition, Seq[LeaderOffset]]] = {
findLeaders(topicAndPartitions).right.flatMap { tpToLeader =>
val leaderToTp: Map[(String, Int), Seq[TopicAndPartition]] = flip(tpToLeader)
val leaders = leaderToTp.keys
var result = Map[TopicAndPartition, Seq[LeaderOffset]]()
val errs = new Err
withBrokers(leaders, errs) { consumer =>
val partitionsToGetOffsets: Seq[TopicAndPartition] =
leaderToTp((consumer.host, consumer.port))
val reqMap = partitionsToGetOffsets.map { tp: TopicAndPartition =>
tp -> PartitionOffsetRequestInfo(before, maxNumOffsets)
}.toMap
val req = OffsetRequest(reqMap)
val resp = consumer.getOffsetsBefore(req)
val respMap = resp.partitionErrorAndOffsets
partitionsToGetOffsets.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { por: PartitionOffsetsResponse =>
if (por.error == ErrorMapping.NoError) {
if (por.offsets.nonEmpty) {
result += tp -> por.offsets.map { off =>
LeaderOffset(consumer.host, consumer.port, off)
}
} else {
errs.append(new SparkException(
s"Empty offsets for ${tp}, is ${before} before log beginning?"))
}
} else {
errs.append(ErrorMapping.exceptionFor(por.error))
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs.append(new SparkException(s"Couldn't find leader offsets for ${missing}"))
Left(errs)
}
}
// Consumer offset api
// scalastyle:off
// https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
// scalastyle:on
// this 0 here indicates api version, in this case the original ZK backed api.
private def defaultConsumerApiVersion: Short = 0
/** Requires Kafka >= 0.8.1.1 */
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, Long]] =
getConsumerOffsets(groupId, topicAndPartitions, defaultConsumerApiVersion)
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Long]] = {
getConsumerOffsetMetadata(groupId, topicAndPartitions, consumerApiVersion).right.map { r =>
r.map { kv =>
kv._1 -> kv._2.offset
}
}
}
/** Requires Kafka >= 0.8.1.1 */
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] =
getConsumerOffsetMetadata(groupId, topicAndPartitions, defaultConsumerApiVersion)
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] = {
var result = Map[TopicAndPartition, OffsetMetadataAndError]()
val req = OffsetFetchRequest(groupId, topicAndPartitions.toSeq, consumerApiVersion)
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp = consumer.fetchOffsets(req)
val respMap = resp.requestInfo
val needed = topicAndPartitions.diff(result.keySet)
needed.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { ome: OffsetMetadataAndError =>
if (ome.error == ErrorMapping.NoError) {
result += tp -> ome
} else {
errs.append(ErrorMapping.exceptionFor(ome.error))
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs.append(new SparkException(s"Couldn't find consumer offsets for ${missing}"))
Left(errs)
}
/** Requires Kafka >= 0.8.1.1 */
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long]
): Either[Err, Map[TopicAndPartition, Short]] =
setConsumerOffsets(groupId, offsets, defaultConsumerApiVersion)
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Short]] = {
val meta = offsets.map { kv =>
kv._1 -> OffsetAndMetadata(kv._2)
}
setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
}
/** Requires Kafka >= 0.8.1.1 */
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata]
): Either[Err, Map[TopicAndPartition, Short]] =
setConsumerOffsetMetadata(groupId, metadata, defaultConsumerApiVersion)
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Short]] = {
var result = Map[TopicAndPartition, Short]()
val req = OffsetCommitRequest(groupId, metadata, consumerApiVersion)
val errs = new Err
val topicAndPartitions = metadata.keySet
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp = consumer.commitOffsets(req)
val respMap = resp.commitStatus
val needed = topicAndPartitions.diff(result.keySet)
needed.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { err: Short =>
if (err == ErrorMapping.NoError) {
result += tp -> err
} else {
errs.append(ErrorMapping.exceptionFor(err))
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs.append(new SparkException(s"Couldn't set offsets for ${missing}"))
Left(errs)
}
// Try a call against potentially multiple brokers, accumulating errors
private def withBrokers(brokers: Iterable[(String, Int)], errs: Err)
(fn: SimpleConsumer => Any): Unit = {
brokers.foreach { hp =>
var consumer: SimpleConsumer = null
try {
consumer = connect(hp._1, hp._2)
fn(consumer)
} catch {
case NonFatal(e) =>
errs.append(e)
} finally {
if (consumer != null) {
consumer.close()
}
}
}
}
//Get the initial offsets for the topics, according to auto.offset.reset
def getFromOffsets(kafkaParams: Map[String, String], topics: Set[String]): Map[TopicAndPartition, Long] = {
val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
val result = for {
topicPartitions <- getPartitions(topics).right
leaderOffsets <- (if (reset == Some("smallest")) {
getEarliestLeaderOffsets(topicPartitions)
} else {
getLatestLeaderOffsets(topicPartitions)
}).right
} yield {
leaderOffsets.map { case (tp, lo) =>
(tp, lo.offset)
}
}
KafkaClusterHelper.checkErrors(result)
}
}
object KafkaClusterHelper {
type Err = ArrayBuffer[Throwable]
/** If the result is right, return it, otherwise throw SparkException */
def checkErrors[T](result: Either[Err, T]): T = {
result.fold(
errs => throw new SparkException(errs.mkString("\n")),
ok => ok
)
}
case class LeaderOffset(host: String, port: Int, offset: Long)
/**
* High-level kafka consumers connect to ZK. ConsumerConfig assumes this use case.
* Simple consumers connect directly to brokers, but need many of the same configs.
* This subclass won't warn about missing ZK params, or presence of broker params.
*/
class SimpleConsumerConfig private(brokers: String, originalProps: Properties)
extends ConsumerConfig(originalProps) {
val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp =>
val hpa = hp.split(":")
if (hpa.size == 1) {
throw new SparkException(s"Broker not in the correct format of : [$brokers]")
}
(hpa(0), hpa(1).toInt)
}
}
object SimpleConsumerConfig {
/**
* Make a consumer config without requiring group.id or zookeeper.connect,
* since communicating with brokers also needs common settings such as timeout
*/
def apply(kafkaParams: Map[String, String]): SimpleConsumerConfig = {
// These keys are from other pre-existing kafka configs for specifying brokers, accept either
val brokers = kafkaParams.get("metadata.broker.list")
.orElse(kafkaParams.get("bootstrap.servers"))
.getOrElse(throw new SparkException(
"Must specify metadata.broker.list or bootstrap.servers"))
val props = new Properties()
kafkaParams.foreach { case (key, value) =>
// prevent warnings on parameters ConsumerConfig doesn't know about
if (key != "metadata.broker.list" && key != "bootstrap.servers") {
props.put(key, value)
}
}
Seq("zookeeper.connect", "group.id").foreach { s =>
if (!props.containsKey(s)) {
props.setProperty(s, "")
}
}
new SimpleConsumerConfig(brokers, props)
}
}
def main(args: Array[String]) {
val kafkaParams = Map[String, String](
"metadata.broker.list" -> "spark-slave03:9092,spark-slave04:9092,spark-slave05:9092",
"auto.offset.reset" -> "smallest"
)
val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
println(reset == Some("smallest"))
}
}
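A small usage sketch of KafkaClusterHelper, mirroring how StreamingHbase calls it: getFromOffsets picks the starting positions according to auto.offset.reset, getLatestLeaderOffsets returns the head of each partition, and checkErrors unwraps the Either. The broker list and topic name below are illustrative.
import kafka.common.TopicAndPartition

object KafkaOffsetProbe {
  def main(args: Array[String]): Unit = {
    // Illustrative broker list and topic; replace with your own
    val kafkaParams = Map(
      "metadata.broker.list" -> "slave01:9092,slave02:9092,slave03:9092",
      "auto.offset.reset" -> "smallest"
    )
    val helper = new KafkaClusterHelper(kafkaParams)
    val topics = Set("donews_website")
    // Earliest offsets, because auto.offset.reset is "smallest"
    val from: Map[TopicAndPartition, Long] = helper.getFromOffsets(kafkaParams, topics)
    // Latest leader offsets for the same partitions; checkErrors throws if any lookup failed
    val latest = KafkaClusterHelper.checkErrors(helper.getLatestLeaderOffsets(from.keySet))
    from.foreach { case (tp, fromOffset) =>
      val backlog = latest(tp).offset - fromOffset
      println(s"$tp from=$fromOffset latest=${latest(tp).offset} backlog=$backlog")
    }
  }
}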
package com.donews.utils
import java.util.Properties
import com.typesafe.config.{Config, ConfigFactory}
object WebConfig {
private val conf: Config = ConfigFactory.load()
lazy val KAFKA_BROKER_LIST = conf.getString("kafka.metadata.broker.list")
lazy val ZOOKEEPER_CONNECT = conf.getString("zookeeper.connect")
lazy val HBASE_URL = conf.getString("Hbase.url")
lazy val ZOOKEEPER_OFFSET = conf.getString("zookeeper.offset")
/***
* Convert a Typesafe Config into java.util.Properties
* @param config
* @return
*/
private def propsFromConfig(config: Config): Properties = {
import scala.collection.JavaConversions._
val props = new Properties()
val map: Map[String, Object] = config.entrySet().map({ entry =>
entry.getKey -> entry.getValue.unwrapped()
})(collection.breakOut)
props.putAll(map)
props
}
}
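WebConfig reads its values from a Typesafe Config application.conf on the classpath. The file itself is not shown in the post, so the keys below are inferred from the getString calls above and the values are placeholders.
// Hypothetical src/main/resources/application.conf matching the keys used above:
//
//   kafka.metadata.broker.list = "slave01:9092,slave02:9092,slave03:9092"
//   zookeeper.connect          = "slave01:2181,slave02:2181,slave03:2181"
//   Hbase.url                  = "slave01:2181;slave02:2181;slave03:2181"
//   zookeeper.offset           = "/kafkaOffsets"
//
object WebConfigDemo {
  def main(args: Array[String]): Unit = {
    println(WebConfig.KAFKA_BROKER_LIST)
    println(WebConfig.ZOOKEEPER_CONNECT)
  }
}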
package com.donews.utils
import com.fasterxml.jackson.annotation.JsonIgnoreProperties
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.slf4j.LoggerFactory
/**
* Created by yuhui on 16-9-20
* Parses a Kafka record into a WebLog object
*/
@JsonIgnoreProperties(ignoreUnknown = true)
case class WebLog(
var appkey: String,
timestamp: String,
cookie: String,
short_cookie: String,
request_method: String,
status: java.lang.Integer,
http_referer: String,
http_user_agent: String,
http_x_forwarded_for: String,
http_url: String,
to_target: String,
duration: java.lang.Integer,
event: String,
is_new: java.lang.Integer,
page_id: String,
var day: String
)
/***
* Parses and filters each record consumed from Kafka, returning a WebLog object (or null for invalid records)
*/
object WebLog {
private val LOG = LoggerFactory.getLogger(WebLog.getClass)
val mapper = new ObjectMapper()
mapper.registerModule(DefaultScalaModule)
def fromJson(value: String): WebLog = {
if (value == null) return null
try {
val obj = mapper.readValue(value, classOf[WebLog])
// Default appkey when it is missing or empty
if (obj.appkey == null || "".equals(obj.appkey)) { obj.appkey = "donews_website" }
// Drop records without a timestamp or cookie
if (obj.timestamp == null || "".equals(obj.timestamp) || obj.cookie == null || "".equals(obj.cookie)) { return null }
obj
} catch {
case e: Exception =>
LOG.info(e.getMessage, e)
null
}
}
}
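A short sketch of how WebLog.fromJson behaves on a few inputs. The JSON lines are made up for illustration; thanks to @JsonIgnoreProperties(ignoreUnknown = true), extra fields in the real logs are simply skipped, and fields absent from the JSON are left null.
object WebLogParseDemo {
  def main(args: Array[String]): Unit = {
    // Illustrative record: an empty appkey falls back to "donews_website"
    val ok = """{"appkey":"","timestamp":"1474612345","cookie":"abc123","short_cookie":"ab","status":200}"""
    val parsed = WebLog.fromJson(ok)
    println(parsed.appkey)                          // donews_website
    // Records missing timestamp or cookie are dropped
    println(WebLog.fromJson("""{"appkey":"x"}""")) // null
    // Malformed JSON is logged and also returns null
    println(WebLog.fromJson("not json"))           // null
  }
}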
package com.donews.utils
import kafka.common.TopicAndPartition
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.slf4j.LoggerFactory
import scala.collection.JavaConversions._
/**
* Created by yuhui on 16-6-8.
*/
object ZookeeperHelper {
val LOG = LoggerFactory.getLogger(ZookeeperHelper.getClass)
val client = {
val client = CuratorFrameworkFactory
.builder
.connectString(WebConfig.ZOOKEEPER_CONNECT)
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.namespace("webstatistic_test")
.build()
client.start()
client
}
//Create the ZooKeeper path if it does not already exist
def ensurePathExists(path: String): Unit = {
if (client.checkExists().forPath(path) == null) {
client.create().creatingParentsIfNeeded().forPath(path)
}
}
//Load stored offsets from ZooKeeper, overlaying them on the given defaults
def loadOffsets(topicSet: Set[String], defaultOffset: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
val kafkaOffsetPath = s"/kafkaOffsets"
ensurePathExists(kafkaOffsetPath)
val offsets = for {
//t iterates over the topic child nodes under the kafkaOffsets path
t <- client.getChildren.forPath(kafkaOffsetPath)
if topicSet.contains(t)
//p iterates over the partition child nodes, e.g. under .../kafkaOffsets/donews_website
p <- client.getChildren.forPath(s"$kafkaOffsetPath/$t")
} yield {
//Read the offset stored for this topic/partition
val data = client.getData.forPath(s"$kafkaOffsetPath/$t/$p")
//Convert the raw bytes to a Long
val offset = java.lang.Long.valueOf(new String(data)).toLong
(TopicAndPartition(t, Integer.parseInt(p)), offset)
}
defaultOffset ++ offsets.toMap
}
//Store offsets in ZooKeeper, one znode per topic/partition
def storeOffsets(offsets: Map[TopicAndPartition, Long]): Unit = {
val kafkaOffsetPath = s"/kafkaOffsets"
if (client.checkExists().forPath(kafkaOffsetPath) == null) {
client.create().creatingParentsIfNeeded().forPath(kafkaOffsetPath)
}
for ((tp, offset) <- offsets) {
val data = String.valueOf(offset).getBytes
val path = s"$kafkaOffsetPath/${tp.topic}/${tp.partition}"
ensurePathExists(path)
client.setData().forPath(path, data)
}
}
}
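Finally, a round-trip sketch of the two ZookeeperHelper methods used by StreamingHbase: storeOffsets writes one znode per topic/partition, and loadOffsets merges whatever is found under /kafkaOffsets on top of a default map. The offsets below are illustrative.
import kafka.common.TopicAndPartition

object OffsetRoundTripDemo {
  def main(args: Array[String]): Unit = {
    // Illustrative offsets for two partitions of the donews_website topic
    val offsets = Map(
      TopicAndPartition("donews_website", 0) -> 1000L,
      TopicAndPartition("donews_website", 1) -> 2048L
    )
    ZookeeperHelper.storeOffsets(offsets)
    // Reload, using an empty map as the default for partitions not yet stored
    val reloaded = ZookeeperHelper.loadOffsets(Set("donews_website"), Map.empty[TopicAndPartition, Long])
    reloaded.foreach { case (tp, offset) => println(s"$tp -> $offset") }
  }
}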