(1-4)原文地址:JasonLee’blog
(5-6)原文地址:Lu_Xiao_Yue
import kafka.PropertiesScalaUtils
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import spark.wordcount.kafkaStreams
/**
* sparkstreaming写入hbase新的API;
*/
object sparkToHbase {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Hbase Test")
val scc = new StreamingContext(conf, Seconds(1))
val sc = scc.sparkContext
val tablename = "test"
val mode = args(0).toString
val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
val topic = PropertiesScalaUtils.loadProperties("topic_combine",mode)
val broker = PropertiesScalaUtils.loadProperties("broker",mode)
sc.hadoopConfiguration.set("hbase.zookeeper.quorum",zk_hbase)
sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", zk_port)
sc.hadoopConfiguration.set("hbase.master", hbase_master)
sc.hadoopConfiguration.set("hbase.defaults.for.version.skip", "true")
sc.hadoopConfiguration.set("hhbase.rootdir", hbase_rootdir)
sc.hadoopConfiguration.set("zookeeper.znode.parent", zookeeper_znode_parent)
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
val job = Job.getInstance(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Result])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val topicSet = Set(topic)
val kafkaParams = Map[String, Object](
"auto.offset.reset" -> "latest", //latest;earliest
"value.deserializer" -> classOf[StringDeserializer] //key,value的反序列化;
, "key.deserializer" -> classOf[StringDeserializer]
, "bootstrap.servers" -> broker
, "group.id" -> "jason_test"
, "enable.auto.commit" -> (true: java.lang.Boolean)
)
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
try {
kafkaStreams.foreachRDD(rdd => {
if(!rdd.isEmpty()){
val save_rdd = rdd.map(x => {
val json = JSONObject.fromObject(x.value())
val put = new Put(Bytes.toBytes(json.get("rowkey").toString))
insert_hb(json,put)
(new ImmutableBytesWritable, put)
})
save_rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
}
})
}catch {
case e:Exception => println("报错了")
}
scc.start()
scc.awaitTermination()
}
def insert_hb(json: JSONObject, onePut: Put): Unit = {
val keys = json.keySet
val iterator_redis = keys.iterator
while (iterator_redis.hasNext) {
val hb_col = iterator_redis.next().toString
val col_value = json.get(hb_col).toString
onePut.addColumn(Bytes.toBytes("f1"), Bytes.toBytes(hb_col), Bytes.toBytes(col_value))
}
}
}
手动维护kafka的offest
为了实现exactly-once的语义,我采用自己保存offest的方法,offest可以保存在zk,kafka,mysql,hbase,redis中自己根据情况而定,我选择把offest保存到redis中.创建Dstream之前,先判断是否消费过,如果没有消费就从头开始,如果已经消费过了,就从上次保存的offest处开始消费。
spark版本2.2.0,scala版本2.11.8,kafka版本0.10.1,hbase版本1.1.2.。
package test
import java.util
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.SparkStreamingKafka.{dbIndex, kafkaStreams}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import redis.RedisPool
object sparkstreaming {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
val conf = new SparkConf().setAppName("sparkstreaming")
conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.streaming.concurrentJobs", "10")
conf.set("spark.streaming.kafka.maxRetries", "50")
val scc = new StreamingContext(conf, Seconds(5))
val topic = PropertiesScalaUtils.loadProperties("topic")
val topicSet: Set[String] = Set(topic)
val kafkaParams = Map[String, Object](
"auto.offset.reset" -> "latest",
"value.deserializer" -> classOf[StringDeserializer]
, "key.deserializer" -> classOf[StringDeserializer]
, "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
, "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
, "enable.auto.commit" -> (false: java.lang.Boolean)
)
val maxTotal = 200
val maxIdle = 100
val minIdle = 10
val testOnBorrow = false
val testOnReturn = false
val maxWaitMillis = 500
RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
val jedis = RedisPool.getPool.getResource
jedis.select(dbIndex)
val keys: util.Set[String] = jedis.keys(topic + "*")
if (keys.size() == 0) {
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
} else {
val fromOffsets: Map[TopicPartition, Long] = RedisKeysListUtils.getKeysList(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, topic)
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, fromOffsets))
}
RedisPool.getPool.returnResource(jedis)
kafkaStreams.foreachRDD(rdd=>{
if (!rdd.isEmpty()) {
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.foreachPartition(partiton=>{
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", PropertiesScalaUtils.loadProperties("zk_hbase")) //zk的地址;
conf.set("hbase.zookeeper.property.clientPort", PropertiesScalaUtils.loadProperties("zk_port"))
conf.set("hbase.master", PropertiesScalaUtils.loadProperties("hbase_master"))
conf.set("hbase.defaults.for.version.skip", "true")
conf.set("hhbase.rootdir", PropertiesScalaUtils.loadProperties("hbase_rootdir"))
conf.set("zookeeper.znode.parent", PropertiesScalaUtils.loadProperties("zookeeper_znode_parent"))
myTable = new HTable(conf, TableName.valueOf(PropertiesScalaUtils.loadProperties("hbase_table")))
myTable.setAutoFlush(false, false) //关闭自动提交
myTable.setWriteBufferSize(3 * 1024 * 1024)
partiton.foreach(pair=>{
//自己的处理逻辑;
})
myTable.flushCommits()
myTable.close()
offsetRanges.foreach { offsetRange =>
println("partition : " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
})
}
})
scc.start()
scc.awaitTermination()
}
}
offest存到redis里了,当然也可以保存在zk,kafka,mysql,hbase中都可以。
用了3个topic,每个topic5个partition。
package spark
import java.io.File
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive.{dbIndex}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import redis.RedisPool
object moreTopic {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
val warehouseLocation = new File("hdfs://cluster/hive/warehouse").getAbsolutePath
val spark = SparkSession.builder().appName("Spark Jason").config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate()
spark.conf.set("spark.streaming.concurrentJobs", 10)
spark.conf.set("spark.streaming.kafka.maxRetries", 50)
spark.conf.set("spark.streaming.stopGracefullyOnShutdown",true)
spark.conf.set("spark.streaming.backpressure.enabled",true)
spark.conf.set("spark.streaming.backpressure.initialRate",5000)
spark.conf.set("spark.streaming.kafka.maxRatePerPartition", 3000)
@transient
val sc = spark.sparkContext
val scc = new StreamingContext(sc, Seconds(2))
val kafkaParams = Map[String, Object](
"auto.offset.reset" -> "latest",
"value.deserializer" -> classOf[StringDeserializer]
, "key.deserializer" -> classOf[StringDeserializer]
, "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
, "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
, "enable.auto.commit" -> (false: java.lang.Boolean)
)
var stream: InputDStream[ConsumerRecord[String, String]] = null
val topics = Array("jason_20180519", "jason_0606","jason_test")
val maxTotal = 200
val maxIdle = 100
val minIdle = 10
val testOnBorrow = false
val testOnReturn = false
val maxWaitMillis = 5000
RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
val jedis = RedisPool.getPool.getResource
jedis.select(dbIndex)
val keys = jedis.keys(topics(0) + "*")
val keys_2 = jedis.keys(topics(1) +"*")
val keys_3 = jedis.keys(topics(2) +"*")
if(keys.size() == 0 && keys_2.size() == 0 && keys_3.size() == 0){
println("第一次启动,从头开始消费数据-----------------------------------------------------------")
stream = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
)
}else{
println("不是第一次启动,从上次的offest开始消费数据-----------------------------------------------")
stream = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, RedisKeysListUtils.getRedisOffest(topics,jedis)))
}
jedis.close()
stream.foreachRDD(rdd=>{
if (!rdd.isEmpty()) {
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.foreachPartition(partition=>{
val o = offsetRanges(TaskContext.get.partitionId)
println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
val jedis_jason = RedisPool.getPool.getResource
jedis_jason.select(dbIndex)
partition.foreach(pair=>{
//自己的计算逻辑;
})
offsetRanges.foreach { offsetRange =>
println("partition : " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
}
jedis_jason.close()
})
}
})
scc.start()
scc.awaitTermination()
}
}
package hbase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.log4j.{Level, Logger}
import util.PropertiesScalaUtils
import org.apache.spark.sql.SparkSession
/**
* spark读取hbase的数据
*/
object ReadHbase {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
val spark = SparkSession
.builder
.appName("read hbase")
.master("local[4]")
.config("spark.some.config.option", "config-value")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.getOrCreate
val sc = spark.sparkContext
val mode = "local"
val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
val hbase_table = PropertiesScalaUtils.loadProperties("hbase_table",mode)
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", zk_hbase)
conf.set("hbase.zookeeper.property.clientPort", zk_port)
conf.set("hbase.master", hbase_master)
conf.set("hbase.defaults.for.version.skip", "true")
conf.set("hhbase.rootdir", hbase_rootdir)
conf.set("zookeeper.znode.parent", zookeeper_znode_parent)
conf.set(TableInputFormat.INPUT_TABLE, "cbd:prod_base")
val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
hbaseRDD.sample(false,0.1)foreachPartition(fp=>{
fp.foreach(f=>{
val rowkey = Bytes.toString(f._2.getRow)
val InsertTime = Bytes.toString(f._2.getValue("cf1".getBytes,"InsertTime".getBytes))
val VipPrice = Bytes.toString(f._2.getValue("cf1".getBytes,"VipPrice".getBytes))
println(s"Row key:$rowkey InsertTime:$InsertTime VipPrice:$VipPrice")
})
})
println("元素的个数:"+hbaseRDD.count())
sc.stop()
}
}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object HbaseRdd1 {
def main(args: Array[String]): Unit = {
val conf = HBaseConfiguration.create()
val sc = new SparkContext(new SparkConf())
//设置查询的表名
conf.set(TableInputFormat.INPUT_TABLE, "student")
// hbase
val stuRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
val count = stuRDD.count()
println("Students RDD Count:" + count)
stuRDD.cache()
//遍历输出
stuRDD.foreach({ case (_,result) =>
val key = Bytes.toString(result.getRow)
val name = Bytes.toString(result.getValue("info".getBytes,"name".getBytes))
val gender = Bytes.toString(result.getValue("info".getBytes,"gender".getBytes))
val age = Bytes.toString(result.getValue("info".getBytes,"age".getBytes))
println("Row key:"+key+" Name:"+name+" Gender:"+gender+" Age:"+age)
})
}
}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.spark._
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
object SparkWriteHBase {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")
val sc = new SparkContext(sparkConf)
val tablename = "student"
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
val job = new Job(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Result])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26","4,Guanhua,M,27")) //构建两行记录
val rdd = indataRDD.map(_.split(',')).map{arr=>{
val put = new Put(Bytes.toBytes(arr(0))) //行健的值
put.add(Bytes.toBytes("info"),Bytes.toBytes("name"),Bytes.toBytes(arr(1))) //info:name列的值
put.add(Bytes.toBytes("info"),Bytes.toBytes("gender"),Bytes.toBytes(arr(2))) //info:gender列的值
put.add(Bytes.toBytes("info"),Bytes.toBytes("age"),Bytes.toBytes(arr(3).toInt)) //info:age列的值
(new ImmutableBytesWritable, put)
}}
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
}
}