Spark-读写HBase

原文链接: https://me.csdn.net/xianpanjia4616

Spark-读写HBase

  • 1.sparkstreaming实时写入Hbase(saveAsNewAPIHadoopDataset方法)
  • 2.sparkstreaming整合kafka实现exactly-once语义
  • 3.sparkstreaming同时消费多个topic的数据实现exactly-once的语义
  • 4.spark读取hbase数据(newAPIHadoopRDD方式)
  • 5.spark读取hbase中的数据
  • 6.spark将数据写入到hbase

(1-4)原文地址:JasonLee’s blog
(5-6)原文地址:Lu_Xiao_Yue

1.sparkstreaming实时写入Hbase(saveAsNewAPIHadoopDataset方法)

import kafka.PropertiesScalaUtils
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import spark.wordcount.kafkaStreams
 
/**
  * Spark Streaming job that reads JSON records from Kafka and writes them to HBase
  * through the new Hadoop output API (`saveAsNewAPIHadoopDataset`).
  *
  * Each record value is parsed as a JSON object. Its `rowkey` field becomes the HBase
  * row key and every field of the object is stored as one column of column family `f1`
  * in table `test`.
  */
object sparkToHbase {
  /**
    * Entry point.
    *
    * @param args `args(0)` is the mode handed to every `PropertiesScalaUtils.loadProperties` lookup
    * @throws IllegalArgumentException if no mode argument is supplied
    */
  def main(args: Array[String]): Unit = {
    import scala.util.control.NonFatal

    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    require(args.nonEmpty, "usage: sparkToHbase <mode>")
    val mode = args(0)

    val conf = new SparkConf().setAppName("Hbase Test")
    val scc = new StreamingContext(conf, Seconds(1))
    val sc = scc.sparkContext
    val tablename = "test"

    val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase", mode)
    val zk_port = PropertiesScalaUtils.loadProperties("zk_port", mode)
    val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master", mode)
    val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir", mode)
    val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent", mode)
    val topic = PropertiesScalaUtils.loadProperties("topic_combine", mode)
    val broker = PropertiesScalaUtils.loadProperties("broker", mode)

    val hadoopConf = sc.hadoopConfiguration
    hadoopConf.set("hbase.zookeeper.quorum", zk_hbase)
    hadoopConf.set("hbase.zookeeper.property.clientPort", zk_port)
    hadoopConf.set("hbase.master", hbase_master)
    hadoopConf.set("hbase.defaults.for.version.skip", "true")
    // The key was previously misspelled as "hhbase.rootdir", so the setting was never applied.
    hadoopConf.set("hbase.rootdir", hbase_rootdir)
    hadoopConf.set("zookeeper.znode.parent", zookeeper_znode_parent)
    hadoopConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(hadoopConf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    // The values written below are Put mutations, not Result objects.
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    val topicSet = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest", // either "latest" or "earliest"
      "value.deserializer" -> classOf[StringDeserializer], // key/value deserializers
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> broker,
      "group.id" -> "jason_test",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )

    // A local val replaces the assignment to a var imported from another object.
    val stream = KafkaUtils.createDirectStream[String, String](
      scc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))

    stream.foreachRDD { rdd =>
      // foreachRDD only registers this function; it runs once per batch after start().
      // The error handling therefore has to live inside it to see batch failures.
      try {
        if (!rdd.isEmpty()) {
          val puts = rdd.map { record =>
            val json = JSONObject.fromObject(record.value())
            val put = new Put(Bytes.toBytes(json.get("rowkey").toString))
            insert_hb(json, put)
            (new ImmutableBytesWritable, put)
          }
          puts.saveAsNewAPIHadoopDataset(job.getConfiguration)
        }
      } catch {
        case NonFatal(e) =>
          // Best effort: keep the stream alive, but do not hide what went wrong.
          println("报错了")
          e.printStackTrace()
      }
    }

    scc.start()
    scc.awaitTermination()
  }

  /**
    * Copies every field of `json` into `onePut` as a column of column family `f1`.
    *
    * The column qualifier is the JSON key and the cell value is the string form of the
    * JSON value.
    *
    * @param json   parsed record whose fields are written
    * @param onePut mutation that receives one column per JSON field
    */
  def insert_hb(json: JSONObject, onePut: Put): Unit = {
    val family = Bytes.toBytes("f1")
    val fieldNames = json.keySet.iterator
    while (fieldNames.hasNext) {
      val column = fieldNames.next().toString
      val value = json.get(column).toString
      onePut.addColumn(family, Bytes.toBytes(column), Bytes.toBytes(value))
    }
  }
}

2.sparkstreaming整合kafka实现exactly-once语义

手动维护kafka的offset
为了实现exactly-once的语义,我采用自己保存offset的方法。offset可以保存在zk、kafka、mysql、hbase、redis中,根据自己的情况而定,我选择把offset保存到redis中。创建DStream之前,先判断是否消费过:如果没有消费过就从头开始,如果已经消费过了,就从上次保存的offset处开始消费。

spark版本2.2.0,scala版本2.11.8,kafka版本0.10.1,hbase版本1.1.2。

package test
 
import java.util
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.SparkStreamingKafka.{dbIndex, kafkaStreams}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import redis.RedisPool
 
/**
  * Spark Streaming job that consumes one Kafka topic, writes each batch to HBase and
  * keeps the consumed Kafka offsets in Redis instead of committing them to Kafka.
  *
  * On start-up the job looks for Redis keys prefixed with the topic name. If none exist
  * it subscribes without explicit offsets; otherwise it resumes from the offsets returned
  * by `RedisKeysListUtils.getKeysList`. After every non-empty batch the `untilOffset` of
  * each partition is written to Redis under the key `<topic>_<partition>`.
  */
object sparkstreaming {
  /**
    * Entry point.
    *
    * @param args unused
    */
  def main(args: Array[String]): Unit = {
    // These names are used below but are missing from the snippet's import list.
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.HTable
    import org.apache.spark.streaming.kafka010.HasOffsetRanges

    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)

    val conf = new SparkConf().setAppName("sparkstreaming")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.streaming.concurrentJobs", "10")
    conf.set("spark.streaming.kafka.maxRetries", "50")
    val scc = new StreamingContext(conf, Seconds(5))

    val topic = PropertiesScalaUtils.loadProperties("topic")
    val topicSet: Set[String] = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer],
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker"),
      "group.id" -> PropertiesScalaUtils.loadProperties("groupId"),
      // Offsets are tracked in Redis, so Kafka auto-commit stays off.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Connection settings are read once here; the plain strings are captured by the closures below.
    val redisHost = PropertiesScalaUtils.loadProperties("redisHost")
    val redisPort = PropertiesScalaUtils.loadProperties("redisPort").toInt
    val zkQuorum = PropertiesScalaUtils.loadProperties("zk_hbase")
    val zkPort = PropertiesScalaUtils.loadProperties("zk_port")
    val hbaseMaster = PropertiesScalaUtils.loadProperties("hbase_master")
    val hbaseRootDir = PropertiesScalaUtils.loadProperties("hbase_rootdir")
    val znodeParent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent")
    val hbaseTable = PropertiesScalaUtils.loadProperties("hbase_table")

    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 500
    RedisPool.makePool(redisHost, redisPort, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)

    // Build the stream as the value of an expression; the connection goes back to the
    // pool even when the Redis lookup throws.
    val jedis = RedisPool.getPool.getResource
    val stream =
      try {
        jedis.select(dbIndex)
        if (jedis.keys(topic + "*").isEmpty) {
          // No stored offsets: auto.offset.reset decides where consumption starts.
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
        } else {
          val fromOffsets: Map[TopicPartition, Long] =
            RedisKeysListUtils.getKeysList(redisHost, redisPort, topic)
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, fromOffsets))
        }
      } finally {
        jedis.close()
      }

    stream.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

        rdd.foreachPartition { partition =>
          val hbaseConf = HBaseConfiguration.create()
          hbaseConf.set("hbase.zookeeper.quorum", zkQuorum) // ZooKeeper address
          hbaseConf.set("hbase.zookeeper.property.clientPort", zkPort)
          hbaseConf.set("hbase.master", hbaseMaster)
          hbaseConf.set("hbase.defaults.for.version.skip", "true")
          // The key was previously misspelled as "hhbase.rootdir".
          hbaseConf.set("hbase.rootdir", hbaseRootDir)
          hbaseConf.set("zookeeper.znode.parent", znodeParent)

          val table = new HTable(hbaseConf, TableName.valueOf(hbaseTable))
          try {
            table.setAutoFlush(false, false) // turn off auto flush
            table.setWriteBufferSize(3 * 1024 * 1024)
            partition.foreach { _ =>
              // Application-specific processing goes here.
            }
            table.flushCommits()
          } finally {
            table.close()
          }
        }

        // foreachPartition has returned, so every partition of this batch was written.
        // Only now are the offsets persisted, once, from the driver.
        val offsetJedis = RedisPool.getPool.getResource
        try {
          offsetJedis.select(dbIndex)
          offsetRanges.foreach { offsetRange =>
            println("partition : " + offsetRange.partition + " fromOffset:  " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
            val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
            offsetJedis.set(topic_partition_key_new, offsetRange.untilOffset + "")
          }
        } finally {
          offsetJedis.close()
        }
      }
    }

    scc.start()
    scc.awaitTermination()
  }
}
 

3.sparkstreaming同时消费多个topic的数据实现exactly-once的语义

offset存到redis里了,当然也可以保存在zk、kafka、mysql、hbase中。

用了3个topic,每个topic5个partition。

package spark
 
import java.io.File
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive.{dbIndex}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import redis.RedisPool
 
/**
  * Spark Streaming job that consumes three Kafka topics in one direct stream and keeps
  * the consumed offsets in Redis under keys of the form `<topic>_<partition>`.
  *
  * On start-up the job looks for Redis keys prefixed with each topic name. If none exist
  * it subscribes without explicit offsets; otherwise it resumes from the offsets returned
  * by `RedisKeysListUtils.getRedisOffest`.
  */
object moreTopic {
  /**
    * Entry point.
    *
    * @param args unused
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)

    // Passed as a URI string: java.io.File would turn it into a local path under the
    // working directory.
    val warehouseLocation = "hdfs://cluster/hive/warehouse"

    // Streaming settings go on the builder so they are part of the SparkConf that the
    // StreamingContext reads; spark.conf.set after getOrCreate only changes the
    // session's runtime configuration.
    val spark = SparkSession.builder()
      .appName("Spark Jason")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .config("spark.streaming.concurrentJobs", "10")
      .config("spark.streaming.kafka.maxRetries", "50")
      .config("spark.streaming.stopGracefullyOnShutdown", "true")
      .config("spark.streaming.backpressure.enabled", "true")
      .config("spark.streaming.backpressure.initialRate", "5000")
      .config("spark.streaming.kafka.maxRatePerPartition", "3000")
      .enableHiveSupport()
      .getOrCreate()

    val sc = spark.sparkContext
    val scc = new StreamingContext(sc, Seconds(2))
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer],
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker"),
      "group.id" -> PropertiesScalaUtils.loadProperties("groupId"),
      // Offsets are tracked in Redis, so Kafka auto-commit stays off.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("jason_20180519", "jason_0606", "jason_test")

    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 5000
    RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)

    // Build the stream as the value of an expression instead of assigning to a null var;
    // the connection goes back to the pool even when the lookup throws.
    val jedis = RedisPool.getPool.getResource
    val stream: InputDStream[ConsumerRecord[String, String]] =
      try {
        jedis.select(dbIndex)
        val noStoredOffsets = topics.forall(t => jedis.keys(t + "*").isEmpty)
        if (noStoredOffsets) {
          // NOTE(review): the message says "from the beginning", but with
          // auto.offset.reset=latest a new consumer group starts at the latest offsets.
          println("第一次启动,从头开始消费数据-----------------------------------------------------------")
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
          )
        } else {
          println("不是第一次启动,从上次的offest开始消费数据-----------------------------------------------")
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, RedisKeysListUtils.getRedisOffest(topics, jedis)))
        }
      } finally {
        jedis.close()
      }

    stream.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.foreachPartition { partition =>
          // The offset range is looked up by this task's partition id.
          val own = offsetRanges(TaskContext.get.partitionId)
          println(s"${own.topic} ${own.partition} ${own.fromOffset} ${own.untilOffset}")

          val jedis_jason = RedisPool.getPool.getResource
          try {
            jedis_jason.select(dbIndex)
            partition.foreach { _ =>
              // Application-specific processing goes here.
            }
            // Persist only this task's own offset, and only after its records were
            // processed. Writing every range from every task would advance offsets of
            // partitions whose task may still fail.
            println("partition : " + own.partition + " fromOffset:  " + own.fromOffset + " untilOffset: " + own.untilOffset)
            val topic_partition_key_new = own.topic + "_" + own.partition
            jedis_jason.set(topic_partition_key_new, own.untilOffset + "")
          } finally {
            jedis_jason.close()
          }
        }
      }
    }

    scc.start()
    scc.awaitTermination()
  }
}

4.spark读取hbase数据(newAPIHadoopRDD方式)

package hbase
 
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.log4j.{Level, Logger}
import util.PropertiesScalaUtils
import org.apache.spark.sql.SparkSession
 
/**
  * Reads an HBase table into an RDD with `newAPIHadoopRDD` and `TableInputFormat`,
  * prints `cf1:InsertTime` and `cf1:VipPrice` for a 10% sample of the rows and then
  * prints the total row count.
  */
object ReadHbase {
  /**
    * Entry point.
    *
    * @param args unused; all settings are loaded for the fixed mode `"local"`
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val spark = SparkSession
      .builder
      .appName("read hbase")
      .master("local[4]")
      .config("spark.some.config.option", "config-value")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate
    val sc = spark.sparkContext

    try {
      val mode = "local"
      val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase", mode)
      val zk_port = PropertiesScalaUtils.loadProperties("zk_port", mode)
      val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master", mode)
      val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir", mode)
      val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent", mode)
      val hbase_table = PropertiesScalaUtils.loadProperties("hbase_table", mode)

      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", zk_hbase)
      conf.set("hbase.zookeeper.property.clientPort", zk_port)
      conf.set("hbase.master", hbase_master)
      conf.set("hbase.defaults.for.version.skip", "true")
      // The key was previously misspelled as "hhbase.rootdir".
      conf.set("hbase.rootdir", hbase_rootdir)
      conf.set("zookeeper.znode.parent", zookeeper_znode_parent)
      // Use the configured table; it used to be loaded and then ignored in favour of
      // the hard-coded name "cbd:prod_base".
      conf.set(TableInputFormat.INPUT_TABLE, hbase_table)

      val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      // Bytes.toBytes encodes as UTF-8; String.getBytes depends on the platform charset.
      val family = Bytes.toBytes("cf1")
      val insertTimeCol = Bytes.toBytes("InsertTime")
      val vipPriceCol = Bytes.toBytes("VipPrice")

      hbaseRDD.sample(false, 0.1).foreachPartition { rows =>
        rows.foreach { case (_, result) =>
          val rowkey = Bytes.toString(result.getRow)
          val InsertTime = Bytes.toString(result.getValue(family, insertTimeCol))
          val VipPrice = Bytes.toString(result.getValue(family, vipPriceCol))
          println(s"Row key:$rowkey InsertTime:$InsertTime VipPrice:$VipPrice")
        }
      }
      println("元素的个数:" + hbaseRDD.count())
    } finally {
      // Stop the context even when configuration loading or the scan fails.
      sc.stop()
    }
  }
}

5.spark读取hbase中的数据

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Reads the HBase table `student` into an RDD, prints the number of rows and then
  * prints the row key plus `info:name`, `info:gender` and `info:age` of every row.
  */
object HbaseRdd1 {

  /**
    * Entry point.
    *
    * @param args unused
    */
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val sc = new SparkContext(new SparkConf())
    try {
      // Name of the table to scan.
      conf.set(TableInputFormat.INPUT_TABLE, "student")

      val stuRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      val family = Bytes.toBytes("info")

      // Decode to plain strings first and cache that RDD BEFORE the first action.
      // cache() used to be called after count(), so the table was scanned twice and the
      // cached data was never reused. Spark's newAPIHadoopRDD documentation also advises
      // copying Hadoop Writables with a map before caching them.
      val students = stuRDD.map { case (_, result) =>
        (
          Bytes.toString(result.getRow),
          Bytes.toString(result.getValue(family, Bytes.toBytes("name"))),
          Bytes.toString(result.getValue(family, Bytes.toBytes("gender"))),
          Bytes.toString(result.getValue(family, Bytes.toBytes("age")))
        )
      }.cache()

      val count = students.count()
      println("Students RDD Count:" + count)

      // Print every row.
      students.foreach { case (key, name, gender, age) =>
        println("Row key:" + key + " Name:" + name + " Gender:" + gender + " Age:" + age)
      }
    } finally {
      // Stop the context even when the scan fails.
      sc.stop()
    }
  }
}

6.spark将数据写入到hbase

import org.apache.hadoop.hbase.HBaseConfiguration  
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat  
import org.apache.spark._  
import org.apache.hadoop.mapreduce.Job  
import org.apache.hadoop.hbase.io.ImmutableBytesWritable  
import org.apache.hadoop.hbase.client.Result  
import org.apache.hadoop.hbase.client.Put  
import org.apache.hadoop.hbase.util.Bytes  

/**
  * Writes two sample rows to the HBase table `student` with
  * `saveAsNewAPIHadoopDataset` and `TableOutputFormat`.
  *
  * Each input line has the form `id,name,gender,age`. `id` becomes the row key and the
  * other fields are stored as strings in column family `info`.
  */
object SparkWriteHBase {

  /**
    * Entry point.
    *
    * @param args unused
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")
    val sc = new SparkContext(sparkConf)
    try {
      val tablename = "student"
      sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

      // Job.getInstance replaces the deprecated Job constructor.
      val job = Job.getInstance(sc.hadoopConfiguration)
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      // The values written below are Put mutations, not Result objects.
      job.setOutputValueClass(classOf[Put])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

      val family = Bytes.toBytes("info")
      val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27")) // two sample rows

      val rdd = indataRDD.map(_.split(',')).map { arr =>
        val put = new Put(Bytes.toBytes(arr(0))) // row key
        // addColumn replaces the deprecated Put.add(family, qualifier, value).
        put.addColumn(family, Bytes.toBytes("name"), Bytes.toBytes(arr(1)))   // info:name
        put.addColumn(family, Bytes.toBytes("gender"), Bytes.toBytes(arr(2))) // info:gender
        // Stored as a string like the other columns. The reader example decodes info:age
        // with Bytes.toString, which cannot decode a 4-byte binary int.
        put.addColumn(family, Bytes.toBytes("age"), Bytes.toBytes(arr(3)))    // info:age
        (new ImmutableBytesWritable, put)
      }
      rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
    } finally {
      // Stop the context even when the write fails.
      sc.stop()
    }
  }
}

你可能感兴趣的:(大数据,➹➹➹⑤Spark,➹➹➹②HBase)