Spark对Hbase 的封装 connector

Spark 读写 HBase 的传统方式,是使用常用的 TableInputFormat 和 TableOutputFormat;本文使用的 SparkOnHBase 则基于 Cloudera 在 GitHub 上开源的方案,依赖如下:


<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-spark</artifactId>
    <version>1.2.0-cdh5.12.1</version>
</dependency>

另外,hortonworks-spark 也提供了 Spark 和 HBase 的 connector,大家可自行编译,但个人感觉没有 Cloudera 的好用:
https://github.com/hortonworks-spark/shc#apache-spark—apache-hbase-connector

华为也有spark-Hbase的封装 https://github.com/Huawei-Spark/Spark-SQL-on-HBase spark-hbase


  // Spark entry point.
  // NOTE(review): the empty appName/master strings look like placeholders —
  // supply real values before running.
  val spark = SparkSession
    .builder()
    .appName("")
    .master("")
    .getOrCreate()
  val sc = spark.sparkContext

  // HBase configuration (presumably picks up hbase-site.xml from the classpath — verify).
  val conf = HBaseConfiguration.create()

  // HBaseContext built from the SparkContext and the HBase configuration;
  // scanHbaseTB uses it to run scans.
  val habsecontext = new HBaseContext(sc, conf)


  /**
    * Scans an HBase table through the shared `habsecontext` and returns the rows as an RDD.
    *
    * When `startKey` is defined, the scan starts at that row key and stops at `endKey`
    * (falling back to `startKey` itself when `endKey` is empty). When `startKey` is `None`,
    * an unbounded `Scan` is issued and `endKey` is ignored.
    *
    * Both bounds default to `None`, so the method can be called as `scanHbaseTB("table")`
    * without an implicit `Option[String]` in scope.
    * NOTE(review): both implicit parameters have the same type, so a single implicit
    * `Option[String]` in scope would be bound to both — prefer passing the bounds explicitly.
    *
    * @param tableName name of the HBase table to scan
    * @param startKey  optional start row key
    * @param endKey    optional stop row key (only used when `startKey` is defined)
    * @return RDD of (row key, Result) pairs produced by `HBaseContext.hbaseRDD`
    */
  def scanHbaseTB(tableName: String)(
      implicit startKey: Option[String] = None,
      endKey: Option[String] = None): RDD[(ImmutableBytesWritable, Result)] = {
    val scan = new Scan()
    // Restrict the scan only when a start row key was provided.
    startKey.foreach { start =>
      scan.setStartRow(Bytes.toBytes(start))
      scan.setStopRow(Bytes.toBytes(endKey.getOrElse(start)))
    }
    habsecontext.hbaseRDD(TableName.valueOf(tableName), scan)
  }

  /**
    * Demo entry point contrasting the TableInputFormat-based read with the
    * HBaseContext-based read of the "SparkHbase" table.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // Traditional approach: configure TableInputFormat and read via newAPIHadoopRDD.
    // The resulting RDD is only built here for illustration; no action is run on it.
    conf.set(TableInputFormat.SCAN_ROW_START, "startrowkey")
    conf.set(TableInputFormat.SCAN_ROW_STOP, "stoprowkey")
    conf.set(TableInputFormat.INPUT_TABLE, "SparkHbase")
    val hBaseRDD = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // HBaseContext approach: full-table scan (no row-key bounds supplied).
    val sparkHbaseRDD = scanHbaseTB("SparkHbase")
    sparkHbaseRDD.foreach { case (_, result) =>
      // Decode the row key bytes; ImmutableBytesWritable.toString would print a hex dump.
      val rowKey = Bytes.toString(result.getRow)
      // NOTE(review): empty family/qualifier look like placeholders — fill in real column names.
      val cell = result.getColumnLatestCell(Bytes.toBytes(""), Bytes.toBytes(""))
      println(s"the rowKey is $rowKey the values is $cell")
    }
  }

你可能感兴趣的:(Spark,Hbase)