Basic Spark operations for reading and writing HBase

Maven dependencies:



<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.1</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.1</version>
</dependency>

<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.1</version>
</dependency>

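The read/write example below assumes a student table with an info column family already exists in HBase. If it does not, the following is a minimal sketch that creates it with the HBase client Admin API (the object name CreateStudentTable is only illustrative, and the ZooKeeper settings are assumptions that mirror the ones used in the main code):

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateStudentTable {

  def main(args: Array[String]): Unit = {
    // Assumed connection settings; adjust to your own cluster
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.1.11,192.168.1.12")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("zookeeper.znode.parent", "/hbase-unsecure")

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    val tableName = TableName.valueOf("student")

    // Create the table with a single "info" column family if it is missing
    if (!admin.tableExists(tableName)) {
      val descriptor = new HTableDescriptor(tableName)
      descriptor.addFamily(new HColumnDescriptor("info"))
      admin.createTable(descriptor)
    }

    admin.close()
    connection.close()
  }

}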
The code implementation is as follows:
package com.ky.service

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Put, Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object HbaseSpark {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession.builder().master("local[*]").appName(s"${this.getClass.getSimpleName}").getOrCreate()
    val sc = spark.sparkContext

    val table = "student"
    val family = "info"
    val column1 = "name"
    val column2 = "age"

    // HBase/ZooKeeper connection settings
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("spark.executor.memory", "3000m")
    conf.set("hbase.zookeeper.quorum", "192.168.1.11:2181,192.168.1.12:2181")
    conf.set("zookeeper.znode.parent", "/hbase-unsecure")
    conf.set(TableInputFormat.INPUT_TABLE, table)

    // Limit the scan to the rowkey range [startRowkey, endRowkey)
    val startRowkey = "1"
    val endRowkey = "4"

    val scan = new Scan(Bytes.toBytes(startRowkey), Bytes.toBytes(endRowkey))
    scan.setCacheBlocks(false)

    // Serialize the Scan and hand it to TableInputFormat through the configuration
    val proto = ProtobufUtil.toScan(scan)
    val scanToString = Base64.encodeBytes(proto.toByteArray)
    conf.set(TableInputFormat.SCAN, scanToString)

    // Read from HBase as an RDD of (rowkey, Result)
    val hbaseRdd: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])
    val counts = hbaseRdd.count()
    println(counts)

    // Print the rows returned by the scan
    hbaseRdd.foreach { case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue(family.getBytes(), column1.getBytes()))
      val age = Bytes.toString(result.getValue(family.getBytes(), column2.getBytes()))
      println(s"the rowkey is $key, the name is $name, the age is $age")
    }

    // Write data to HBase.
    // Initialize the JobConf; TableOutputFormat must be the one from the
    // org.apache.hadoop.hbase.mapred package!
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, table)
    val dataRdd = sc.makeRDD(Array("111,jack,15", "222,Lily,16", "333,mike,16"))
    val rdd = dataRdd.map(_.split(',')).map(arr => {
      val put = new Put(Bytes.toBytes(arr(0)))
      put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column1), Bytes.toBytes(arr(1)))
      put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column2), Bytes.toBytes(arr(2)))
      (new ImmutableBytesWritable, put)
    })
    rdd.saveAsHadoopDataset(jobConf)

    spark.stop()

  }

}
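The comment in the write section stresses that saveAsHadoopDataset requires the old-API TableOutputFormat from org.apache.hadoop.hbase.mapred. As a rough sketch (not part of the original example), the same write can also go through the new mapreduce API; the lines below would replace the JobConf-based block inside main, reusing the conf, table and rdd values defined there:

// A sketch of the new-API write path, assuming conf, table and rdd from the listing above
import org.apache.hadoop.hbase.mapreduce.{TableOutputFormat => NewTableOutputFormat}
import org.apache.hadoop.mapreduce.Job

val job = Job.getInstance(conf)
// Tell the new-API output format which table to write to
job.getConfiguration.set(NewTableOutputFormat.OUTPUT_TABLE, table)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
job.setOutputFormatClass(classOf[NewTableOutputFormat[ImmutableBytesWritable]])
// rdd is the RDD[(ImmutableBytesWritable, Put)] built above
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)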
