Spark RDD: reading and writing HBase data

Hive, as a compute engine on top of MapReduce, can use HQL to operate on HBase tables (and HBase data can in turn be accessed through Hive tables). Spark is also a compute engine, so it should likewise be able to read and write HBase data.

Using IDEA + Maven to read and write HBase data, the steps are as follows:

1. Add the Maven dependencies

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>

2. Reading an HBase table with Spark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._

/**
  * Created by wang on 18-11-26.
  */
object Get {
  def main(args: Array[String]): Unit = {


    val scConf = new SparkConf().setMaster("local[2]").setAppName("Read data from HBase")
    val sc = new SparkContext(scConf)

    // set the table to scan
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, "ns1:person")

    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    hbaseRDD.cache()

    val count = hbaseRDD.count()
    println("Students RDD Count: " + count)

    // iterate over the rows and print selected columns
    hbaseRDD.foreach { case (_, result) =>
      val key  = Bytes.toString(result.getRow)
      val id   = Bytes.toString(result.getValue("f1".getBytes, "id".getBytes))
      val name = Bytes.toString(result.getValue("f1".getBytes, "name".getBytes))
      val age  = Bytes.toString(result.getValue("f1".getBytes, "age".getBytes))

      println("ROW: " + key + " id: " + id + " name: " + name + " age: " + age)
    }

//    // alternative: process one partition at a time instead of one row at a time
//    hbaseRDD.foreachPartition(iter => {
//      while (iter.hasNext) {
//        val tup = iter.next()
//        val result = tup._2
//
//        // extract and print the columns as above
//        // println(key + ", CREATETIME= " + CREATETIME ......
//      }
//    }) // foreachPartition
  }
}
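
Since spark-sql_2.11 is already on the classpath, the scanned (ImmutableBytesWritable, Result) pairs can also be mapped into a DataFrame and queried with SQL. Below is a minimal sketch under the same assumptions as the example above (table ns1:person, column family f1 with id/name/age columns); the Person case class, object name, and temp view name are illustrative, not part of the original code:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession

// illustrative row type; adjust the fields to the actual columns in f1
case class Person(rowKey: String, id: String, name: String, age: String)

object GetAsDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("hbase-to-dataframe")
      .getOrCreate()
    import spark.implicits._

    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, "ns1:person")

    val hbaseRDD = spark.sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])

    // map each Result into the case class, then convert the RDD to a DataFrame
    val df = hbaseRDD.map { case (_, result) =>
      Person(
        Bytes.toString(result.getRow),
        Bytes.toString(result.getValue("f1".getBytes, "id".getBytes)),
        Bytes.toString(result.getValue("f1".getBytes, "name".getBytes)),
        Bytes.toString(result.getValue("f1".getBytes, "age".getBytes)))
    }.toDF()

    df.createOrReplaceTempView("person")
    spark.sql("SELECT name, age FROM person").show()

    spark.stop()
  }
}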

3. Writing to an HBase table with Spark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object Write2 {
  def main(args: Array[String]): Unit = {
  
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local[*]")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sparkConf)

    // create an RDD of test data
    val rdd = sc.makeRDD(1 to 10)

    // write the data to HBase (one connection per record; see the alternative sketch below)
    rdd.foreach(x => {

      // create the HBase connection settings
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set("hbase.zookeeper.quorum", "localhost")
      hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")

      // table handle
      val t1 = new HTable(hbaseConf, "t1")
      t1.setAutoFlush(false)

      // build and write the Put
      val put = new Put(Bytes.toBytes("row" + x))
      put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("num"), Bytes.toBytes(x))

      t1.put(put)
      t1.flushCommits()
      t1.close()
    })

    // stop Spark
    sc.stop()
  }
}
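
Opening an HTable for every record works, but each call inside foreach creates a new connection, which is expensive. A common alternative is to let Spark write through TableOutputFormat with saveAsNewAPIHadoopDataset, leaving the connection handling to the output format. The sketch below assumes the same table t1, column family f1, and local ZooKeeper settings as above; the object name is illustrative and the code is a sketch, not something tested against this cluster:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object WriteViaOutputFormat {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("HBaseWriteViaOutputFormat").setMaster("local[*]"))

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "localhost")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "t1")

    // the Job object only carries the output format configuration
    val job = Job.getInstance(hbaseConf)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])

    val rdd = sc.makeRDD(1 to 10)

    // build one Put per record; the key is ignored by TableOutputFormat
    val puts = rdd.map { x =>
      val put = new Put(Bytes.toBytes("row" + x))
      put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("num"), Bytes.toBytes(x))
      (new ImmutableBytesWritable, put)
    }

    puts.saveAsNewAPIHadoopDataset(job.getConfiguration)

    sc.stop()
  }
}

With this pattern the Puts are written partition by partition by the output format, so no per-record connection setup is needed in user code.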
