Scala(spark)读写Hbase示例

由于网上找到的版本都比较老旧,记录一版现在在用的版本的Scala读写Hbase示例。Scala2.11.8;Spark2.1.0。仅在本机集群通过,供参考。

package test

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession

object TestHBase {

  /**
   * Demo entry point: scans an HBase table through `TableInputFormat`, prints
   * each row, then writes two sample rows into a second table through
   * `TableOutputFormat`. Intended to run against a local cluster only.
   *
   * @param args unused
   */
  def main(args: Array[String]) {

    val spark = SparkSession.builder().appName("LinkStart").master("local").getOrCreate()
    val sc = spark.sparkContext

    // ---------- read path ----------
    val conf = HBaseConfiguration.create()
    // ZooKeeper quorum of the HBase cluster (comma-separated host list); client port defaults to 2181
    conf.set("hbase.zookeeper.quorum", "集群地址ip,逗号分隔") // any reachable cluster node(s)
    conf.set("hbase.zookeeper.property.clientPort", "2181") // ZooKeeper client port
    conf.set("hbase.master", "master:port")
    // Table to scan
    conf.set(TableInputFormat.INPUT_TABLE, "test2019:bulletin")

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    // Cache BEFORE the first action, otherwise the HBase scan runs twice
    // (once for count(), once for the foreach below).
    hBaseRDD.cache()
    val count = hBaseRDD.count()
    println("Students RDD Count:" + count)

    // Print the row key plus two columns of the "docs" family for every row.
    hBaseRDD.foreach({ case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val oldData = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("insert_time")))
      val newData = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("latest")))
      println("Row key:" + key + " OLD:" + oldData + " NEW:" + newData)
    })

    // ---------- write path ----------
    val tablename = "test2019:bull"
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    // Job.getInstance replaces the `new Job(conf)` constructor deprecated since Hadoop 2.
    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    // Fix: the pairs emitted below carry Put values, not Result.
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27")) // two sample records
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      val put = new Put(Bytes.toBytes(arr(0))) // row key
      // addColumn replaces the Put.add overloads deprecated in HBase 1.0+ (pom pins hbase 1.2.0)
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))   // docs:name
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2))) // docs:gender
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("age"), Bytes.toBytes(arr(3).toInt)) // docs:age (int-encoded)
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())

    spark.stop() // release the local SparkContext
  }

}

Maven配置文件如下:



<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.haizhi.data</groupId>
    <artifactId>DataScala</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.1.0</spark.version>
        <scala.version>2.11</scala.version>
        <hadoop.version>2.5.0</hadoop.version>
        <hbase.version>1.2.0</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>${hbase.version}</version>
            <type>pom</type>
        </dependency>
    </dependencies>

    <build>
        <plugins>

            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.19</version>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>

        </plugins>
    </build>
</project>

你可能感兴趣的:(Spark学习记录)