Operating on an HBase table with Spark SQL

The HBase table [ns1:person] contains the following data:

hbase(main):073:0> scan 'ns1:person'
ROW                                  COLUMN+CELL
 1543311852355person-=>Info          column=f1:ROWKEY, timestamp=1543311854753, value=123456
 1543311852355person-=>Info          column=f1:age, timestamp=1543311854753, value=12
 1543312440228person=>Info           column=f1:ROWKEY, timestamp=1543312441472, value=123456
 1543312440228person=>Info           column=f1:age, timestamp=1543312441472, value=12
 row2                                column=f1:age, timestamp=1543315851136, value=23
 row2                                column=f1:id, timestamp=1543315836448, value=2
 row2                                column=f1:name, timestamp=1543315842990, value=lisi
3 row(s) in 0.0410 seconds
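
If you want to follow along, a table of this shape can be recreated in the HBase shell roughly as follows. The put commands are reconstructed from the scan output above, so treat them as an approximation (only the row2 entries are shown; the two timestamped rows are analogous):

hbase(main):001:0> create_namespace 'ns1'
hbase(main):002:0> create 'ns1:person', 'f1'
hbase(main):003:0> put 'ns1:person', 'row2', 'f1:id', '2'
hbase(main):004:0> put 'ns1:person', 'row2', 'f1:name', 'lisi'
hbase(main):005:0> put 'ns1:person', 'row2', 'f1:age', '23'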

The requirement: query this HBase table from Spark SQL. How can this be done? The approach below reads the table as an RDD of (ImmutableBytesWritable, Result) pairs via newAPIHadoopRDD, maps each Result to a case class, converts that RDD to a DataFrame, and registers it as a temp view for SQL.

1, Development environment: IDEA + Maven

	
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>

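One note on versions: spark-sql_2.11 is built for Scala 2.11, so the project itself must compile against a 2.11.x Scala. If the POM does not already pin the Scala version elsewhere, a matching scala-library dependency can be added (2.11.8 below is just an assumed 2.11.x release):

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
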
2, Implementation code

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._
import org.apache.spark.sql.SparkSession

/**
  * Created by wang on 18-11-26.
  */
object Get3 {

  // Maps one HBase row onto a case class
  case class Per(id: Int, name: String, age: Int)
  
  def main(args: Array[String]): Unit = {
    val sc_conf = new SparkConf().setMaster("local[2]").setAppName("Read data from HBase")
    val spark = SparkSession.builder().config(sc_conf).getOrCreate()

    // Point TableInputFormat at the table to scan.
    // Assumes hbase-site.xml is on the classpath; otherwise set
    // hbase.zookeeper.quorum / hbase.zookeeper.property.clientPort on conf explicitly.
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, "ns1:person")
    val hbaseRDD = spark.sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
    hbaseRDD.cache()


    // Convert each HBase Result into a Per object.
    val rdd2 = hbaseRDD.map({
      case (_, result) =>

        // Defaults mark rows that are missing the expected columns.
        var id = -1
        var name = "null"
        var age = -1

        try {
          id = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("id"))).toInt
          name = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("name")))
          age = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("age"))).toInt
        } catch {
          // A missing column makes Bytes.toString return null and .toInt throw,
          // so the row keeps its default values and is filtered out later.
          case _: Exception => print("error..")
        }

        Per(id, name, age)
    })


    // Convert the RDD to a DataFrame and register it as a temp view.
    import spark.implicits._
    val df = rdd2.toDF()
    df.createTempView("table1")

    // Rows that failed parsing keep their default values and are filtered out here.
    spark.sql("select id, name, age from table1" +
      " where id != -1" +
      " and name != 'null'")
      .show()
  }
}
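
With the data shown at the top, only row2 carries all of f1:id, f1:name and f1:age; the two timestamped rows fail the .toInt parse, keep their default values, and are dropped by the WHERE clause. The show() call should therefore print something like:

+---+----+---+
| id|name|age|
+---+----+---+
|  2|lisi| 23|
+---+----+---+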

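Going the other way (writing a DataFrame back to HBase) uses the same MapReduce-style API via saveAsNewAPIHadoopDataset. Below is a minimal sketch, not a definitive implementation: it assumes the same ns1:person table and f1 column family, reuses df from the code above, and the "row" + id rowkey scheme is a made-up convention for illustration. Values are stored as string bytes, matching how the original data is stored.

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job

// Configure an output job that writes Puts into ns1:person.
val writeConf = HBaseConfiguration.create()
writeConf.set(TableOutputFormat.OUTPUT_TABLE, "ns1:person")
val job = Job.getInstance(writeConf)
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])

// Turn each DataFrame row into a Put; "row" + id is an assumed rowkey scheme.
df.rdd.map { row =>
  val put = new Put(Bytes.toBytes("row" + row.getAs[Int]("id")))
  put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("name"), Bytes.toBytes(row.getAs[String]("name")))
  put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("age"), Bytes.toBytes(row.getAs[Int]("age").toString))
  (new ImmutableBytesWritable, put)
}.saveAsNewAPIHadoopDataset(job.getConfiguration)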