A demo of reading HBase from Spark

The snippet below builds an RDD over an HBase table with newAPIHadoopRDD and TableInputFormat, then prints the row key and one column of the first ten rows.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.log4j.{Level, LogManager}
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by owlcabin on 2016/5/27.
  */
object SparkHBase  {
  def main(args: Array[String]): Unit = {
    // Initialize the Spark environment (an app name is required unless spark-submit supplies one)
    val sparkConf = new SparkConf().setAppName("SparkHBase")
    val sparkContext = new SparkContext(sparkConf)
    LogManager.getRootLogger.setLevel(Level.WARN)


    // Configure the HBase connection through ZooKeeper
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set("hbase.zookeeper.quorum", "dmp01,dmp02,dmp03,dmp04,dmp05")

    // Table to read
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "t_prod_weixin_art")
    // Columns to scan: a space-separated list of families or family:qualifier pairs
    hbaseConf.set(TableInputFormat.SCAN_COLUMNS, "info")
    // Read HBase via newAPIHadoopRDD; this yields an RDD of (ImmutableBytesWritable, Result) pairs
    val hbaseRDD = sparkContext.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])


    // Keep only the Result values; map returns a MapPartitionsRDD
    val resRDD = hbaseRDD.map(tuple => tuple._2)

    // Print the row key and the info:content cell of the first ten rows
    resRDD.map(r => (Bytes.toString(r.getRow),
      Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("content"))))).take(10).foreach(println)

    sparkContext.stop()
  }

}
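
The demo above scans the whole table. TableInputFormat also reads row-range and caching settings from the configuration, so a restriction can be pushed down to the region servers before the RDD is built. A minimal sketch; the row keys and cache size below are made-up placeholders:

    // Set these on hbaseConf before calling newAPIHadoopRDD
    hbaseConf.set(TableInputFormat.SCAN_ROW_START, "20160101") // first row key to include (hypothetical)
    hbaseConf.set(TableInputFormat.SCAN_ROW_STOP, "20160201")  // stop row key, exclusive (hypothetical)
    // Fetch more rows per RPC to cut round trips on large scans
    hbaseConf.set(TableInputFormat.SCAN_CACHEDROWS, "500")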


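If you want SQL over the scan, the (row key, value) pairs can be lifted into a DataFrame. A minimal sketch for Spark 1.x, assuming the same sparkContext and hbaseRDD as in the demo above; the column names rowkey and content and the temp table name weixin_art are illustrative:

    import org.apache.spark.sql.SQLContext

    // Assumes sparkContext and hbaseRDD are built exactly as in the demo above
    val sqlContext = new SQLContext(sparkContext)
    import sqlContext.implicits._

    // Lift (row key, info:content) pairs into a DataFrame with illustrative column names
    val articleDF = hbaseRDD
      .map(_._2)
      .map(r => (Bytes.toString(r.getRow),
                 Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("content")))))
      .toDF("rowkey", "content")

    articleDF.registerTempTable("weixin_art")
    sqlContext.sql("SELECT rowkey FROM weixin_art LIMIT 10").show()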