Reading and writing HBase data with Spark

If the original data lives in HBase and you want to run batch computations over it with Spark, you can use Spark's API to read from and write to HBase directly.

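Running these examples requires Spark core plus the HBase client and MapReduce integration classes (TableInputFormat / TableOutputFormat) on the classpath. A minimal build.sbt sketch, assuming Scala 2.11 with Spark 2.x and HBase 1.x; the version numbers are placeholders and should be aligned with your cluster:

// build.sbt (sketch) -- versions are placeholders, match them to your cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"   % "2.4.8",
  "org.apache.hbase" %  "hbase-client" % "1.4.13",
  // hbase-server provides TableInputFormat / TableOutputFormat in HBase 1.x
  "org.apache.hbase" %  "hbase-server" % "1.4.13"
)
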
Reading data from HBase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._


object HbaseSparkRead {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)

    //HBase connection info
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "ht05")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set(TableInputFormat.INPUT_TABLE, "spark_hbase")


    //read the table and turn it into an RDD of (rowkey, Result) pairs
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    //val count = hBaseRDD.count()
    //println(count)
    hBaseRDD.foreach { case (_, result) =>
      //row key
      val key = Bytes.toString(result.getRow)
      //fetch a cell value by column family and qualifier
      val name = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
      println("Row key:" + key + " Name:" + name)
    }

    //save the row keys from HBase to a text file on HDFS
    hBaseRDD.map(x => Bytes.toString(x._2.getRow)).saveAsTextFile("hdfs://ht05:9000/test1")
  }
}
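
By default TableInputFormat scans every column of every row. The scan can be narrowed through the properties TableInputFormat exposes, set on the same conf before sc.newAPIHadoopRDD is called; the row-key range below is a hypothetical example. The second half shows one way to turn the results into plain key/value pairs for further batch work (it reuses conf and hBaseRDD from the example above):

    //narrow the scan before sc.newAPIHadoopRDD is called (same conf as above)
    conf.set(TableInputFormat.SCAN_COLUMNS, "cf:name")      //only read the cf:name column
    conf.set(TableInputFormat.SCAN_ROW_START, "row-0000")   //hypothetical start row key (inclusive)
    conf.set(TableInputFormat.SCAN_ROW_STOP, "row-1000")    //hypothetical stop row key (exclusive)

    //turn the scan results into (rowKey, name) pairs for further batch computation
    val pairs = hBaseRDD.map { case (_, result) =>
      (Bytes.toString(result.getRow),
        Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name"))))
    }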

Writing data to HBase

import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._

object HbaseSparkWrite {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)

    //read the source text file from HDFS and write it into HBase
    val dataRdd = sc.textFile("hdfs://ht05:9000//zhaow/hotle0.txt")

    //HBase connection info
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "ht05")
    sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, "spark_test0")

    //lazy: defer creating the Job; this is required when running inside spark-shell, because spark-shell evaluates and prints the result of every statement as it is entered
    lazy val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Result])

    //one Put per non-empty line; the line itself is used as both row key and cell value
    val rdd = dataRdd.filter(_.nonEmpty).map { line =>
      val rowkey: String = line
      val put = new Put(Bytes.toBytes(rowkey))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(rowkey))
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
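
The example above writes the row key itself as the only cell value. If each input line carried more fields, the same map step could emit several columns per Put. A sketch assuming a hypothetical comma-separated layout of rowkey,name,age, reusing dataRdd and job from the example above:

    //hypothetical input layout: rowkey,name,age
    val multiColRdd = dataRdd.filter(_.nonEmpty).map { line =>
      val Array(rowkey, name, age) = line.split(",", 3)
      val put = new Put(Bytes.toBytes(rowkey))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(name))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(age))
      (new ImmutableBytesWritable, put)
    }
    multiColRdd.saveAsNewAPIHadoopDataset(job.getConfiguration)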
