Reading and Writing HBase Data with Spark Core

Interacting with HBase from Spark

Spark can read data from and write data to HBase tables. Under the hood it uses TableInputFormat and TableOutputFormat, exactly the same as the MapReduce/HBase integration: the same InputFormat and OutputFormat classes are used.
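These TableInputFormat/TableOutputFormat classes ship with HBase's MapReduce integration, so the Spark application needs the HBase client and MapReduce-integration jars on its classpath. A minimal build.sbt sketch is shown below; the version numbers are placeholders for whatever Spark/HBase versions you actually run (for HBase 1.x the formats live in hbase-server rather than hbase-mapreduce):

// build.sbt (sketch; versions are assumptions, align them with your cluster)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "3.1.2" % "provided",
  "org.apache.hbase" %  "hbase-client"    % "2.4.11",
  // TableInputFormat / TableOutputFormat are in hbase-mapreduce for HBase 2.x
  "org.apache.hbase" %  "hbase-mapreduce" % "2.4.11"
)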

1. Writing Data

package com.yyds.tags.hbase.write


import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @DESC: Save user-profile tag data to an HBase table
 */
object HBaseWriteTest {
  def main(args: Array[String]): Unit = {
    // a. Build the SparkContext instance
    val sparkConf = new SparkConf()
      .setAppName("SparkHBaseWrite")
      .setMaster("local[4]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[Put])) // register the classes serialized with Kryo; best to register the types held in the RDD
    val sc: SparkContext = new SparkContext(sparkConf)


    // b. Mock data set
    val tagsRDD: RDD[(String, String)] = sc.parallelize(
      List(("1001", "gender:男,job:教师"),
        ("1002", "gender:女,job:工人"),
        ("1003", "gender:男,job:学生"),
        ("1004", "gender:男,job:工人")
      ),
      numSlices = 2
    )


    // TODO: save the RDD to an HBase table; the RDD must be a pair RDD with Key: ImmutableBytesWritable, Value: Put
    /*
    HBase table: htb_tags
       RowKey: userId
       CF: user
       Column: tagName
    create 'htb_tags', 'user'
    */
    val datasRDD: RDD[(ImmutableBytesWritable, Put)] = tagsRDD.map { case (userId, tags) =>
      // a. Build the RowKey
      val rowKey: Array[Byte] = Bytes.toBytes(userId)
      // b. Build the Put object
      val put = new Put(rowKey)
      // Add the columns
      put.addColumn(
        Bytes.toBytes("user"),
        Bytes.toBytes("userId"),
        Bytes.toBytes(userId)
      )
      tags.split(",").foreach { tag =>
        val Array(field, value) = tag.split(":")
        put.addColumn(
          Bytes.toBytes("user"),
          Bytes.toBytes(field),
          Bytes.toBytes(value)
        )
      }
      (new ImmutableBytesWritable(rowKey), put)
    }

    // 1. Set the ZooKeeper configuration that the HBase client depends on
    val conf: Configuration = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum","192.168.42.7")
    conf.set("hbase.zookeeper.property.clientPort","2181")
    conf.set("zookeeper.znode.parent","/hbase")

    // 2. Name of the table the data is written to
    conf.set(TableOutputFormat.OUTPUT_TABLE, "htb_tags")


    // 3. Save via TableOutputFormat; the path argument is only a required placeholder,
    //    the data itself is written directly into the configured HBase table
    datasRDD.saveAsNewAPIHadoopFile(
      s"datas/hbase/output-${System.nanoTime()}",
      classOf[ImmutableBytesWritable],
      classOf[Put],
      classOf[TableOutputFormat[ImmutableBytesWritable]],
      conf
    )

    // Application finished, release resources
    sc.stop()

  }
}
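The write above goes through saveAsNewAPIHadoopFile, where the output path is only a formal argument. The same write can equally be expressed with saveAsNewAPIHadoopDataset, letting a Hadoop Job carry the output format and table name. A minimal sketch, reusing conf and datasRDD from the code above:

import org.apache.hadoop.mapreduce.Job

// Sketch: equivalent write via saveAsNewAPIHadoopDataset (assumes conf and datasRDD from above)
val job = Job.getInstance(conf)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.getConfiguration.set(TableOutputFormat.OUTPUT_TABLE, "htb_tags")

// No output path is involved; TableOutputFormat writes the Put mutations straight into the table
datasRDD.saveAsNewAPIHadoopDataset(job.getConfiguration)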

2. Reading Data

package com.yyds.tags.hbase.read

import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo: how Spark Core reads data from an HBase table
 */
object HBaseReadTest {

  def main(args: Array[String]): Unit = {
    // Create the SparkContext instance
    val sparkConf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("HBaseReadTest")
      // Use Kryo serialization
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // register the classes serialized with Kryo; best to register the types held in the RDD
      .registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[Result]))


    val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
    // Read data from the HBase table
    /*
    def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
        conf: Configuration = hadoopConfiguration,
        fClass: Class[F], kClass: Class[K], vClass: Class[V]
      ): RDD[(K, V)]
    */
    // 1. Load the HBase client configuration (mainly the ZooKeeper quorum address and port)
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.42.7")


    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("zookeeper.znode.parent", "/hbase")

    // 2. Set the name of the table to read
    conf.set(TableInputFormat.INPUT_TABLE, "tbl_users")
    // 3. Load data from the HBase table
    val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] =
      sc.newAPIHadoopRDD(conf,
        classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[Result])
    println(s"count = ${hbaseRDD.count()}")
    hbaseRDD.take(2).foreach {
      case (_, result) => println(s"RowKey = ${Bytes.toString(result.getRow)}")
        for (cell <- result.rawCells()) {
          // Column family (CF)
          val cf = Bytes.toString(CellUtil.cloneFamily(cell))
          // Column qualifier
          val column = Bytes.toString(CellUtil.cloneQualifier(cell))
          // Column value
          val value = Bytes.toString(CellUtil.cloneValue(cell))

          println(s"\t ${cf}:${column} = ${value}, version -> ${cell.getTimestamp}")
        }
    }
    // Application finished, release resources
    sc.stop()

  }
}
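By default the job above scans the entire tbl_users table. TableInputFormat also reads scan-narrowing properties from the same Configuration, so the scan can be limited to a column family, a set of columns, or a row-key range before newAPIHadoopRDD is called. A minimal sketch; the family and column names here are only illustrative examples, not taken from the table above:

// Sketch: narrow the scan via TableInputFormat properties (set on conf before newAPIHadoopRDD)
conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "detail")            // scan only this column family (example name)
conf.set(TableInputFormat.SCAN_COLUMNS, "detail:id detail:gender") // space-separated family:qualifier list (example)
conf.set(TableInputFormat.SCAN_ROW_START, "1001")                  // start row key (inclusive)
conf.set(TableInputFormat.SCAN_ROW_STOP, "2000")                   // stop row key (exclusive)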
