Writing Data from Spark to HBase

What this program does:

Read the order table from Hive, compute each user's total order amount over the last 30 days, and write the result to HBase. The full job is listed below.
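The job assumes a Hive database pro3_dw containing an order table dw_order2 with at least the columns userid, order_amount and ctime (an order-date string compared against date_sub in the query). Here is a minimal sketch of creating such a table through Spark SQL; the column types are assumptions for illustration, not taken from the original environment:

import org.apache.spark.sql.SparkSession

object CreateDwOrder2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CreateDwOrder2")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("create database if not exists pro3_dw")
    spark.sql("use pro3_dw")
    // Assumed types: userid as string, order_amount as double, ctime as a 'yyyy-MM-dd' date string.
    spark.sql(
      """create table if not exists dw_order2 (
        |  userid string,
        |  order_amount double,
        |  ctime string
        |) stored as parquet""".stripMargin)

    spark.stop()
  }
}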

package pro3

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.sql.{DataFrame, SparkSession}

/*
 * Write the per-user 30-day order totals into HBase.
 */
object monthAmount {
  def main(args: Array[String]): Unit = {
    // A single SparkSession with Hive support is enough; a separate SparkContext is not needed.
    val spark = SparkSession.builder()
      .appName("HBaseTest")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "hdfs://mini1:9000/user/hive/warehouse")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("use pro3_dw")
    // Sum each user's order amount over the 30 days before 2019-05-28 (limit 4 rows while testing).
    val df1: DataFrame = spark.sql(
      "select userid, sum(order_amount) from dw_order2 " +
        "where ctime > date_sub('2019-05-28', 30) group by userid limit 4")

    // Convert each result row into an HBase Put: row key = userid,
    // cf1:tag = the tag id, cf1:tagweight = the 30-day order amount.
    val rdd1 = df1.rdd
    val rdd2 = rdd1.map(x => {
      val uid: String = x.getString(0)
      val amount: Any = x.get(1)
      val put = new Put(Bytes.toBytes(uid))
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("tag"), Bytes.toBytes("B220U015_001"))
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("tagweight"), Bytes.toBytes(amount.toString))
      (new ImmutableBytesWritable, put)
    })

    // HBase/HDFS connection settings.
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "mini1:2181,mini2:2181,mini3:2181")
    conf.set("fs.defaultFS", "hdfs://mini1:9000")

    val tableName = "userPhoto"

    // Create the target table with column family cf1 if it does not exist yet.
    val admin = new HBaseAdmin(conf)
    if (!admin.tableExists(TableName.valueOf(tableName))) {
      println(s"Table $tableName does not exist, creating it")
      val desc = new HTableDescriptor(TableName.valueOf(tableName))
      val hcd = new HColumnDescriptor("cf1")
      desc.addFamily(hcd)
      admin.createTable(desc)
    }
    admin.close()

    // Initialize the JobConf; TableOutputFormat must be the one from the org.apache.hadoop.hbase.mapred package!
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)

    rdd2.saveAsHadoopDataset(jobConf)

    spark.stop()
  }
}
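To check that the rows actually landed in HBase, you can run scan 'userPhoto' in the HBase shell, or read the table back with Spark. Below is a minimal read-back sketch, assuming the same ZooKeeper quorum and the table/column names used above; it uses TableInputFormat from the org.apache.hadoop.hbase.mapreduce package:

package pro3

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession

/*
 * Read the userPhoto table back and print each row, to verify the write above.
 */
object ReadUserPhoto {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ReadUserPhoto")
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext

    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "mini1:2181,mini2:2181,mini3:2181")
    conf.set(TableInputFormat.INPUT_TABLE, "userPhoto")

    // newAPIHadoopRDD yields (row key, Result) pairs for every row in the table.
    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])

    // Copy the reusable Writable/Result objects into plain Strings before collecting.
    hbaseRDD.map { case (rowKey, result) =>
      val uid = Bytes.toString(rowKey.copyBytes())
      val tag = Bytes.toString(result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("tag")))
      val weight = Bytes.toString(result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("tagweight")))
      (uid, tag, weight)
    }.collect().foreach(println)

    spark.stop()
  }
}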

 
