Spark 写入 HBase

方式一:

package com.bupt.spark.hbase
//1 table put
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}


/**
 * Approach 1: write every record to HBase with its own `Table.put` call.
 *
 * Each element of a small demo RDD becomes one row in the `student` table,
 * with row key `spark_<n>` and column `info:count` holding `n` as a string.
 *
 * The HBase connection is opened once per partition rather than once per
 * record, because creating a connection is an expensive operation. Records
 * are still written one `put` at a time, which is the point of this variant.
 * All resources are released in `finally` blocks so a failing `put` does not
 * leak the table, the connection or the SparkContext.
 */
object SparkHbaseTablePut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val ints = List(1,2,3,4,5,6,7,8,9,10)
      val unit = sc.parallelize(ints,1)
      unit.foreachPartition { records =>
        // Skip connection setup entirely when the partition holds no data.
        if (records.nonEmpty) {
          val configuration = HBaseConfiguration.create()
          val connection = ConnectionFactory.createConnection(configuration)
          try {
            val table = connection.getTable(TableName.valueOf("student"))
            try {
              records.foreach { t =>
                println(t)
                val put = new Put(Bytes.toBytes("spark_"+ t))
                put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("count"),Bytes.toBytes(t.toString))
                // One round trip per record: deliberately unbatched.
                table.put(put)
              }
            } finally {
              table.close()
            }
          } finally {
            connection.close()
          }
        }
      }
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}

方式二:

package com.bupt.spark.hbase
//2 table batch put
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Approach 2: write each partition to HBase with a single batched `Table.put`.
 *
 * Every partition of the demo RDD is turned into a list of `Put`s (row key
 * `spark_batch<n>`, column `info:count` = `n` as a string) and sent to the
 * `student` table in one call.
 *
 * Empty partitions are skipped so no connection is opened for them, and the
 * table, connection and SparkContext are closed in `finally` blocks so that
 * a failing write does not leak them.
 */
object SparkHbaseTableBatchPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val ints = List(1,2,3,4,5,6,7,8,9,10)
      val unit = sc.parallelize(ints,2)
      println(unit.partitions.length)
      unit.foreachPartition(f => {
        // Materialise the partition so it can be printed and batched.
        val list = f.toList
        println(list)
        if (list.nonEmpty) {
          val puts = new java.util.ArrayList[Put](list.size)
          list.foreach { next =>
            val put = new Put(Bytes.toBytes("spark_batch"+next))
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
            puts.add(put)
          }
          val configuration = HBaseConfiguration.create()
          val connection = ConnectionFactory.createConnection(configuration)
          try {
            val table = connection.getTable(TableName.valueOf("student"))
            try {
              // Single batched write for the whole partition.
              table.put(puts)
            } finally {
              table.close()
            }
          } finally {
            connection.close()
          }
        }
      })
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}

方式三:

package com.bupt.spark.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

/**
 * Approach 3: write an RDD to HBase through `TableOutputFormat` and
 * `saveAsNewAPIHadoopDataset`.
 *
 * The Hadoop configuration names the target table (`student`) and the
 * output format / key / value classes. Each RDD element is mapped to a
 * `(row key, Put)` pair with row key `spark_tablePut<n>` and column
 * `info:count` holding `n` as a string.
 *
 * The SparkContext is stopped in a `finally` block so that it is released
 * whether or not the save succeeds.
 */
object SparkHbaseTableOutPutFormatPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val ints = List(1,2,3,4,5,6,7,8,9,10)
      val unit = sc.parallelize(ints,2)

      // Output settings read by saveAsNewAPIHadoopDataset.
      val configuration = HBaseConfiguration.create()
      configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
      configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
      configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
      configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)

      val unit1 = unit.map { t =>
        val put = new Put(Bytes.toBytes("spark_tablePut" + t))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
        // The key wraps the same row-key bytes as the Put.
        (new ImmutableBytesWritable(put.getRow), put)
      }
      unit1.saveAsNewAPIHadoopDataset(configuration)
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}

方式四:

package com.bupt.spark.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Approach 4: repartition the RDD, build `(row key, Put)` pairs per partition
 * with `mapPartitions`, and write them through `TableOutputFormat`.
 *
 * The demo RDD is repartitioned from 2 to 5 partitions (both counts are
 * printed). Each element becomes a `Put` with row key `spark_part<n>` and
 * column `info:count` holding `n` as a string, written to table `student`.
 *
 * Every record gets its own `ImmutableBytesWritable` key. Reusing one mutable
 * key object across the whole partition would make every buffered tuple
 * reference the same instance, which would end up holding only the last row
 * key.
 */
object SparkHbasePutPartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val ints = List(1,2,3,4,5,6,7,8,9,10)
      val unit = sc.parallelize(ints,2)

      // Output settings read by saveAsNewAPIHadoopDataset.
      val configuration = HBaseConfiguration.create()
      configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
      configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
      configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
      configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)

      println(unit.getNumPartitions)
      val rdd = unit.repartition(5)
      println(rdd.getNumPartitions)

      val replication = rdd.mapPartitions(f => {
        // Materialise the partition only so its contents can be printed.
        val list = f.toList
        println(list)
        list.iterator.map { next =>
          val put = new Put(Bytes.toBytes("spark_part"+next))
          put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
          // Fresh key per record: no aliasing between emitted tuples.
          (new ImmutableBytesWritable(put.getRow), put)
        }
      })
      replication.saveAsNewAPIHadoopDataset(configuration)
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}

你可能感兴趣的:(spark,hbase)