Method 1: one Put per record, opening a new HBase connection inside foreach for every element (simplest, but heavy on connection setup).
package com.bupt.spark.hbase

// 1: single Put per record, connection created and closed per element
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseTablePut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val unit = sc.parallelize(ints, 1)

    // foreach runs on the executors: each element opens its own connection,
    // writes one Put to the "student" table, then closes table and connection again.
    unit.foreach(t => {
      println(t)
      val configuration = HBaseConfiguration.create()
      val connection = ConnectionFactory.createConnection(configuration)
      val table = connection.getTable(TableName.valueOf("student"))
      val put = new Put(Bytes.toBytes("spark_" + t))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
      table.put(put)
      table.close()
      connection.close()
    })
  }
}
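All four variants assume that a student table with an info column family already exists and that an hbase-site.xml with the ZooKeeper quorum is on the classpath. A minimal sketch of creating that table through the Admin API (assuming an HBase 2.x client; the object name and the localhost quorum below are illustrative only, older clients would use HTableDescriptor/HColumnDescriptor instead):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}
import org.apache.hadoop.hbase.util.Bytes

// Hypothetical helper, not part of the examples above.
object CreateStudentTable {
  def main(args: Array[String]): Unit = {
    val configuration = HBaseConfiguration.create()
    // Assumption: ZooKeeper on localhost; normally picked up from hbase-site.xml.
    configuration.set("hbase.zookeeper.quorum", "localhost")
    val connection = ConnectionFactory.createConnection(configuration)
    val admin = connection.getAdmin
    val descriptor = TableDescriptorBuilder.newBuilder(TableName.valueOf("student"))
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(Bytes.toBytes("info")))
      .build()
    // Create the table only if it is not there yet.
    if (!admin.tableExists(TableName.valueOf("student"))) admin.createTable(descriptor)
    admin.close()
    connection.close()
  }
}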
Method 2: collect the Puts for each partition and write them in one batch with foreachPartition (one connection per partition).
package com.bupt.spark.hbase

// 2: batched Puts, one connection per partition
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseTableBatchPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val unit = sc.parallelize(ints, 2)
    println(unit.partitions.length)

    // One connection and one batched put call per partition instead of per element.
    unit.foreachPartition(f => {
      val list = f.toList
      println(list)
      val puts = new java.util.ArrayList[Put]()
      for (next <- list) {
        val put = new Put(Bytes.toBytes("spark_batch" + next))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
        puts.add(put)
      }
      val configuration = HBaseConfiguration.create()
      val connection = ConnectionFactory.createConnection(configuration)
      val table = connection.getTable(TableName.valueOf("student"))
      table.put(puts)   // write the whole partition's Puts in one call
      table.close()
      connection.close()
    })
  }
}
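A variation on the same idea: instead of collecting the Puts into an ArrayList, HBase's BufferedMutator buffers mutations client-side and batches the writes itself. The sketch below is illustrative rather than a drop-in replacement; the object name and the spark_buffered row-key prefix are made up, while table, column family, and data match the example above.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseBufferedMutatorPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 2)
    rdd.foreachPartition(iter => {
      val configuration = HBaseConfiguration.create()
      val connection = ConnectionFactory.createConnection(configuration)
      // BufferedMutator queues Puts locally and flushes them in batches on its own.
      val mutator = connection.getBufferedMutator(TableName.valueOf("student"))
      iter.foreach { next =>
        val put = new Put(Bytes.toBytes("spark_buffered" + next))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
        mutator.mutate(put)   // queued locally, not necessarily sent yet
      }
      mutator.flush()         // push anything still buffered
      mutator.close()
      connection.close()
    })
  }
}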
Method 3: write through Hadoop's TableOutputFormat with saveAsNewAPIHadoopDataset, setting the mapreduce.job.* keys by hand.
package com.bupt.spark.hbase

// 3: write via TableOutputFormat and saveAsNewAPIHadoopDataset
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseTableOutPutFormatPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val unit = sc.parallelize(ints, 2)

    // Hadoop job settings: target table, output format, and the key/value classes
    // that TableOutputFormat expects. On write the key is ignored; only the Put matters.
    val configuration = HBaseConfiguration.create()
    configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
    configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
    configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
    configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)

    // Turn each element into an (ImmutableBytesWritable, Put) pair.
    val unit1 = unit.map(t => {
      val put = new Put(Bytes.toBytes("spark_tablePut" + t))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
      (new ImmutableBytesWritable(put.getRow), put)
    })
    unit1.saveAsNewAPIHadoopDataset(configuration)
  }
}
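Setting mapreduce.job.outputformat.class and the key/value classes by string is exactly what a Hadoop Job object fills in internally, so the same write can be configured through Job.getInstance, which avoids typos in the key names. A sketch under that assumption (the object name and the spark_job row-key prefix are illustrative):

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseJobConfigPut {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("aa").setMaster("local[*]"))

    val configuration = HBaseConfiguration.create()
    configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
    // Job sets the same mapreduce.job.* keys that Method 3 writes by hand.
    val job = Job.getInstance(configuration)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])

    val pairs = sc.parallelize(1 to 10, 2).map(t => {
      val put = new Put(Bytes.toBytes("spark_job" + t))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
      (new ImmutableBytesWritable(put.getRow), put)
    })
    pairs.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}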
Method 4: build the (ImmutableBytesWritable, Put) pairs per partition with mapPartitions, then save through the same TableOutputFormat configuration.
package com.bupt.spark.hbase

// 4: build (ImmutableBytesWritable, Put) pairs per partition, then write via TableOutputFormat
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbasePutPartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("aa").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val unit = sc.parallelize(ints, 2)

    val configuration = HBaseConfiguration.create()
    configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
    configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
    configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
    configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)

    println(unit.getNumPartitions)
    val rdd = unit.repartition(5)
    println(rdd.getNumPartitions)

    val replication = rdd.mapPartitions(f => {
      val list = f.toList
      println(list)
      import scala.collection.mutable.ListBuffer
      val list1 = new ListBuffer[(ImmutableBytesWritable, Put)]
      for (next <- list) {
        val put = new Put(Bytes.toBytes("spark_part" + next))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
        // Use a fresh ImmutableBytesWritable per record: reusing one mutable instance
        // for every pair would leave all keys pointing at the last row of the partition.
        list1 += ((new ImmutableBytesWritable(put.getRow), put))
      }
      list1.toIterator
    })
    replication.saveAsNewAPIHadoopDataset(configuration)
  }
}
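Whichever variant is used, the result can be checked with a plain client-side scan of the student table. A minimal sketch (the object name is illustrative) that prints each row key with its info:count value:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanStudentTable {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf("student"))
    val scanner = table.getScanner(new Scan())
    // Walk the scanner and print row key -> info:count for every row.
    for (result <- scanner.asScala) {
      val count = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("count")))
      println(Bytes.toString(result.getRow) + " -> " + count)
    }
    scanner.close()
    table.close()
    connection.close()
  }
}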