数据量预估,预分裂 ——> 准备HBase表 ——> Spark加载HDFS上的数据 ——> 数据清洗及排序 ——> 数据以HFile的形式写入HDFS ——> BulkLoad ——> 优化
未优化时,大概1200万条数据/h (10G数据)
参考了很多资料后,猜测使用Put写入,也是Bulk Load,并不只是KeyValue才能批量加载
import util.HdfsUtils
import org.apache.log4j.Logger
import org.slf4j.LoggerFactory
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.Level
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.matching.Regex
* write data to HBase by 'Put'
* Date: 2019-04-29
* Time: 17:13
* Author: wh
* Version: V1.0.0
class HBaseTest {
object CleanToHBase {
private val PARTTERN: Regex = """。。。。。。""".r
private val LOG = LoggerFactory.getLogger(classOf[HBaseTest])
private val HdfsFilePath = HdfsUtils.HDFS_SCHEME + "。。。。。。"
private final val NULL_FIELDS = Array("-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-")
private val NUM_FIELDS: Int = 17
* 解析输入的日志数据
* @param line logline
* @return
def logLineSplit(line: String): Array[String] = {
val options = PARTTERN.findFirstMatchIn(line)
var fileds = new Array[String](NUM_FIELDS)
// 。。。。。。清洗逻辑
/**
 * Entry point of the Put-based test job: reads the log file from HDFS, splits every
 * line with logLineSplit, maps each record to an HBase Put and logs the elapsed time.
 *
 * NOTE(review): in this excerpt the map closure is not closed and no write/save call
 * on hbasePuts is visible, so the step that actually sends the Puts to HBase cannot
 * be confirmed from here.
 *
 * @param args command-line arguments (not read by the visible code)
 */
def main(args: Array[String]): Unit = {
val startTime: Long = System.currentTimeMillis()
// 1. Clean the data with Spark (local master, 12 minimum input partitions)
val sparkConf = new SparkConf().setAppName("Put to HBase test").setMaster("local")
val sc = new SparkContext(sparkConf)
var logRDD = sc.textFile(HdfsFilePath, 12)
val splitRDD = logRDD.map(line => logLineSplit(line))
// 2. HBase table information
val tableName = "bdTest2"
val familyName = Bytes.toBytes("infos")
// 3. HBase MapReduce Bulk Job: the ZooKeeper quorum/port and the output table are
// written into the Spark Hadoop configuration, from which the Job is created
sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "cluster")
sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName)
val hbaseBulkJob = Job.getInstance(sc.hadoopConfiguration)
// NOTE(review): `i` is not used by any visible line
var i = 0
// 4. write data to HBase
val hbasePuts = splitRDD.map{ line =>
val put = new Put(Bytes.toBytes("row-" + System.nanoTime())) // row key for testing only
// "column name" is a placeholder qualifier; both lines below use the same literal
put.addColumn(familyName, Bytes.toBytes("column name"), Bytes.toBytes(line(1)))
put.addColumn(familyName, Bytes.toBytes("column name"), Bytes.toBytes(line(2)))
// ... other columns
// the key is an empty ImmutableBytesWritable; the Put itself carries the row key
(new ImmutableBytesWritable(), put)
LOG.info("Time elapsed {} seconds.", (System.currentTimeMillis() - startTime) / 1000)
package core;
* This version does not work (the List value cannot be converted to a Cell); see {@code BulkLoadToHBase.scala}
* Date: 2019-04-30
* Time: 下午3:38
* Author: wh
* Version: V1.0.0
import util.HdfsUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CleanToHBase {
private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(CleanToHBase.class);
public static final int NUM_FIELDS = 17;
// public static long CURSOR = 0; TODO 累加器
private static final String LOG_ENTRY_PATTERN = "。。。。。。";
private static final Pattern PATTERN = Pattern.compile(LOG_ENTRY_PATTERN);
private static final String HdfsFilePath = HdfsUtils.HDFS_SCHEME + "。。。。。。";
private static final String TABLE_NAME = "tableName";
public static String[] logLineSplit(String line) {
Matcher matcher = PATTERN.matcher(line);
String[] fileds = new String[NUM_FIELDS];
int i = 0;
if (!matcher.matches() || NUM_FIELDS != matcher.groupCount()) {
LOG.error("Bad options or bad length =============》");
for (String s : fileds) fileds[i++] = "-";
return fileds;
for (; i < NUM_FIELDS; i++) {
fileds[i] = matcher.group(i);
return fileds;
/**
 * Entry point of the Java bulk-load attempt: cleans the HDFS log file with Spark,
 * configures an HFileOutputFormat2 job for TABLE_NAME, maps every record to a list of
 * (row key, KeyValue) pairs, writes them to a temporary directory and bulk-loads it.
 *
 * NOTE(review): the header of this file states that this version does not work
 * because the List value cannot be converted to a Cell. In this excerpt the lambda
 * passed to mapToPair is also not closed.
 * NOTE(review): connection, table, regionLocator, admin and the Spark context are
 * not closed by any visible line.
 *
 * @param args command-line arguments (not read by the visible code)
 * @throws IOException declared for the HBase connection / bulk-load calls
 */
public static void main(String[] args) throws IOException {
long startTime = System.currentTimeMillis();
System.setProperty("HADOOP_USER_NAME", "hdfs");
// 1. Clean the data with Spark
SparkConf conf = new SparkConf().setAppName("Log clean to HBase").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile(HdfsFilePath);
JavaRDD<String[]> filedsRDD = lines.map(CleanToHBase::logLineSplit);
// 2. HBase table information
TableName tableName = TableName.valueOf(TABLE_NAME);
byte[] familyName = Bytes.toBytes("infos");
// 3. HBase MapReduce Bulk Load Job
Configuration hbConf = HBaseConfiguration.create();
hbConf.set("hbase.zookeeper.quorum", "cluster");
hbConf.set("hbase.zookeeper.property.clientPort", "2181");
hbConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE_NAME);
Connection connection = ConnectionFactory.createConnection(hbConf);
Table table = connection.getTable(tableName);
Job hbaseBulkJob = Job.getInstance(hbConf);
// hbaseBulkJob.setMapOutputKeyClass(ImmutableBytesWritable.class);
// hbaseBulkJob.setMapOutputValueClass(Put.class);
// hbaseBulkJob.setOutputFormatClass(HFileOutputFormat2.class);
// HFile settings: let HFileOutputFormat2 configure the job from the table and its regions
RegionLocator regionLocator = connection.getRegionLocator(tableName);
HFileOutputFormat2.configureIncrementalLoad(hbaseBulkJob, table, regionLocator);
// 4. Prepare for HFile Put: one pair per input record, whose value is the list of
// (row key, KeyValue) tuples for that record
JavaPairRDD<ImmutableBytesWritable, List<Tuple2<ImmutableBytesWritable, KeyValue>>> hbasePutsRDD = filedsRDD.mapToPair((PairFunction<String[], ImmutableBytesWritable, List<Tuple2<ImmutableBytesWritable, KeyValue>>>) line -> {
// row key: TODO optimize
List<Tuple2<ImmutableBytesWritable, KeyValue>> keyValueList = new ArrayList<>();
// row key = hash code of the first field followed by the current time in milliseconds
byte[] rowkey = Bytes.toBytes(line[0].hashCode() + String.valueOf(System.currentTimeMillis()));
ImmutableBytesWritable writable = new ImmutableBytesWritable(rowkey);
// "column name" is a placeholder qualifier; both lines below use the same literal
keyValueList.add(new Tuple2<>(writable, new KeyValue(rowkey, familyName, Bytes.toBytes("column name"), Bytes.toBytes(line[0]))));
keyValueList.add(new Tuple2<>(writable, new KeyValue(rowkey, familyName, Bytes.toBytes("column name"), Bytes.toBytes(line[1]))));
// other column
// TODO List to Cell and sort
return new Tuple2<>(writable, keyValueList);
// 5. store HFile under a timestamped temporary directory
String temp = "/tmp/hbase/" + TABLE_NAME + "_" + System.currentTimeMillis();
// NOTE(review): the value class passed here is List.class, matching the RDD's value type
hbasePutsRDD.saveAsNewAPIHadoopFile(temp, ImmutableBytesWritable.class, List.class, HFileOutputFormat2.class, hbaseBulkJob.getConfiguration());
LoadIncrementalHFiles bulkLoader = new LoadIncrementalHFiles(hbConf);
Admin admin = connection.getAdmin();
// 6. Bulk load to HBase
bulkLoader.doBulkLoad(new Path(temp), admin, table, regionLocator);
LOG.info("Done. Time elapsed {} seconds.", (System.currentTimeMillis() - startTime) / 1000);
package core
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.slf4j.LoggerFactory
import util.HdfsUtils
import scala.util.matching.Regex
* Date: 2019-05-05
* Time: 10:54
* Author: wh
* Version: V1.0.0
class BulkLoadToHBase {
object BulkLoad {
private val PARTTERN: Regex = """......""".r
private val LOG = LoggerFactory.getLogger(classOf[BulkLoadToHBase])
private val NUM_FIELDS: Int = 17
private val MODULUS: Int = 5
private final val NULL_FIELDS = ("-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-")
private final val COL_NAME = Array(......)
private val HdfsFilePath = HdfsUtils.HDFS_SCHEME + "......"
private val TABLE_NAME = "table name"
private val FAMILY_NAME = "infos".getBytes()
* 正则拆解输入的日志数据
* @param line logline
* @return
/**
 * Splits one raw log line into its 17 fields using the PARTTERN regex.
 *
 * @param line the raw log line
 * @return capture groups 1 to 17 of the first match. When the line does not match, or
 *         the pattern does not define exactly NUM_FIELDS groups, the error is logged
 *         and the NULL_FIELDS placeholder tuple is returned, so every code path
 *         yields a value of the declared tuple type.
 */
def logLineSplit(line: String): (String, String, String, String, String, String, String, String, String, String, String, String, String, String, String, String, String) = {
  PARTTERN.findFirstMatchIn(line) match {
    // no match at all
    case None =>
      LOG.error("Bad log, no options =============》")
      NULL_FIELDS
    // matched, but the pattern does not expose the expected number of groups
    case Some(m) if NUM_FIELDS != m.groupCount =>
      LOG.error("Bad length {} =============》", m.groupCount)
      NULL_FIELDS
    case Some(m) =>
      (m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7), m.group(8), m.group(9), m.group(10), m.group(11), m.group(12), m.group(13), m.group(14), m.group(15), m.group(16), m.group(17))
  }
}
/**
 * Prefixes a row key with a zero-padded salt derived from the key's hash code.
 *
 * @param key     the unsalted row key
 * @param modulus number of salt buckets; the salt is always in [0, modulus)
 * @return "<salt>:<key>", where the salt is left-padded with zeros to
 *         digitsRequired(modulus) characters
 */
def salt(key: String, modulus: Int): String = {
  // Take the remainder first and the absolute value second: Math.abs(Int.MinValue) is
  // still negative, which used to produce a salt such as "-3". For every other hash
  // code the result is identical to Math.abs(hashCode) % modulus.
  val saltAsInt = Math.abs(key.hashCode % modulus)
  // left pad with 0's (for readability of keys)
  val charsInSalt = digitsRequired(modulus)
  ("%0" + charsInSalt + "d").format(saltAsInt) + ":" + key
}

/**
 * Number of characters required to encode the largest salt (modulus - 1) in decimal
 * (01, 02, ... etc).
 *
 * Counts decimal digits directly instead of using log10, which evaluates log10(0)
 * for modulus == 1. For modulus >= 2 the result equals floor(log10(modulus - 1)) + 1.
 *
 * @param modulus number of salt buckets
 * @return the width of the salt prefix, at least 1
 */
def digitsRequired(modulus: Int): Int =
  math.max(modulus - 1, 0).toString.length
/**
 * A partitioner that puts data destined for the same HBase region together: the
 * partition index is the numeric salt prefix of the row key.
 *
 * @param modulus number of salt buckets, which is also the number of partitions
 */
class SaltPrefixPartitioner[K, V](modulus: Int) extends Partitioner {
  // Width of the zero-padded salt at the start of every row key.
  val charsInSalt: Int = digitsRequired(modulus)

  override def getPartition(key: Any): Int = {
    val saltPrefix = key match {
      // ImmutableBytesWritable.toString renders the bytes as a hex dump ("30 3a ..."),
      // so the salt must be decoded from the underlying bytes rather than from toString.
      case w: ImmutableBytesWritable => Bytes.toString(w.get(), w.getOffset, charsInSalt)
      case other => other.toString.substring(0, charsInSalt)
    }
    saltPrefix.toInt
  }

  override def numPartitions: Int = modulus
}
/**
 * Entry point of the KeyValue-based bulk-load job: cleans the HDFS log file with
 * Spark, expands every record into one (row key, (column, value)) pair per field,
 * turns those into KeyValue cells, repartitions and sorts them by salt prefix, and
 * configures an HFileOutputFormat2 job for TABLE_NAME.
 *
 * NOTE(review): in this excerpt the KeyValue constructor call is truncated, and the
 * HFile write and bulk-load steps are commented out, so the visible code only
 * prepares the data and the job configuration.
 *
 * @param args command-line arguments (not read by the visible code)
 */
def main(args: Array[String]): Unit = {
System.setProperty("user.name", "hdfs")
System.setProperty("HADOOP_USER_NAME", "hdfs")
val startTime: Long = System.currentTimeMillis()
// 1. data clean by Spark
val sparkConf = new SparkConf().setAppName("Log clean to HBase").setMaster("local")
// Kryo serializer; the notes in this file list it next to the
// "object not serializable (ImmutableBytesWritable)" problem
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
var logRDD = sc.textFile(HdfsFilePath, 24)
val splitRDD = logRDD.map(line => logLineSplit(line))
// prepare for KeyValue: one (rowKey, (column name, column value)) pair per field
val beforeCellsRDD = splitRDD.flatMap(x => {
// the salted first field is the row key for every cell of this record
val rowKey = salt(x._1, MODULUS)
for (i <- 0 until NUM_FIELDS) yield {
val colName = COL_NAME(i)
val colValue = x.productElement(i)
(rowKey, (colName, colValue))
// cells of data for HBase
val cellsRDD = beforeCellsRDD.map(x => {
val rowKey = x._1.getBytes()
val kv = new KeyValue(
(new ImmutableBytesWritable(rowKey), kv)
// repartition and sort the data - HFiles want sorted data
val partitionedRDD = cellsRDD.repartitionAndSortWithinPartitions(new SaltPrefixPartitioner(MODULUS))
// 2. HBase MapReduce Bulk Load Job
val hbConf = HBaseConfiguration.create()
hbConf.set("hbase.zookeeper.quorum", "node5,node6,node7,node8")
hbConf.set("hbase.zookeeper.property.clientPort", "2181")
hbConf.set("hbase.mapreduce.hfileoutputformat.table.name", TABLE_NAME)
// NOTE(review): connection and table are not closed by any visible line
val connection = ConnectionFactory.createConnection(hbConf)
val tableName = TableName.valueOf(TABLE_NAME)
val table = connection.getTable(tableName)
val hbaseBulkJob = Job.getInstance(hbConf, "HFile Bulk Load Job")
// 3. HFile config, Bulk load to HBase directly
val regionLocator = connection.getRegionLocator(tableName)
HFileOutputFormat2.configureIncrementalLoad(hbaseBulkJob, table.getDescriptor, regionLocator)
// hbaseBulkJob.setJarByClass(classOf[JavaCleanToHBase])
// HFiles would be written to a timestamped directory under /tmp/hbase
val hfileOutPath = "/tmp/hbase/" + TABLE_NAME + "_" + System.currentTimeMillis()
hbaseBulkJob.getConfiguration.set("mapred.output.dir", hfileOutPath)
// partitionedRDD.saveAsNewAPIHadoopFile(
// hfileOutPath,
// classOf[ImmutableBytesWritable],
// classOf[Put],
// classOf[HFileOutputFormat2],
// hbaseBulkJob.getConfiguration)
// 4. Bulk load to HBase
// val bulkLoader = new LoadIncrementalHFiles(hbConf)
// val admin = connection.getAdmin
// bulkLoader.doBulkLoad(new Path(hfileOutPath), admin, table, regionLocator)
LOG.info("Done. Time elapsed {} seconds.", (System.currentTimeMillis - startTime) / 1000)
package core
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.slf4j.LoggerFactory
import util.HdfsUtils
import scala.util.matching.Regex
* Date: 2019-05-06
* Time: 15:33
* Author: wh
* Version: V1.0.0
class BulkLoadToHBasePut {
object BulkLoadPut {
private val PARTTERN: Regex = """......""".r
// Logger for this job, named after this file's own class (BulkLoadToHBasePut) so that
// its log lines are not attributed to the KeyValue-based BulkLoadToHBase job.
private val LOG = LoggerFactory.getLogger(classOf[BulkLoadToHBasePut])
private val NUM_FIELDS: Int = 17
private val MODULUS: Int = 5
private final val NULL_FIELDS = ("-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-")
private final val COL_NAME = Array(......)
private val HdfsFilePath = HdfsUtils.HDFS_SCHEME + "......"
private val TABLE_NAME = "table name"
private val FAMILY_NAME = "infos".getBytes()
* 正则拆解输入的日志数据
* @param line logline
* @return
/**
 * Splits one raw log line into its 17 fields using the PARTTERN regex.
 *
 * @param line the raw log line
 * @return capture groups 1 to 17 of the first match. When the line does not match, or
 *         the pattern does not define exactly NUM_FIELDS groups, the error is logged
 *         and the NULL_FIELDS placeholder tuple is returned, so every code path
 *         yields a value of the declared tuple type.
 */
def logLineSplit(line: String): (String, String, String, String, String, String, String, String, String, String, String, String, String, String, String, String, String) = {
  PARTTERN.findFirstMatchIn(line) match {
    // no match at all
    case None =>
      LOG.error("Bad log, no options =============》")
      NULL_FIELDS
    // matched, but the pattern does not expose the expected number of groups
    case Some(m) if NUM_FIELDS != m.groupCount =>
      LOG.error("Bad length {} =============》", m.groupCount)
      NULL_FIELDS
    case Some(m) =>
      (m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7), m.group(8), m.group(9), m.group(10), m.group(11), m.group(12), m.group(13), m.group(14), m.group(15), m.group(16), m.group(17))
  }
}
/**
 * Prefixes a row key with a zero-padded salt derived from the key's hash code and
 * appends a short time-based suffix so that repeated keys yield different row keys.
 *
 * @param key     the unsalted row key
 * @param modulus number of salt buckets; the salt is always in [0, modulus)
 * @return "<salt>:<key>:<suffix>", where the salt is left-padded with zeros to
 *         digitsRequired(modulus) characters and the suffix is taken from
 *         System.nanoTime()
 */
def salt(key: String, modulus: Int): String = {
  // Take the remainder first and the absolute value second: Math.abs(Int.MinValue) is
  // still negative, which used to produce a salt such as "-3". For every other hash
  // code the result is identical to Math.abs(hashCode) % modulus.
  val saltAsInt = Math.abs(key.hashCode % modulus)
  // left pad with 0's (for readability of keys)
  val charsInSalt = digitsRequired(modulus)
  // nanoTime has an arbitrary origin, so its decimal form may be shorter than 13
  // characters; fall back to the trailing digits instead of throwing in substring.
  val nanos = System.nanoTime().toString
  val suffix = if (nanos.length >= 13) nanos.substring(8, 13) else nanos.takeRight(5)
  ("%0" + charsInSalt + "d").format(saltAsInt) + ":" + key + ":" + suffix
}

/**
 * Number of characters required to encode the largest salt (modulus - 1) in decimal
 * (01, 02, ... etc).
 *
 * Counts decimal digits directly instead of using log10, which evaluates log10(0)
 * for modulus == 1. For modulus >= 2 the result equals floor(log10(modulus - 1)) + 1.
 *
 * @param modulus number of salt buckets
 * @return the width of the salt prefix, at least 1
 */
def digitsRequired(modulus: Int): Int =
  math.max(modulus - 1, 0).toString.length
/**
 * A partitioner that puts data destined for the same HBase region together: the
 * partition index is the numeric salt prefix of the row key.
 *
 * @param modulus number of salt buckets, which is also the number of partitions
 */
class SaltPrefixPartitioner[K, V](modulus: Int) extends Partitioner {
  // Width of the zero-padded salt at the start of every row key.
  val charsInSalt: Int = digitsRequired(modulus)

  override def getPartition(key: Any): Int = {
    val saltPrefix = key match {
      // ImmutableBytesWritable.toString renders the bytes as a hex dump ("30 3a ..."),
      // so the salt must be decoded from the underlying bytes rather than from toString.
      case w: ImmutableBytesWritable => Bytes.toString(w.get(), w.getOffset, charsInSalt)
      case other => other.toString.substring(0, charsInSalt)
    }
    saltPrefix.toInt
  }

  override def numPartitions: Int = modulus
}
/**
 * Entry point of the Put-based bulk-load job: cleans the HDFS log file with Spark,
 * builds one Put per record (one column per field), repartitions and sorts the Puts
 * by salt prefix, and prepares a Hadoop Job whose output directory is a timestamped
 * path under /tmp/hbase.
 *
 * NOTE(review): the HFile write and bulk-load steps are commented out, and the
 * commented bulk-load code refers to hbConf, connection, table and regionLocator,
 * none of which are defined in this method.
 *
 * @param args command-line arguments (not read by the visible code)
 */
def main(args: Array[String]): Unit = {
System.setProperty("user.name", "hdfs")
System.setProperty("HADOOP_USER_NAME", "hdfs")
val startTime: Long = System.currentTimeMillis()
// 1. data clean by Spark
val sparkConf = new SparkConf().setAppName("Log clean to HBase").setMaster("local")
// Kryo serializer; the notes in this file list it next to the
// "object not serializable (ImmutableBytesWritable)" problem
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
var logRDD = sc.textFile(HdfsFilePath, 24)
val splitRDD = logRDD.map(line => logLineSplit(line))
// one Put per record: the salted first field is the row key, every field becomes a column
val putsRDD = splitRDD.map{ line =>
val rowKey = salt(line._1, MODULUS).getBytes()
val put = new Put(rowKey)
for (i <- 0 until NUM_FIELDS) {
val colName = COL_NAME(i).getBytes()
val colValue = line.productElement(i).toString.getBytes()
put.addColumn(FAMILY_NAME, colName, colValue)
(new ImmutableBytesWritable(rowKey), put)
// partition by salt prefix and sort by row key within each partition
val partitionedRDD = putsRDD.repartitionAndSortWithinPartitions(new SaltPrefixPartitioner(MODULUS))
// 2. HBase MapReduce Bulk Load Job
sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "node5,node6,node7,node8")
sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, TABLE_NAME)
val hbaseBulkJob = Job.getInstance(sc.hadoopConfiguration)
val hfileOutPath = "/tmp/hbase/" + TABLE_NAME + "_" + System.currentTimeMillis()
hbaseBulkJob.getConfiguration.set("mapred.output.dir", hfileOutPath)
// partitionedRDD.saveAsNewAPIHadoopFile(
// hfileOutPath,
// classOf[ImmutableBytesWritable],
// classOf[Put],
// classOf[HFileOutputFormat2],
// hbaseBulkJob.getConfiguration)
// 4. Bulk load to HBase
// val bulkLoader = new LoadIncrementalHFiles(hbConf)
// val admin = connection.getAdmin
// bulkLoader.doBulkLoad(new Path(hfileOutPath), admin, table, regionLocator)
LOG.info("Done. Time elapsed {} seconds.", (System.currentTimeMillis - startTime) / 1000)
频繁的 region split 会消耗宝贵的集群 I/O 资源,因此需要预先估算分区数量:
// A partitioner that puts data destined for the same HBase region together
class SaltPrefixPartitioner[K,V](modulus: Int) extends Partitioner {
val charsInSalt: Int = digitsRequired(modulus)
override def getPartition(key: Any): Int = {
override def numPartitions: Int = modulus
hbConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE_NAME)
问题:Added a key not lexically larger than previous
问题:object not serializable (class: org.apache.hadoop.hbase.io.ImmutableBytesWritable)
val sparkConf = new SparkConf().setAppName("Log clean to HBase").setMaster("local")
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
Added a key not lexically larger than previous,查看日志,发觉重复写入数据
BulkLoad 1
BulkLoad 2
BulkLoad 3
BulkLoad 4
3 Steps for Bulk Loading 1M Records in 20 Seconds Into Apache Phoenix
Efficient bulk load of HBase using Spark
BulkLoad 代码借鉴
HBase clientAPI基本操作
object not serializable (class: org.apache.hadoop.hbase.io.ImmutableBytesWritable)