HBase多表 bulkLoad

在 HBase 中快速导入大量数据可以使用 bulkload, 但官方只提供了单表 load 的相关案例. 查看源码后发现, MultiTableHFileOutputFormat 可以支持多表 bulkload.

package com.cc.example;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.mapreduce.*;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MapReduceExtendedCell;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;
import java.util.UUID;
import java.util.function.Function;

public class MultiTableHBaseBulkLoad {

    static class TableInfo {
        private TableDescriptor tableDesctiptor;
        private RegionLocator regionLocator;

        public TableInfo(TableDescriptor tableDesctiptor, RegionLocator regionLocator) {
            this.tableDesctiptor = tableDesctiptor;
            this.regionLocator = regionLocator;

         * The modification for the returned HTD doesn't affect the inner TD.
         * @return A clone of inner table descriptor
         * @see #getTableDescriptor()
         * @see HBASE-18241
         * @deprecated since 2.0.0 and will be removed in 3.0.0. Use {@link #getTableDescriptor()}
         * instead.
        public HTableDescriptor getHTableDescriptor() {
            return new HTableDescriptor(tableDesctiptor);

        public TableDescriptor getTableDescriptor() {
            return tableDesctiptor;

        public RegionLocator getRegionLocator() {
            return regionLocator;

    private static final Logger LOG = LoggerFactory.getLogger(MultiTableHBaseBulkLoad.class);

    public static final String LOCALITY_SENSITIVE_CONF_KEY =
    private static final boolean DEFAULT_LOCALITY_SENSITIVE = true;
    static final String OUTPUT_TABLE_NAME_CONF_KEY =

    // The following constants are private since these are used by
    // HFileOutputFormat2 to internally transfer data between job setup and
    // reducer run using conf.
    // These should not be changed by the client.
    static final String COMPRESSION_FAMILIES_CONF_KEY =
    static final String BLOOM_TYPE_FAMILIES_CONF_KEY =
    static final String BLOOM_PARAM_FAMILIES_CONF_KEY =
    static final String BLOCK_SIZE_FAMILIES_CONF_KEY =

    protected static final byte[] tableSeparator = Bytes.toBytes(";");

    static class HFileImportMapper2 extends Mapper {

        protected final String CF_KQ = "cf";

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String lineText = value.toString();
            String[] splitData = lineText.split(",");
            String tbName = splitData[0];
            String row = System.currentTimeMillis() + "_" + splitData[1];
            ImmutableBytesWritable rowkey = new ImmutableBytesWritable(Bytes.toBytes(tbName));
            KeyValue kv = new KeyValue(Bytes.toBytes(row), this.CF_KQ.getBytes(), splitData[2].getBytes(), splitData[3].getBytes());
            context.write(rowkey, kv);

    static class CellSortReducer2 extends Reducer {

        protected void reduce(ImmutableBytesWritable row, Iterable values, Context context) throws IOException, InterruptedException {
            TreeSet map = new TreeSet<>(CellComparator.getInstance());
            for (KeyValue kv : values) {
                try {
                } catch (CloneNotSupportedException e) {
                    throw new IOException(e);
            context.setStatus("Read " + map.getClass());
            int index = 0;
            for (KeyValue kv : map) {
                context.write(row, kv);
                if (++index % 100 == 0) {
                    context.setStatus("Wrote " + index);

    public static void main(String[] args) {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.fs.tmp.dir", "partitions_" + UUID.randomUUID());
        String input = "hdfs://nameservice2/tmp/person.txt";
        String output = "hdfs://nameservice2/tmp/pres";
        try {
            try {
                FileSystem fs = FileSystem.get(URI.create(output), conf);
                fs.delete(new Path(output), true);
            } catch (IOException e1) {

            Job job = Job.getInstance(conf);
            job.setJobName("HBase HFile");



            FileInputFormat.setInputPaths(job, input);
            FileOutputFormat.setOutputPath(job, new Path(output));

            TableName[] tableNames = new TableName[]{TableName.valueOf("default:t1"), TableName.valueOf("default:t2")};
            try (Connection conn = ConnectionFactory.createConnection(conf)) {
                List tableInfoList = new ArrayList<>();
                for (TableName tableName : tableNames) {
                    Table table = conn.getTable(tableName);
                    RegionLocator regionLocator = conn.getRegionLocator(tableName);
                    tableInfoList.add(new TableInfo(table.getDescriptor(), regionLocator));
                configureIncrementalLoad(job, tableInfoList, MultiTableHFileOutputFormat.class);

            boolean b = job.waitForCompletion(true);
            if (b) {
                for (TableName table : tableNames) {
                    org.apache.hadoop.hbase.tool.LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
                    Path tableOutputPath = new Path(new Path(output), table.getNameWithNamespaceInclAsString().replace(":", "/"));
                    org.apache.hadoop.hbase.client.Connection hbaseConn = ConnectionFactory.createConnection(conf);
                    Table htable = hbaseConn.getTable(TableName.valueOf(table.getNameWithNamespaceInclAsString()));
                    LOG.info("Loading HFiles for {} from {}", table.getNameWithNamespaceInclAsString(), tableOutputPath);
                    loader.doBulkLoad(tableOutputPath, hbaseConn.getAdmin(), htable, hbaseConn.getRegionLocator(TableName.valueOf(table.getNameWithNamespaceInclAsString())));
                    LOG.info("Incremental load complete for table=" + table.getNameWithNamespaceInclAsString());
        } catch (Exception e) {

    static void configureIncrementalLoad(Job job, List multiTableInfo,
                                         Class> cls) throws IOException {
        Configuration conf = job.getConfiguration();


        if (multiTableInfo.stream().distinct().count() != multiTableInfo.size()) {
            throw new IllegalArgumentException("Duplicate entries found in TableInfo argument");
        boolean writeMultipleTables = false;
        if (MultiTableHFileOutputFormat.class.equals(cls)) {
            writeMultipleTables = true;
            conf.setBoolean(MULTI_TABLE_HFILEOUTPUTFORMAT_CONF_KEY, true);
        // Based on the configured map output class, set the correct reducer to properly
        // sort the incoming values.
        // TODO it would be nice to pick one or the other of these formats.
        if (KeyValue.class.equals(job.getMapOutputValueClass())
                || MapReduceExtendedCell.class.equals(job.getMapOutputValueClass())) {
        } else if (Put.class.equals(job.getMapOutputValueClass())) {
        } else if (Text.class.equals(job.getMapOutputValueClass())) {
        } else {
            LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());

        conf.setStrings("io.serializations", conf.get("io.serializations"),
                MutationSerialization.class.getName(), ResultSerialization.class.getName(),

            LOG.info("bulkload locality sensitive enabled");

        /* Now get the region start keys for every table required */
        List allTableNames = new ArrayList<>(multiTableInfo.size());
        List regionLocators = new ArrayList<>(multiTableInfo.size());
        List tableDescriptors = new ArrayList<>(multiTableInfo.size());

        for (TableInfo tableInfo : multiTableInfo) {
            String tn = writeMultipleTables ?
                    tableInfo.getRegionLocator().getName().getNameWithNamespaceInclAsString() :
        // Record tablenames for creating writer by favored nodes, and decoding compression, block size and other attributes of columnfamily per table
        conf.set(OUTPUT_TABLE_NAME_CONF_KEY, StringUtils.join(allTableNames, Bytes
        List startKeys = getRegionStartKeys(regionLocators, writeMultipleTables);
        // Use table's region boundaries for TOP split points.
        LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
                "to match current region count for all tables");

        configurePartitioner(job, startKeys, writeMultipleTables);
        // Set compression algorithms based on column families

        conf.set(COMPRESSION_FAMILIES_CONF_KEY, serializeColumnFamilyAttribute(compressionDetails,
        conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY, serializeColumnFamilyAttribute(blockSizeDetails,
        conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY, serializeColumnFamilyAttribute(bloomTypeDetails,
        conf.set(BLOOM_PARAM_FAMILIES_CONF_KEY, serializeColumnFamilyAttribute(bloomParamDetails,
                serializeColumnFamilyAttribute(dataBlockEncodingDetails, tableDescriptors));

        LOG.info("Incremental output configured for tables: " + StringUtils.join(allTableNames, ","));

     * Configure job with a TotalOrderPartitioner, partitioning against
     * splitPoints. Cleans up the partitions file after job exists.
    static void configurePartitioner(Job job, List splitPoints, boolean
            throws IOException {
        Configuration conf = job.getConfiguration();
        // create the partitions file
        FileSystem fs = FileSystem.get(conf);
        String hbaseTmpFsDir =
                        fs.getHomeDirectory() + "/hbase-staging");
        Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
        writePartitions(conf, partitionsPath, splitPoints, writeMultipleTables);

        // configure job to use it
        TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);

     * Write out a {@link SequenceFile} that can be read by
     * {@link TotalOrderPartitioner} that contains the split points in startKeys.
    private static void writePartitions(Configuration conf, Path partitionsPath,
                                        List startKeys, boolean writeMultipleTables) throws IOException {
        LOG.info("Writing partition information to " + partitionsPath);
        if (startKeys.isEmpty()) {
            throw new IllegalArgumentException("No regions passed");

        // We're generating a list of split points, and we don't ever
        // have keys < the first region (which has an empty start key)
        // so we need to remove it. Otherwise we would end up with an
        // empty reducer with index 0
        TreeSet sorted = new TreeSet<>(startKeys);
        ImmutableBytesWritable first = sorted.first();
        if (writeMultipleTables) {
            first = new ImmutableBytesWritable(getSuffix(sorted.first
        if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
            throw new IllegalArgumentException(
                    "First region of table should have empty start key. Instead has: "
                            + Bytes.toStringBinary(first.get()));

        // Write the actual file
        FileSystem fs = partitionsPath.getFileSystem(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(
                fs, conf, partitionsPath, ImmutableBytesWritable.class,

        try {
            for (ImmutableBytesWritable startKey : sorted) {
                writer.append(startKey, NullWritable.get());
        } finally {

    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
    static String serializeColumnFamilyAttribute(Function fn, List allTables)
            throws UnsupportedEncodingException {
        StringBuilder attributeValue = new StringBuilder();
        int i = 0;
        for (TableDescriptor tableDescriptor : allTables) {
            if (tableDescriptor == null) {
                // could happen with mock table instance
                // CODEREVIEW: Can I set an empty string in conf if mock table instance?
                return "";
            for (ColumnFamilyDescriptor familyDescriptor : tableDescriptor.getColumnFamilies()) {
                if (i++ > 0) {
                        Bytes.toString(combineTableNameSuffix(tableDescriptor.getTableName().getName(), familyDescriptor.getName())),
                attributeValue.append(URLEncoder.encode(fn.apply(familyDescriptor), "UTF-8"));
        // Get rid of the last ampersand
        return attributeValue.toString();

     * Return the start keys of all of the regions in this table,
     * as a list of ImmutableBytesWritable.
    private static List getRegionStartKeys(List regionLocators,
                                                                   boolean writeMultipleTables)
            throws IOException {

        ArrayList ret = new ArrayList<>();
        for (RegionLocator regionLocator : regionLocators) {
            TableName tableName = regionLocator.getName();
            LOG.info("Looking up current regions for table " + tableName);
            byte[][] byteKeys = regionLocator.getStartKeys();
            for (byte[] byteKey : byteKeys) {
                byte[] fullKey = byteKey; //HFileOutputFormat2 use case
                if (writeMultipleTables) {
                    //MultiTableHFileOutputFormat use case
                    fullKey = combineTableNameSuffix(tableName.getName(), byteKey);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("SplitPoint startkey for table [" + tableName + "]: [" + Bytes.toStringBinary
                            (fullKey) + "]");
                ret.add(new ImmutableBytesWritable(fullKey));
        return ret;

    static Function compressionDetails = familyDescriptor ->

     * Serialize column family to block size map to configuration. Invoked while
     * configuring the MR job for incremental load.
    static Function blockSizeDetails = familyDescriptor -> String
     * Serialize column family to bloom type map to configuration. Invoked while
     * configuring the MR job for incremental load.
    static Function bloomTypeDetails = familyDescriptor -> {
        String bloomType = familyDescriptor.getBloomFilterType().toString();
        if (bloomType == null) {
            bloomType = ColumnFamilyDescriptorBuilder.DEFAULT_BLOOMFILTER.name();
        return bloomType;

     * Serialize column family to bloom param map to configuration. Invoked while
     * configuring the MR job for incremental load.
     * @param tableDescriptor
     * to read the properties from
     * @param conf
     * to persist serialized values into
     * @throws IOException
     * on failure to read column family descriptors
    static Function bloomParamDetails = familyDescriptor -> {
        BloomType bloomType = familyDescriptor.getBloomFilterType();
        String bloomParam = "";
        if (bloomType == BloomType.ROWPREFIX_FIXED_LENGTH) {
            bloomParam = familyDescriptor.getConfigurationValue(BloomFilterUtil.PREFIX_LENGTH_KEY);
        } else if (bloomType == BloomType.ROWPREFIX_DELIMITED) {
            bloomParam = familyDescriptor.getConfigurationValue(BloomFilterUtil.DELIMITER_KEY);
        return bloomParam;

    protected static byte[] combineTableNameSuffix(byte[] tableName, byte[] suffix) {
        return Bytes.add(tableName, tableSeparator, suffix);

     * Serialize column family to data block encoding map to configuration.
     * Invoked while configuring the MR job for incremental load.
    static Function dataBlockEncodingDetails = familyDescriptor -> {
        DataBlockEncoding encoding = familyDescriptor.getDataBlockEncoding();
        if (encoding == null) {
            encoding = DataBlockEncoding.NONE;
        return encoding.toString();

    final private static int validateCompositeKey(byte[] keyBytes) {

        int separatorIdx = Bytes.indexOf(keyBytes, tableSeparator);

        // Either the separator was not found or a tablename wasn't present or a key wasn't present
        if (separatorIdx == -1) {
            throw new IllegalArgumentException("Invalid format for composite key [" + Bytes
                    .toStringBinary(keyBytes) + "]. Cannot extract tablename and suffix from key");
        return separatorIdx;

    protected static byte[] getSuffix(byte[] keyBytes) {
        int separatorIdx = validateCompositeKey(keyBytes);
        return Bytes.copy(keyBytes, separatorIdx + 1, keyBytes.length - separatorIdx - 1);

因为运行环境是 CDH 6.3.2, 提交 MapReduce 作业之前需要先设置下面的环境变量.

# CDH 中执行 MR 作业需要的环境变量及提交命令
1. export HBASE_HOME=/opt/cloudera/parcels/CDH/lib/hbase
2. export HADOOP_CLASSPATH=$HBASE_HOME/lib/*:classpath:/opt/phoenix-bulkload/hbase-conf
3. hadoop jar hbase-multitable-1.0.0.jar com.cc.example.MultiTableHBaseBulkLoad




create 't1','cf'
create 't2','cf'
