自定义Java MapReduce操作HBase数据库导入数据的两种方式

 自定义实现将一张表中的数据读出,处理后存入到另外一张表中

以下操作需要预先在HBase中创建目标表



import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Map-only MapReduce job that reads every row of the HBase table {@code ns1:orders},
 * re-keys it, and writes it as a {@link Put} into the HBase table
 * {@code orders:history_orders88}.
 *
 * <p>The target table is not created by this job; it has to exist beforehand, and all
 * cells are written into the column family {@code order}.
 *
 * <p>New row key layout: {@code reverse(user_id) + "_" + date + "_" + orderId}, where
 * {@code orderId} is the row key of the source row.
 */
public class F_SaleOrdersMapReducer extends Configured implements Tool {
    private static final String ORDERS_TABLE_NAME = "ns1:orders";
    private static final String HISTORY_ORDERS_TABLE_NAME = "orders:history_orders88";

    /**
     * Converts each source row into a {@link Put} for the history table.
     *
     * <p>The type arguments are required: with a raw {@code TableMapper} the
     * {@code map(ImmutableBytesWritable, Result, Context)} method does not override
     * anything and the class fails to compile.
     */
    static class ReadOrderMapper extends TableMapper<ImmutableBytesWritable, Put> {
        private static final String ORDER_COLUMN_NAME_USER_ID = "user_id";
        private static final String ORDER_COLUMN_NAME_ORDER_ID = "order_id";
        private static final String ORDER_COLUMN_NAME_DATE = "date";
        private static final String HISTORY_ROW_KEY_SEPARATOR = "_";
        private static final byte[] HISTORY_COLUMN_FAMILY = Bytes.toBytes("order");

        /** Job counter that records source rows dropped because a row-key field is missing. */
        private static final String COUNTER_GROUP = "SaleOrders";
        private static final String COUNTER_SKIPPED_ROWS = "ROWS_SKIPPED_MISSING_KEY_FIELD";

        /** Reused output key holder, so no new writable is allocated per record. */
        private final ImmutableBytesWritable mapOutput = new ImmutableBytesWritable();

        /**
         * Emits one {@code (newRowKey, Put)} pair per source row.
         *
         * @param key     row key of the source row (the order id)
         * @param value   all cells returned by the scan for that row
         * @param context task context used to emit the pair and to update counters
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            Put put = resultToPut(key, value);
            if (put == null) {
                // A row without user_id/date cannot get a meaningful row key; count it
                // instead of writing a bogus "llun_null_..." key into the target table.
                context.getCounter(COUNTER_GROUP, COUNTER_SKIPPED_ROWS).increment(1L);
                return;
            }

            // The output key is the row key of the Put that was just built.
            mapOutput.set(put.getRow());
            context.write(mapOutput, put);
        }

        /**
         * Builds the {@link Put} for the history table from one source row.
         *
         * @param key    row key of the source row, interpreted as the order id
         * @param result cells of the source row
         * @return the Put to write, or {@code null} when the row has no {@code user_id}
         *         or no {@code date} cell and therefore no valid row key can be built
         */
        private Put resultToPut(ImmutableBytesWritable key, Result result) {
            // Honour offset/length of the writable instead of decoding the whole backing array.
            String orderId = Bytes.toString(key.get(), key.getOffset(), key.getLength());

            // qualifier -> value, both decoded as strings. Cells that share a qualifier
            // (e.g. in different column families) collapse into one entry.
            Map<String, String> orderMap = new HashMap<>();
            for (Cell cell : result.rawCells()) {
                String field = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                orderMap.put(field, value);
            }

            String userId = orderMap.get(ORDER_COLUMN_NAME_USER_ID);
            String orderDate = orderMap.get(ORDER_COLUMN_NAME_DATE);
            if (userId == null || orderDate == null) {
                return null;
            }

            // Row key: reverse(userId) + "_" + date + "_" + orderId.
            // Only the user id is reversed; it is reversed before anything else is appended.
            String rowKey = new StringBuilder(userId).reverse()
                    .append(HISTORY_ROW_KEY_SEPARATOR)
                    .append(orderDate)
                    .append(HISTORY_ROW_KEY_SEPARATOR)
                    .append(orderId)
                    .toString();

            Put put = new Put(Bytes.toBytes(rowKey));
            for (Map.Entry<String, String> entry : orderMap.entrySet()) {
                put.addColumn(
                        HISTORY_COLUMN_FAMILY,
                        Bytes.toBytes(entry.getKey()),
                        Bytes.toBytes(entry.getValue()));
            }

            // The source row key is also stored as an ordinary column.
            put.addColumn(
                    HISTORY_COLUMN_FAMILY,
                    Bytes.toBytes(ORDER_COLUMN_NAME_ORDER_ID),
                    Bytes.toBytes(orderId));

            return put;
        }
    }

    /**
     * Configures and runs the job: table input -> mapper -> table output, no reducers.
     *
     * @param args command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, F_SaleOrdersMapReducer.class.getName());
        job.setJarByClass(F_SaleOrdersMapReducer.class);

        Scan scan = new Scan();
        // Fetch rows in batches of 500 per RPC; a small caching value is slow for MR scans.
        scan.setCaching(500);
        // A full-table MR scan should not fill the region server block cache.
        scan.setCacheBlocks(false);

        // Input table and mapper.
        TableMapReduceUtil.initTableMapperJob(
                ORDERS_TABLE_NAME,             // input HBase table name
                scan,                          // scan controlling what is read
                ReadOrderMapper.class,         // mapper
                ImmutableBytesWritable.class,  // mapper output key: new row key
                Put.class,                     // mapper output value: row content
                job);

        // Output table; no reducer class, the mapper output is written directly.
        TableMapReduceUtil.initTableReducerJob(
                HISTORY_ORDERS_TABLE_NAME,     // output table
                null,                          // reducer class
                job);
        job.setNumReduceTasks(0);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Entry point: runs the tool with the HBase configuration found on the classpath and
     * exits with the job status, or with 1 if running the job threw an exception.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     */
    public static void main(String[] args) {
        Configuration conf = HBaseConfiguration.create();
        int status;
        try {
            status = ToolRunner.run(conf, new F_SaleOrdersMapReducer(), args);
        } catch (Exception e) {
            e.printStackTrace();
            // A failed run must not look like a success to the calling shell/scheduler.
            status = 1;
        }
        System.exit(status);
    }
}

自定义MapReduce读出HBase中的数据,先将其以HFile格式存放到指定的HDFS目录下,再将这些HFile批量加载(bulk load)到HBase中的另一张表中

以下操作需要预先在HBase中创建目标表

并在运行时通过第一个命令行参数传入暂时存放HFile数据的HDFS目录



import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * MapReduce job that reads every row of the HBase table {@code ns1:orders}, re-keys it,
 * writes the result as HFiles into an HDFS directory and then bulk-loads those HFiles
 * into the HBase table {@code orders:history_orders89}.
 *
 * <p>The target table is not created by this job; it has to exist beforehand, and all
 * cells are written into the column family {@code order}.
 *
 * <p>New row key layout: {@code reverse(user_id) + "_" + date + "_" + orderId}, where
 * {@code orderId} is the row key of the source row.
 *
 * <p>Usage: the first command-line argument is the HDFS path prefix for the HFile output;
 * the current time in milliseconds is appended to it to form the output directory.
 */
public class G_SaleOrdersMapReducer extends Configured implements Tool {
    private static final String ORDERS_TABLE_NAME = "ns1:orders";
    private static final String HISTORY_ORDERS_TABLE_NAME = "orders:history_orders89";

    /**
     * Converts each source row into a {@link Put} for the history table.
     *
     * <p>The type arguments are required: with a raw {@code TableMapper} the
     * {@code map(ImmutableBytesWritable, Result, Context)} method does not override
     * anything and the class fails to compile.
     */
    static class ReadOrderMapper extends TableMapper<ImmutableBytesWritable, Put> {
        private static final String ORDER_COLUMN_NAME_USER_ID = "user_id";
        private static final String ORDER_COLUMN_NAME_ORDER_ID = "order_id";
        private static final String ORDER_COLUMN_NAME_DATE = "date";
        private static final String HISTORY_ROW_KEY_SEPARATOR = "_";
        private static final byte[] HISTORY_COLUMN_FAMILY = Bytes.toBytes("order");

        /** Job counter that records source rows dropped because a row-key field is missing. */
        private static final String COUNTER_GROUP = "SaleOrders";
        private static final String COUNTER_SKIPPED_ROWS = "ROWS_SKIPPED_MISSING_KEY_FIELD";

        /** Reused output key holder, so no new writable is allocated per record. */
        private final ImmutableBytesWritable mapOutput = new ImmutableBytesWritable();

        /**
         * Emits one {@code (newRowKey, Put)} pair per source row.
         *
         * @param key     row key of the source row (the order id)
         * @param value   all cells returned by the scan for that row
         * @param context task context used to emit the pair and to update counters
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            Put put = resultToPut(key, value);
            if (put == null) {
                // A row without user_id/date cannot get a meaningful row key; count it
                // instead of writing a bogus "llun_null_..." key into the target table.
                context.getCounter(COUNTER_GROUP, COUNTER_SKIPPED_ROWS).increment(1L);
                return;
            }

            // The output key is the row key of the Put that was just built.
            mapOutput.set(put.getRow());
            context.write(mapOutput, put);
        }

        /**
         * Builds the {@link Put} for the history table from one source row.
         *
         * @param key    row key of the source row, interpreted as the order id
         * @param result cells of the source row
         * @return the Put to write, or {@code null} when the row has no {@code user_id}
         *         or no {@code date} cell and therefore no valid row key can be built
         */
        private Put resultToPut(ImmutableBytesWritable key, Result result) {
            // Honour offset/length of the writable instead of decoding the whole backing array.
            String orderId = Bytes.toString(key.get(), key.getOffset(), key.getLength());

            // qualifier -> value, both decoded as strings. Cells that share a qualifier
            // (e.g. in different column families) collapse into one entry.
            Map<String, String> orderMap = new HashMap<>();
            for (Cell cell : result.rawCells()) {
                String field = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                orderMap.put(field, value);
            }

            String userId = orderMap.get(ORDER_COLUMN_NAME_USER_ID);
            String orderDate = orderMap.get(ORDER_COLUMN_NAME_DATE);
            if (userId == null || orderDate == null) {
                return null;
            }

            // Row key: reverse(userId) + "_" + date + "_" + orderId.
            // Only the user id is reversed; it is reversed before anything else is appended.
            String rowKey = new StringBuilder(userId).reverse()
                    .append(HISTORY_ROW_KEY_SEPARATOR)
                    .append(orderDate)
                    .append(HISTORY_ROW_KEY_SEPARATOR)
                    .append(orderId)
                    .toString();

            Put put = new Put(Bytes.toBytes(rowKey));
            for (Map.Entry<String, String> entry : orderMap.entrySet()) {
                put.addColumn(
                        HISTORY_COLUMN_FAMILY,
                        Bytes.toBytes(entry.getKey()),
                        Bytes.toBytes(entry.getValue()));
            }

            // The source row key is also stored as an ordinary column.
            put.addColumn(
                    HISTORY_COLUMN_FAMILY,
                    Bytes.toBytes(ORDER_COLUMN_NAME_ORDER_ID),
                    Bytes.toBytes(orderId));

            return put;
        }
    }

    /**
     * Configures and runs the job (table input -> mapper -> HFiles on HDFS) and, if the
     * job succeeds, bulk-loads the generated HFiles into the target table.
     *
     * @param args {@code args[0]} is the HDFS path prefix for the HFile output directory
     * @return 0 if the job succeeded, 1 if it failed, 2 if the output path argument is missing
     * @throws Exception if the job cannot be configured or submitted, or the bulk load fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 1) {
            System.err.println("Usage: " + G_SaleOrdersMapReducer.class.getSimpleName()
                    + " <hdfs-output-path-prefix>");
            return 2;
        }

        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, G_SaleOrdersMapReducer.class.getName());
        job.setJarByClass(G_SaleOrdersMapReducer.class);

        Scan scan = new Scan();
        // Fetch rows in batches of 500 per RPC; a small caching value is slow for MR scans.
        scan.setCaching(500);
        // A full-table MR scan should not fill the region server block cache.
        scan.setCacheBlocks(false);

        // Input table and mapper.
        TableMapReduceUtil.initTableMapperJob(
                ORDERS_TABLE_NAME,             // input HBase table name
                scan,                          // scan controlling what is read
                ReadOrderMapper.class,         // mapper
                ImmutableBytesWritable.class,  // mapper output key: new row key
                Put.class,                     // mapper output value: row content
                job);

        // For large data volumes, writing HFiles and bulk-loading them is preferred over
        // sending individual Puts to the table.
        job.setOutputFormatClass(HFileOutputFormat2.class);

        // Directory the HFiles are written to; the timestamp suffix keeps runs apart.
        Path outputPath = new Path(args[0] + System.currentTimeMillis());
        FileOutputFormat.setOutputPath(job, outputPath);

        // The table handle holds client resources, so it is closed once the load is done.
        try (HTable table = new HTable(conf, HISTORY_ORDERS_TABLE_NAME)) {
            // Sets the sort reducer, the total-order partitioner and one reducer per region
            // of the target table. No TableOutputFormat / zero-reducer setup is done before
            // this call because configureIncrementalLoad would override it anyway.
            HFileOutputFormat2.configureIncrementalLoad(job, table, table.getRegionLocator());

            boolean isSuccess = job.waitForCompletion(true);
            if (isSuccess) {
                // Move the generated HFiles into the regions of the target table.
                LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
                load.doBulkLoad(outputPath, table);
            }
            return isSuccess ? 0 : 1;
        }
    }

    /**
     * Entry point: runs the tool with the HBase configuration found on the classpath and
     * exits with the job status, or with 1 if running the job threw an exception.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     */
    public static void main(String[] args) {
        Configuration conf = HBaseConfiguration.create();
        int status;
        try {
            status = ToolRunner.run(conf, new G_SaleOrdersMapReducer(), args);
        } catch (Exception e) {
            e.printStackTrace();
            // A failed run must not look like a success to the calling shell/scheduler.
            status = 1;
        }
        System.exit(status);
    }
}

 

你可能感兴趣的:(HBase)