HBase 2.0 MapReduce

HBase and MapReduce

The HBase 2.0 API changed quite a bit, and the HBase MapReduce integration changed with it. This article uses the 2.0 API to walk through a few HBase MapReduce operations.

First, the POM file:

Simplified version: HBase 2.1.1 + Hadoop 2.7.7


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.edu.hadoop</groupId>
    <artifactId>eduHadoop</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.7.7</hadoop.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.12</artifactId>
            <version>1.1.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-endpoint</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-metrics-api</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-thrift</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-rest</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

Detailed version: HBase 2.1.1 + Hadoop 2, compatible with Hadoop 3



  
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <artifactId>hbase-build-configuration</artifactId>
    <groupId>org.apache.hbase</groupId>
    <version>2.1.1</version>
    <relativePath>../hbase-build-configuration</relativePath>
  </parent>
  <artifactId>hbase-examples</artifactId>
  <name>Apache HBase - Examples</name>
  <description>Examples of HBase usage</description>

  <build>
    <plugins>
      <plugin>
        <!-- Make it so assembly:single does nothing in here -->
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <skipAssembly>true</skipAssembly>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-surefire-plugin</artifactId>
        <configuration>
          <groups>${surefire.firstPartGroups}</groups>
        </configuration>
      </plugin>
      <!-- Make a jar and put the sources in the jar -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-source-plugin</artifactId>
      </plugin>
      <plugin>
        <groupId>org.xolstice.maven.plugins</groupId>
        <artifactId>protobuf-maven-plugin</artifactId>
        <executions>
          <execution>
            <id>compile-protoc</id>
            <phase>generate-sources</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>net.revelc.code</groupId>
        <artifactId>warbucks-maven-plugin</artifactId>
      </plugin>
    </plugins>
    <pluginManagement>
      <plugins>
        <!-- This plugin's configuration is used to store Eclipse m2e settings only.
             It has no influence on the Maven build itself. -->
        <plugin>
          <groupId>org.eclipse.m2e</groupId>
          <artifactId>lifecycle-mapping</artifactId>
          <configuration>
            <lifecycleMappingMetadata>
              <pluginExecutions>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-dependency-plugin</artifactId>
                    <versionRange>[2.8,)</versionRange>
                    <goals>
                      <goal>build-classpath</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <ignore/>
                  </action>
                </pluginExecution>
              </pluginExecutions>
            </lifecycleMappingMetadata>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.hbase.thirdparty</groupId>
      <artifactId>hbase-shaded-miscellaneous</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase.thirdparty</groupId>
      <artifactId>hbase-shaded-netty</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-common</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-protocol</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-mapreduce</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-endpoint</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-thrift</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-metrics-api</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-testing-util</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.thrift</groupId>
      <artifactId>libthrift</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-framework</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-client</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-recipes</artifactId>
    </dependency>
    <dependency>
      <groupId>com.github.stephenc.findbugs</groupId>
      <artifactId>findbugs-annotations</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-rest</artifactId>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-core</artifactId>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <profiles>
    <!-- Skip the tests in this module -->
    <profile>
      <id>skipExamplesTests</id>
      <activation>
        <property>
          <name>skipExamplesTests</name>
        </property>
      </activation>
      <properties>
        <surefire.skipFirstPart>true</surefire.skipFirstPart>
        <surefire.skipSecondPart>true</surefire.skipSecondPart>
      </properties>
    </profile>
    <!-- Profile for building against Hadoop 2.x. This is the default. -->
    <profile>
      <id>hadoop-2.0</id>
      <activation>
        <property>
          <name>!hadoop.profile</name>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
        </dependency>
      </dependencies>
      <build>
        <plugins>
          <plugin>
            <artifactId>maven-dependency-plugin</artifactId>
            <executions>
              <execution>
                <id>create-mrapp-generated-classpath</id>
                <phase>generate-test-resources</phase>
                <goals>
                  <goal>build-classpath</goal>
                </goals>
                <configuration>
                  <outputFile>${project.build.directory}/test-classes/mrapp-generated-classpath</outputFile>
                </configuration>
              </execution>
            </executions>
          </plugin>
        </plugins>
      </build>
    </profile>
    <!-- Profile for building against Hadoop 3.x. Activate with -Dhadoop.profile=3.0 -->
    <profile>
      <id>hadoop-3.0</id>
      <activation>
        <property>
          <name>hadoop.profile</name>
          <value>3.0</value>
        </property>
      </activation>
      <properties>
        <hadoop.version>3.0-SNAPSHOT</hadoop.version>
      </properties>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-minicluster</artifactId>
        </dependency>
      </dependencies>
      <build>
        <plugins>
          <plugin>
            <artifactId>maven-dependency-plugin</artifactId>
            <executions>
              <execution>
                <id>create-mrapp-generated-classpath</id>
                <phase>generate-test-resources</phase>
                <goals>
                  <goal>build-classpath</goal>
                </goals>
                <configuration>
                  <outputFile>${project.build.directory}/test-classes/mrapp-generated-classpath</outputFile>
                </configuration>
              </execution>
            </executions>
          </plugin>
        </plugins>
      </build>
    </profile>
  </profiles>
</project>

Now, on to the main topic.

Importing data from HDFS into HBase with MapReduce
package com.test;

import java.io.IOException;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;

@InterfaceAudience.Private
public class SampleUploader extends Configured implements Tool {

  private static final String NAME = "SampleUploader";

  static class Uploader
  extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    private long checkpoint = 100;
    private long count = 0;

    @Override
    public void map(LongWritable key, Text line, Context context)
    throws IOException {

      // Each map() call processes one line; the key is the line's byte offset in the file.
      // Each input line must follow the format: row,family,qualifier,value

      // Split CSV line
      String [] values = line.toString().split(",");
      if(values.length != 4) {
        return;
      }

      // Extract each value
      byte [] row = Bytes.toBytes(values[0]);
      byte [] family = Bytes.toBytes(values[1]);
      byte [] qualifier = Bytes.toBytes(values[2]);
      byte [] value = Bytes.toBytes(values[3]);

      // Create Put
      Put put = new Put(row);
      put.addColumn(family, qualifier, value);

      // Uncomment below to disable WAL. This will improve performance but means
      // you will experience data loss in the case of a RegionServer crash.
      // put.setWriteToWAL(false);

      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }

      // Set status every checkpoint lines
      if(++count % checkpoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    }
  }

  /**
   * Job configuration.
   */
  public static Job configureJob(Configuration conf, String[] args)
  throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
  }

  /**
   * Main entry point.
   *
   * @param otherArgs  The command line parameters after ToolRunner handles standard.
   * @throws Exception When running the job fails.
   */
  public int run(String[] otherArgs) throws Exception {
    if(otherArgs.length != 2) {
      System.err.println("Wrong number of arguments: " + otherArgs.length);
      System.err.println("Usage: " + NAME + "  ");
      return -1;
    }
    Configuration conf = getConf();
    // Settings for local debugging
    conf.set("fs.defaultFS", "hdfs://192.168.0.10:9000");
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");  // HBase ZooKeeper quorum address
    conf.set("hbase.zookeeper.property.clientPort", "2181"); // ZooKeeper client port
    System.setProperty("HADOOP_USER_NAME", "root");
    Job job = configureJob(conf, otherArgs);
    return (job.waitForCompletion(true) ? 0 : 1);
  }

  public static void main(String[] args) throws Exception {
    Scanner sc = new Scanner(System.in);
    // Two arguments: the HDFS input path and the name of the target HBase table
    String arg1 = sc.next();
    String arg2 = sc.next();
    int status = ToolRunner.run(HBaseConfiguration.create(), new SampleUploader(), new String[]{arg1,arg2});
    System.exit(status);
  }
}
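
One practical note: TableOutputFormat (set up by initTableReducerJob above) writes the Puts into a table that must already exist, along with every column family that appears in the second field of the CSV. The following is a minimal sketch of creating such a table with the HBase 2.x Admin API; the table name "upload_test" and column family "f1" are hypothetical placeholders, not names used by the job above.

package com.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.Bytes;

public class CreateUploadTable {

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");       // same quorum as in SampleUploader
    conf.set("hbase.zookeeper.property.clientPort", "2181");

    try (Connection connection = ConnectionFactory.createConnection(conf);
         Admin admin = connection.getAdmin()) {
      TableName tableName = TableName.valueOf("upload_test"); // placeholder table name
      if (!admin.tableExists(tableName)) {
        // Create the table with the column family that the CSV's "family" field will reference
        admin.createTable(TableDescriptorBuilder.newBuilder(tableName)
            .setColumnFamily(ColumnFamilyDescriptorBuilder.of(Bytes.toBytes("f1"))) // placeholder family
            .build());
      }
    }
  }
}

With the table in place, copy the CSV file to HDFS, start the job, and supply the HDFS path and table name on standard input as main() expects.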

Copying data from one HBase table to another, swapping the row key and the value


package com.test;

import java.io.IOException;
import java.util.Scanner;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;


@InterfaceAudience.Private
public class IndexBuilder extends Configured implements Tool {
    /** the column family containing the indexed row key */
    public static final byte[] INDEX_COLUMN = Bytes.toBytes("INDEX");
    /** the qualifier containing the indexed row key */
    public static final byte[] INDEX_QUALIFIER = Bytes.toBytes("ROW");

    /**
     * Internal Mapper to be run by Hadoop.
     */
    public static class Map extends
            Mapper<ImmutableBytesWritable, Result, ImmutableBytesWritable, Put> {
        private byte[] family;
        private TreeMap<byte[], ImmutableBytesWritable> indexes;

        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            for (java.util.Map.Entry<byte[], ImmutableBytesWritable> index : indexes.entrySet()) {
                byte[] qualifier = index.getKey();
                ImmutableBytesWritable tableName = index.getValue();
                byte[] value = result.getValue(family, qualifier);
                if (value != null) {
                    // original: row 123 attribute:phone 555-1212
                    // index: row 555-1212 INDEX:ROW 123
                    // swap: the original value becomes the index row key, the original row key becomes the value
                    Put put = new Put(value);
                    put.addColumn(INDEX_COLUMN, INDEX_QUALIFIER, rowKey.get());
                    context.write(tableName, put);
                }
            }
        }

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            Configuration configuration = context.getConfiguration();
            String tableName = configuration.get("index.tablename");
            String[] fields = configuration.getStrings("index.fields");
            String familyName = configuration.get("index.familyname");
            family = Bytes.toBytes(familyName);
            indexes = new TreeMap<>(Bytes.BYTES_COMPARATOR);

            for (String field : fields) {
                // if the table is "people" and the field to index is "email", then the
                // index table will be called "people-email"
                indexes.put(Bytes.toBytes(field),
                        new ImmutableBytesWritable(Bytes.toBytes(tableName + "-" + field)));
            }
        }
    }

    /**
     * Job configuration.
     */
    public static Job configureJob(Configuration conf, String[] args)
            throws IOException {
        String tableName = args[0];
        String columnFamily = args[1];
        System.out.println("****" + tableName);
        conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(new Scan()));
        conf.set(TableInputFormat.INPUT_TABLE, tableName);
        conf.set("index.tablename", tableName);
        conf.set("index.familyname", columnFamily);
        String[] fields = new String[args.length - 2];
        System.arraycopy(args, 2, fields, 0, fields.length);
        conf.setStrings("index.fields", fields);
        Job job = new Job(conf, tableName);
        job.setJarByClass(IndexBuilder.class);
        job.setMapperClass(Map.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TableInputFormat.class);
        job.setOutputFormatClass(MultiTableOutputFormat.class);
        return job;
    }

    public int run(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "192.168.0.10");  //hbase 服务地址
        conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
        Scanner sc = new Scanner(System.in);
        /*if(args.length < 3) {
          // when taking the arguments from the command line
          System.err.println("Only " + args.length + " arguments supplied, required: 3");
          System.err.println("Usage: IndexBuilder <TABLE_NAME> <COLUMN_FAMILY> <ATTR> [<ATTR> ...]");
          System.exit(-1);
        }*/
        // Here the table name, column family and field to index are read from stdin instead
        String arg1 = sc.next();
        String arg2 = sc.next();
        String arg3 = sc.next();

        Job job = configureJob(conf, new String[]{arg1,arg2,arg3});
        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(HBaseConfiguration.create(), new IndexBuilder(), args);
        System.exit(result);
    }
}
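
After the job completes, each indexed value becomes a row key in the corresponding index table (named <table>-<field>), with the original row key stored under INDEX:ROW; note that MultiTableOutputFormat also expects those index tables to exist before the job runs. As a rough sketch of how a lookup against such an index might look, assuming a source table "people" indexed on "email" and a sample value "alice@example.com" (both hypothetical):

package com.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class IndexLookup {

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");
    conf.set("hbase.zookeeper.property.clientPort", "2181");

    try (Connection connection = ConnectionFactory.createConnection(conf);
         // Index table written by IndexBuilder: "<table>-<field>", e.g. "people-email"
         Table indexTable = connection.getTable(TableName.valueOf("people-email"))) {
      // The indexed value is the row key of the index table
      Result result = indexTable.get(new Get(Bytes.toBytes("alice@example.com")));
      // The original row key is stored in INDEX:ROW
      byte[] originalRow = result.getValue(IndexBuilder.INDEX_COLUMN, IndexBuilder.INDEX_QUALIFIER);
      if (originalRow != null) {
        System.out.println("Original row key: " + Bytes.toString(originalRow));
      }
    }
  }
}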
