HBase MapReduce
The HBase 2.0 API changed considerably, and the HBase MapReduce integration changed with it, so this article uses the 2.0-style API to walk through a few HBase MapReduce operations.
First, the POM file.
Simplified version: HBase 2.1.1 + Hadoop 2.7.7
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.edu.hadoop</groupId>
  <artifactId>eduHadoop</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>2.7.7</hadoop.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>1.1.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-common</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-mapreduce</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-endpoint</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-metrics-api</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-thrift</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-rest</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
Detailed version: HBase 2.1.1 + Hadoop 2, also compatible with Hadoop 3
This is essentially the pom.xml of the hbase-examples module from the HBase 2.1.1 source tree (parent org.apache.hbase:hbase-build-configuration:2.1.1 via ../hbase-build-configuration; artifact hbase-examples, "Apache HBase - Examples", "Examples of HBase usage"). In outline:
- Build plugins: maven-assembly-plugin (assembly skipped), maven-surefire-plugin (runs ${surefire.firstPartGroups}), maven-source-plugin, protobuf-maven-plugin with a compile-protoc execution bound to the generate-sources phase, warbucks-maven-plugin, and an org.eclipse.m2e lifecycle-mapping entry covering the maven-dependency-plugin [2.8,) build-classpath goal.
- Dependencies (versions managed by the parent): org.apache.hbase.thirdparty hbase-shaded-miscellaneous and hbase-shaded-netty; hbase-common, hbase-protocol, hbase-client, hbase-server, hbase-mapreduce, hbase-endpoint, hbase-thrift, hbase-metrics-api, hbase-rest, hbase-testing-util (test); libthrift, commons-io, slf4j-api, zookeeper, protobuf-java, curator-framework, curator-client, curator-recipes, findbugs-annotations; junit and mockito-core (test).
- Profiles: skipExamplesTests (activated by the skipExamplesTests property, skips this module's tests); hadoop-2.0 (active when hadoop.profile is not set; adds hadoop-mapreduce-client-core and hadoop-common, plus a maven-dependency-plugin create-mrapp-generated-classpath execution at generate-test-resources that writes build-classpath to ${project.build.directory}/test-classes/mrapp-generated-classpath); hadoop-3.0 (active when hadoop.profile=3.0; sets the Hadoop version to 3.0-SNAPSHOT, adds hadoop-common and hadoop-minicluster, and uses the same create-mrapp-generated-classpath execution).
Now, on to the main part.
Importing HDFS data into HBase with MapReduce
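The input is a plain CSV file in HDFS where every line is row,family,qualifier,value (for example row1,info,name,Alice; those values are made up). The job writes its Puts through TableOutputFormat, so the target table and the column family referenced by the input must already exist before the job is submitted. Below is a minimal pre-creation sketch using the 2.x Admin API; the table name test_table and family info are placeholders (create 'test_table', 'info' in the HBase shell does the same thing).

package com.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;

public class CreateTargetTable {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");      // same quorum as in the job below
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    try (Connection conn = ConnectionFactory.createConnection(conf);
         Admin admin = conn.getAdmin()) {
      TableName table = TableName.valueOf("test_table");     // placeholder table name
      if (!admin.tableExists(table)) {
        // create the table with the column family the CSV data refers to
        admin.createTable(TableDescriptorBuilder.newBuilder(table)
            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info")) // placeholder family
            .build());
      }
    }
  }
}

With the table in place, the importer itself: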
package com.test;
import java.io.IOException;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
@InterfaceAudience.Private
public class SampleUploader extends Configured implements Tool {
private static final String NAME = "SampleUploader";
  static class Uploader
      extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
private long checkpoint = 100;
private long count = 0;
@Override
public void map(LongWritable key, Text line, Context context)
throws IOException {
// Each map() is a single line, where the key is the line number
      // Each input line must follow the format: row,family,qualifier,value
// Split CSV line
String [] values = line.toString().split(",");
if(values.length != 4) {
return;
}
// Extract each value
byte [] row = Bytes.toBytes(values[0]);
byte [] family = Bytes.toBytes(values[1]);
byte [] qualifier = Bytes.toBytes(values[2]);
byte [] value = Bytes.toBytes(values[3]);
// Create Put
Put put = new Put(row);
put.addColumn(family, qualifier, value);
      // Uncomment below to skip writing to the WAL. This improves performance but
      // means data loss if a RegionServer crashes. (The old setWriteToWAL(false)
      // is gone in the 2.x API; use setDurability instead.)
      // put.setDurability(Durability.SKIP_WAL);
try {
context.write(new ImmutableBytesWritable(row), put);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Set status every checkpoint lines
if(++count % checkpoint == 0) {
context.setStatus("Emitting Put " + count);
}
}
}
/**
* Job configuration.
*/
public static Job configureJob(Configuration conf, String[] args)
throws IOException {
Path inputPath = new Path(args[0]);
String tableName = args[1];
    Job job = Job.getInstance(conf, NAME + "_" + tableName);
job.setJarByClass(Uploader.class);
FileInputFormat.setInputPaths(job, inputPath);
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(Uploader.class);
// No reducers. Just write straight to table. Call initTableReducerJob
// because it sets up the TableOutputFormat.
TableMapReduceUtil.initTableReducerJob(tableName, null, job);
job.setNumReduceTasks(0);
return job;
}
/**
* Main entry point.
*
   * @param otherArgs The command line parameters left after ToolRunner has handled the standard ones.
* @throws Exception When running the job fails.
*/
public int run(String[] otherArgs) throws Exception {
if(otherArgs.length != 2) {
System.err.println("Wrong number of arguments: " + otherArgs.length);
System.err.println("Usage: " + NAME + " ");
return -1;
}
Configuration conf = getConf();
    // For local debugging: point at the remote HDFS and HBase/ZooKeeper
    conf.set("fs.defaultFS", "hdfs://192.168.0.10:9000");
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");      // ZooKeeper quorum used by HBase
    conf.set("hbase.zookeeper.property.clientPort", "2181"); // ZooKeeper client port
System.setProperty("HADOOP_USER_NAME", "root");
Job job = configureJob(conf, otherArgs);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
Scanner sc = new Scanner(System.in);
    // Two arguments: the HDFS path of the input file and the name of the target table
String arg1 = sc.next();
String arg2 = sc.next();
int status = ToolRunner.run(HBaseConfiguration.create(), new SampleUploader(), new String[]{arg1,arg2});
System.exit(status);
}
}
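After the job finishes, a quick client-side scan is enough to confirm the rows landed. A minimal sketch, again assuming the placeholder table test_table from above:

package com.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyImport {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    try (Connection conn = ConnectionFactory.createConnection(conf);
         Table table = conn.getTable(TableName.valueOf("test_table"));  // placeholder table name
         ResultScanner scanner = table.getScanner(new Scan().setLimit(10))) {
      // print the first few imported cells as row family:qualifier = value
      for (Result result : scanner) {
        for (Cell cell : result.rawCells()) {
          System.out.println(Bytes.toString(CellUtil.cloneRow(cell)) + " "
              + Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
              + Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
              + Bytes.toString(CellUtil.cloneValue(cell)));
        }
      }
    }
  }
}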
Copying data from one HBase table into another, swapping the row key and the value (building an index table)
package com.test;
import java.io.IOException;
import java.util.Scanner;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
@InterfaceAudience.Private
public class IndexBuilder extends Configured implements Tool {
/** the column family containing the indexed row key */
public static final byte[] INDEX_COLUMN = Bytes.toBytes("INDEX");
/** the qualifier containing the indexed row key */
public static final byte[] INDEX_QUALIFIER = Bytes.toBytes("ROW");
/**
* Internal Mapper to be run by Hadoop.
*/
  public static class Map extends
      Mapper<ImmutableBytesWritable, Result, ImmutableBytesWritable, Put> {
private byte[] family;
    private TreeMap<byte[], ImmutableBytesWritable> indexes;
@Override
protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
throws IOException, InterruptedException {
      for (java.util.Map.Entry<byte[], ImmutableBytesWritable> index : indexes.entrySet()) {
byte[] qualifier = index.getKey();
ImmutableBytesWritable tableName = index.getValue();
byte[] value = result.getValue(family, qualifier);
if (value != null) {
// original: row 123 attribute:phone 555-1212
// index: row 555-1212 INDEX:ROW 123
          // i.e. swap the row key and the value
Put put = new Put(value);
put.addColumn(INDEX_COLUMN, INDEX_QUALIFIER, rowKey.get());
context.write(tableName, put);
}
}
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration configuration = context.getConfiguration();
String tableName = configuration.get("index.tablename");
String[] fields = configuration.getStrings("index.fields");
String familyName = configuration.get("index.familyname");
family = Bytes.toBytes(familyName);
indexes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
for (String field : fields) {
// if the table is "people" and the field to index is "email", then the
// index table will be called "people-email"
indexes.put(Bytes.toBytes(field),
new ImmutableBytesWritable(Bytes.toBytes(tableName + "-" + field)));
}
}
}
/**
* Job configuration.
*/
public static Job configureJob(Configuration conf, String[] args)
throws IOException {
String tableName = args[0];
String columnFamily = args[1];
System.out.println("****" + tableName);
conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(new Scan()));
conf.set(TableInputFormat.INPUT_TABLE, tableName);
conf.set("index.tablename", tableName);
conf.set("index.familyname", columnFamily);
String[] fields = new String[args.length - 2];
System.arraycopy(args, 2, fields, 0, fields.length);
conf.setStrings("index.fields", fields);
    Job job = Job.getInstance(conf, tableName);
job.setJarByClass(IndexBuilder.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(TableInputFormat.class);
job.setOutputFormatClass(MultiTableOutputFormat.class);
return job;
}
public int run(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create(getConf());
conf.set("hbase.zookeeper.quorum", "192.168.0.10"); //hbase 服务地址
conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
Scanner sc = new Scanner(System.in);
    /* To validate command-line arguments instead of reading them from stdin:
    if (args.length < 3) {
      System.err.println("Only " + args.length + " arguments supplied, required: 3");
      System.err.println("Usage: IndexBuilder <TABLE_NAME> <COLUMN_FAMILY> <ATTR> [<ATTR> ...]");
      System.exit(-1);
    }
    */
String arg1 = sc.next();
String arg2 = sc.next();
String arg3 = sc.next();
Job job = configureJob(conf, new String[]{arg1,arg2,arg3});
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int result = ToolRunner.run(HBaseConfiguration.create(), new IndexBuilder(), args);
System.exit(result);
}
}
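Note that MultiTableOutputFormat also writes to tables that must already exist, so each index table (named <source table>-<field>, with the INDEX column family) has to be created before the job runs. Once it has run, a lookup by value is a single Get against the index table: the indexed value is the row key, and INDEX:ROW holds the original row key. A minimal sketch, assuming a hypothetical source table people indexed on an email field (so the index table is people-email; all names and the sample value are placeholders):

package com.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class IndexLookup {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "192.168.0.10");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    try (Connection conn = ConnectionFactory.createConnection(conf);
         // index table name = "<source table>-<field>", as built in Map.setup()
         Table index = conn.getTable(TableName.valueOf("people-email"))) {
      // the indexed value is the row key of the index table
      Result r = index.get(new Get(Bytes.toBytes("alice@example.com")));
      byte[] originalRow = r.getValue(IndexBuilder.INDEX_COLUMN, IndexBuilder.INDEX_QUALIFIER);
      System.out.println(originalRow == null ? "not indexed"
          : "original row key: " + Bytes.toString(originalRow));
    }
  }
}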