I'm starting to learn HBase. Not sure whether this is the right way, but let's begin by reading the source code...
If the output of a Hadoop MapReduce job needs to be imported into HBase, it is best to write it out in HFile format first and then bulk-load it into HBase, because HFile is HBase's internal storage format.
The corresponding source code (HFileOutputFormat) is:
/**
* Copyright 2009 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.mapreduce;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Writes HFiles. Passed KeyValues must arrive in order.
* Currently, can only write files to a single column family at a
* time. Multiple column families requires coordinating keys cross family.
* Writes current time as the sequence id for the file. Sets the major compacted
* attribute on created hfiles.
* @see KeyValueSortReducer
*/
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
// Overrides getRecordWriter() inherited from FileOutputFormat
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context) // the KeyValue is the HBase cell emitted upstream (e.g. by the reducer)
throws IOException, InterruptedException {
// Get the path of the temporary output file
final Path outputPath = FileOutputFormat.getOutputPath(context);
final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
Configuration conf = context.getConfiguration();
final FileSystem fs = outputdir.getFileSystem(conf);
// These configs. are from hbase-*.xml; if a parameter is not set, the default given below is used
final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456);
final int blocksize =
conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536);
// Invented config. Add to hbase-*.xml if other than default compression.
final String compression = conf.get("hfile.compression",
Compression.Algorithm.NONE.getName());
return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
// Map of families to writers and how much has been output on the writer.
private final Map<byte [], WriterLength> writers = // one writer per column family
new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY; // previous row, initially empty
private final byte [] now = Bytes.toBytes(System.currentTimeMillis()); // current time as bytes
public void write(ImmutableBytesWritable row, KeyValue kv) // write one cell
throws IOException {
long length = kv.getLength();
byte [] family = kv.getFamily(); // get the column family
WriterLength wl = this.writers.get(family);
if (wl == null || ((length + wl.written) >= maxsize) &&
Bytes.compareTo(this.previousRow, 0, this.previousRow.length,
kv.getBuffer(), kv.getRowOffset(), kv.getRowLength()) != 0) {
// Get a new writer.
Path basedir = new Path(outputdir, Bytes.toString(family));
if (wl == null) {
wl = new WriterLength();
this.writers.put(family, wl);
if (this.writers.size() > 1) throw new IOException("One family only");
// If wl == null, first file in family. Ensure family dir exists.
if (!fs.exists(basedir)) fs.mkdirs(basedir); // create the family directory if missing
}
wl.writer = getNewWriter(wl.writer, basedir);
LOG.info("Writer=" + wl.writer.getPath() +
((wl.written == 0)? "": ", wrote=" + wl.written));
wl.written = 0;
}
kv.updateLatestStamp(this.now); // replace LATEST_TIMESTAMP with the current time
wl.writer.append(kv);
wl.written += length;
// Copy the row so we know when a row transition.
this.previousRow = kv.getRow();
}
/* Create a new HFile.Writer. Close current if there is one.
* @param writer
* @param familydir
* @return A new HFile.Writer.
* @throws IOException
*/
private HFile.Writer getNewWriter(final HFile.Writer writer,
final Path familydir)
throws IOException {
close(writer);
return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
blocksize, compression, KeyValue.KEY_COMPARATOR);
}
private void close(final HFile.Writer w) throws IOException {
if (w != null) {
w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, // bulk load timestamp
Bytes.toBytes(System.currentTimeMillis()));
w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
Bytes.toBytes(context.getTaskAttemptID().toString()));
w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
Bytes.toBytes(true));
w.close();
}
}
public void close(TaskAttemptContext c)
throws IOException, InterruptedException {
for (Map.Entry<byte [], WriterLength> e: this.writers.entrySet()) {
close(e.getValue().writer);
}
}
};
}
/*
* Data structure to hold a Writer and amount of data written on it.
*/
static class WriterLength {
long written = 0;
HFile.Writer writer = null;
}
/**
* Return the start keys of all of the regions in this table,
* as a list of ImmutableBytesWritable.
*/
private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
throws IOException {
byte[][] byteKeys = table.getStartKeys();
ArrayList<ImmutableBytesWritable> ret =
new ArrayList<ImmutableBytesWritable>(byteKeys.length);
for (byte[] byteKey : byteKeys) {
ret.add(new ImmutableBytesWritable(byteKey));
}
return ret;
}
/**
* Write out a SequenceFile that can be read by TotalOrderPartitioner
* that contains the split points in startKeys.
* @param partitionsPath output path for SequenceFile
* @param startKeys the region start keys
*/
private static void writePartitions(Configuration conf, Path partitionsPath,
List<ImmutableBytesWritable> startKeys) throws IOException {
if (startKeys.isEmpty()) {
throw new IllegalArgumentException("No regions passed");
}
// We're generating a list of split points, and we don't ever
// have keys < the first region (which has an empty start key)
// so we need to remove it. Otherwise we would end up with an
// empty reducer with index 0
TreeSet<ImmutableBytesWritable> sorted =
new TreeSet<ImmutableBytesWritable>(startKeys);
ImmutableBytesWritable first = sorted.first();
if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
throw new IllegalArgumentException(
"First region of table should have empty start key. Instead has: "
+ Bytes.toStringBinary(first.get()));
}
sorted.remove(first);
// Write the actual file
FileSystem fs = partitionsPath.getFileSystem(conf);
SequenceFile.Writer writer = SequenceFile.createWriter(fs,
conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
try {
for (ImmutableBytesWritable startKey : sorted) {
writer.append(startKey, NullWritable.get());
}
} finally {
writer.close();
}
}
/**
* Configure a MapReduce Job to perform an incremental load into the given // i.e., set the job up so its output HFiles can later be bulk-loaded into the table
* table. This
*
* - Inspects the table to configure a total order partitioner
* - Uploads the partitions file to the cluster and adds it to the DistributedCache
* - Sets the number of reduce tasks to match the current number of regions
* - Sets the output key/value class to match HFileOutputFormat's requirements
* - Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
* PutSortReducer)
*
* The user should be sure to set the map output value class to either KeyValue or Put before
* running this function.
*/
public static void configureIncrementalLoad(Job job, HTable table) throws IOException { // Aren't these settings done in main()? They could be, but this helper applies them all consistently for a bulk-load job
Configuration conf = job.getConfiguration();
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setOutputFormatClass(HFileOutputFormat.class);
// Based on the configured map output class, set the correct reducer to properly
// sort the incoming values.
// TODO it would be nice to pick one or the other of these formats.
if (KeyValue.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(KeyValueSortReducer.class);
} else if (Put.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(PutSortReducer.class);
} else {
LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
}
LOG.info("Looking up current regions for table " + table);
List startKeys = getRegionStartKeys(table);
LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
"to match current region count");
job.setNumReduceTasks(startKeys.size());
Path partitionsPath = new Path(job.getWorkingDirectory(),
"partitions_" + System.currentTimeMillis());
LOG.info("Writing partition information to " + partitionsPath);
FileSystem fs = partitionsPath.getFileSystem(conf);
writePartitions(conf, partitionsPath, startKeys);
partitionsPath.makeQualified(fs);
URI cacheUri;
try {
cacheUri = new URI(partitionsPath.toString() + "#" +
TotalOrderPartitioner.DEFAULT_PATH);
} catch (URISyntaxException e) {
throw new IOException(e);
}
DistributedCache.addCacheFile(cacheUri, conf);
DistributedCache.createSymlink(conf);
LOG.info("Incremental table output configured.");
}
}
Here is an example:
1. Create the HBase table t1
hbase(main):157:0* create 't1','f1'
0 row(s) in 1.3280 seconds
hbase(main):158:0> scan 't1'
ROW COLUMN+CELL
0 row(s) in 1.2770 seconds
2. Write the MapReduce job
HBaseHFileMapper.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class HBaseHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
private ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
immutableBytesWritable.set(Bytes.toBytes(key.get())); // the map output key is just the byte offset of the input line
context.write(immutableBytesWritable, value);
}
}
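A note on the shuffle key: this mapper uses the byte offset of each input line (the LongWritable key) as the map output key, so the TotalOrderPartitioner distributes records by offset rather than by the actual HBase row key. A hypothetical variant (not from the original post) that keys each record on the row portion of the line, matching the row:family:qualifier:value format used below, could look like this:
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Hypothetical alternative mapper: use the row key parsed from the line as the shuffle key.
public class HBaseHFileRowKeyMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
    private final ImmutableBytesWritable outKey = new ImmutableBytesWritable();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        int idx = line.indexOf(':');
        if (idx <= 0) return; // skip malformed lines
        outKey.set(Bytes.toBytes(line.substring(0, idx))); // row = text before the first ':'
        context.write(outKey, value);
    }
}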
HBaseHFileReducer.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class HBaseHFileReducer extends Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
protected void reduce(ImmutableBytesWritable key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text text : values) {
String value = text.toString();
if (value != null && !"".equals(value)) { // skip empty values
KeyValue kv = createKeyValue(value); // build one KeyValue per input record
if (kv != null)
context.write(key, kv);
}
}
}
// str format: row:family:qualifier:value (a simple simulation)
private KeyValue createKeyValue(String str)
{
String[] strs = str.split(":");
if(strs.length<4)
return null;
String row=strs[0];
String family=strs[1];
String qualifier=strs[2];
String value=strs[3];
return new KeyValue(Bytes.toBytes(row),Bytes.toBytes(family),Bytes.toBytes(qualifier),System.currentTimeMillis(), Bytes.toBytes(value));
}
}
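To make the parsing concrete: a line such as r1:f1:c1:value1 (the format of the sample data further down) becomes a cell with row r1, family f1, qualifier c1 and value value1. A small standalone check of the same logic (hypothetical, not part of the original post):
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.util.Bytes;
// Hypothetical demo of the row:family:qualifier:value parsing used by createKeyValue().
public class CreateKeyValueDemo {
    public static void main(String[] args) {
        String[] strs = "r1:f1:c1:value1".split(":");
        KeyValue kv = new KeyValue(Bytes.toBytes(strs[0]), Bytes.toBytes(strs[1]),
                Bytes.toBytes(strs[2]), System.currentTimeMillis(), Bytes.toBytes(strs[3]));
        System.out.println(kv); // prints the row/family/qualifier/timestamp of the cell
    }
}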
HbaseHFileDriver.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class HbaseHFileDriver {
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "testhbasehfile");
job.setJarByClass(HbaseHFileDriver.class); // main driver class
job.setMapperClass(com.test.hfile.HBaseHFileMapper.class);
job.setReducerClass(com.test.hfile.HBaseHFileReducer.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Text.class);
// Being lazy here: the paths are hardcoded. In real use they should come from the command line (see the sketch after this class)
FileInputFormat.addInputPath(job, new Path("/home/yinjie/input"));
FileOutputFormat.setOutputPath(job, new Path("/home/yinjie/output"));
Configuration HBASE_CONFIG = new Configuration();
HBASE_CONFIG.set("hbase.zookeeper.quorum", "localhost");
HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181");
HBaseConfiguration cfg = new HBaseConfiguration(HBASE_CONFIG);
String tableName = "t1";
HTable htable = new HTable(cfg, tableName);
HFileOutputFormat.configureIncrementalLoad(job, htable);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
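As the comment in main() says, the input and output paths should come from the command line rather than being hardcoded. Since the driver already runs GenericOptionsParser, a minimal sketch (not in the original post) would replace the two hardcoded path lines with:
// Sketch: take the paths from the remaining command-line arguments.
if (otherArgs.length < 2) {
    System.err.println("Usage: HbaseHFileDriver <input path> <output path>");
    System.exit(2);
}
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));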
The /home/yinjie/input directory contains a file hbasedata.txt with the following content:
[root@localhost input]# cat hbasedata.txt
r1:f1:c1:value1
r2:f1:c2:value2
r3:f1:c3:value3
Package the job into a jar; in my case the exported jar is /home/yinjie/job/hbasetest.jar
[root@localhost job]# hadoop jar /home/yinjie/job/hbasetest.jar com.test.hfile.HbaseHFileDriver -libjars
/home/yinjie/hbase-0.90.3/hbase-0.90.3.jar
After the job finishes, check the output directory.
Next, use bulk load to import the data into HBase:
[root@localhost job]# hadoop jar /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar completebulkload /home/yinjie/output t1
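The completebulkload tool is backed by the LoadIncrementalHFiles class, so the same import can also be done from Java. A minimal sketch, assuming the HBase 0.90.x API and the output directory and table used above:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
// Sketch: programmatic equivalent of the completebulkload command above.
public class BulkLoadDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create(); // reads hbase-site.xml (zookeeper quorum etc.)
        HTable table = new HTable(conf, "t1");
        // The directory written by HFileOutputFormat, with one subdirectory per column family.
        new LoadIncrementalHFiles(conf).doBulkLoad(new Path("/home/yinjie/output"), table);
    }
}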
Once the import finishes, scan the HBase table t1 to verify the data.
The example in the second half of this post comes from the "炽天使" blog; please keep this attribution: http://3199782.blog.51cto.com/3189782/652244