HBase Source Code Analysis - HFileOutputFormat

I have just started learning HBase. I'm not sure whether this is the right approach, but I'll begin by reading the source code...

If the output of a Hadoop MapReduce job needs to be imported into HBase, it is best to write it out in HFile format first and then bulk load it into HBase, because HFile is HBase's internal storage format.

The corresponding source code:

/**
 * Copyright 2009 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Writes HFiles. Passed KeyValues must arrive in order.
 * Currently, can only write files to a single column family at a
 * time.  Multiple column families requires coordinating keys cross family.
 * Writes current time as the sequence id for the file. Sets the major compacted
 * attribute on created hfiles.
 * @see KeyValueSortReducer
 */
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
  // Overridden from FileOutputFormat; the values written here are KeyValues
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
  throws IOException, InterruptedException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs. are from hbase-*.xml; if they are not set there, the defaults below are used
    final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456);
    final int blocksize =
      conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536);
    // Invented config.  Add to hbase-*.xml if other than default compression.
    final String compression = conf.get("hfile.compression",
      Compression.Algorithm.NONE.getName());

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of families to writers and how much has been output on the writer.
      private final Map<byte [], WriterLength> writers = // a map keyed by column family
        new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY; // starts out empty
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis()); // current time

      public void write(ImmutableBytesWritable row, KeyValue kv) // write a single KeyValue
      throws IOException {
        long length = kv.getLength();
        byte [] family = kv.getFamily(); // get the column family
        WriterLength wl = this.writers.get(family);
        if (wl == null || ((length + wl.written) >= maxsize) &&
            Bytes.compareTo(this.previousRow, 0, this.previousRow.length,
              kv.getBuffer(), kv.getRowOffset(), kv.getRowLength()) != 0) {
          // Get a new writer.
          Path basedir = new Path(outputdir, Bytes.toString(family));
          if (wl == null) {
            wl = new WriterLength();
            this.writers.put(family, wl);
            if (this.writers.size() > 1) throw new IOException("One family only");
            // If wl == null, first file in family.  Ensure family dir exists.
            if (!fs.exists(basedir)) fs.mkdirs(basedir); // create the family directory
          }
          wl.writer = getNewWriter(wl.writer, basedir);
          LOG.info("Writer=" + wl.writer.getPath() +
            ((wl.written == 0)? "": ", wrote=" + wl.written));
          wl.written = 0;
        }
        kv.updateLatestStamp(this.now); // if the KeyValue still carries LATEST_TIMESTAMP, stamp it with the current time
        wl.writer.append(kv);
        wl.written += length;
        // Copy the row so we know when a row transition.
        this.previousRow = kv.getRow();
      }

      /* Create a new HFile.Writer. Close current if there is one.
       * @param writer
       * @param familydir
       * @return A new HFile.Writer.
       * @throws IOException
       */
      private HFile.Writer getNewWriter(final HFile.Writer writer,
          final Path familydir)
      throws IOException {
        close(writer);
        return new HFile.Writer(fs,  StoreFile.getUniqueFile(fs, familydir),
          blocksize, compression, KeyValue.KEY_COMPARATOR);
      }

      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, // bulk load timestamp
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, 
              Bytes.toBytes(true));
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
      throws IOException, InterruptedException {
        for (Map.Entry<byte [], WriterLength> e: this.writers.entrySet()) {
          close(e.getValue().writer);
        }
      }
    };
  }

  /*
   * Data structure to hold a Writer and amount of data written on it.
   */
  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }

  /**
   * Return the start keys of all of the regions in this table,
   * as a list of ImmutableBytesWritable.
   */
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
  throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
      new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }

  /**
   * Write out a SequenceFile that can be read by TotalOrderPartitioner
   * that contains the split points in startKeys.
   * @param partitionsPath output path for SequenceFile
   * @param startKeys the region start keys
   */
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<ImmutableBytesWritable> sorted =
      new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);
    
    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, 
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
    
    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
  
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
   *     PutSortReducer)</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue or Put before
   * running this function.
   */
  public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    // (note to self) what exactly is an "incremental load", and aren't these settings usually done in the driver's main()?
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
      cacheUri = new URI(partitionsPath.toString() + "#" +
          TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    LOG.info("Incremental table output configured.");
  }
}
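
One detail worth calling out from the class javadoc above: the KeyValues handed to this output format must arrive in sorted order, which is exactly why configureIncrementalLoad wires in KeyValueSortReducer (or PutSortReducer). As a rough illustration of what that sorting stage does, here is a minimal sketch of a reducer in the same spirit, written against the 0.90-era API (the class name SortedKeyValueReducer is mine, not HBase's):

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class SortedKeyValueReducer
    extends Reducer<ImmutableBytesWritable, KeyValue, ImmutableBytesWritable, KeyValue> {
  @Override
  protected void reduce(ImmutableBytesWritable row, Iterable<KeyValue> kvs, Context context)
      throws IOException, InterruptedException {
    // Buffer this row's KeyValues in a TreeSet so they come back out in KeyValue order.
    TreeSet<KeyValue> sorted = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
    for (KeyValue kv : kvs) {
      sorted.add(kv.clone()); // copy: Hadoop reuses the KeyValue instance across iterations
    }
    for (KeyValue kv : sorted) {
      context.write(row, kv); // emitted in sorted order, as HFileOutputFormat requires
    }
  }
}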
A complete example follows:

1. Create the HBase table t1

hbase(main):157:0* create 't1','f1' 
0 row(s) in 1.3280 seconds 
 
hbase(main):158:0> scan 't1' 
ROW                   COLUMN+CELL                                                
0 row(s) in 1.2770 seconds 
2. Write the MapReduce job
HBaseHFileMapper.java

package com.test.hfile; 
import java.io.IOException; 
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 
import org.apache.hadoop.hbase.util.Bytes; 
import org.apache.hadoop.io.LongWritable; 
import org.apache.hadoop.io.Text; 
import org.apache.hadoop.mapreduce.Mapper; 
 
public class HBaseHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> { 
    private ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable(); 
    @Override 
    protected void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException { 
        immutableBytesWritable.set(Bytes.toBytes(key.get())); // the map output key is simply the input byte offset
        context.write(immutableBytesWritable, value); 
    } 
} 
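
A quick observation on this mapper: it keys its output on the file byte offset, while the reducer below builds KeyValues whose row comes from the first field of each line, so the key that TotalOrderPartitioner partitions on is not the row key that actually ends up in the HFiles. A hypothetical variant that keys on the row field instead might look like the following (HBaseHFileRowKeyMapper is my own name, not part of the original post):

package com.test.hfile;

import java.io.IOException;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class HBaseHFileRowKeyMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
    private final ImmutableBytesWritable outKey = new ImmutableBytesWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Lines look like row:family:qualifier:value; use the row part as the map output key.
        String[] fields = value.toString().split(":");
        if (fields.length < 4) {
            return; // skip malformed lines
        }
        outKey.set(Bytes.toBytes(fields[0]));
        context.write(outKey, value);
    }
}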
HBaseHFileReducer.java

package com.test.hfile; 
import java.io.IOException; 
import org.apache.hadoop.hbase.KeyValue; 
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 
import org.apache.hadoop.hbase.util.Bytes; 
import org.apache.hadoop.io.Text; 
import org.apache.hadoop.mapreduce.Reducer; 
 
public class HBaseHFileReducer extends Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> { 
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, 
            Context context) 
            throws IOException, InterruptedException { 
        String value=""; 
        while(values.iterator().hasNext()) 
        { 
            value = values.iterator().next().toString(); 
            if(value != null && !"".equals(value)) //不为空
            { 
                KeyValue kv = createKeyValue(value.toString());//keyValue就是value值 
                if(kv!=null) 
                    context.write(key, kv); 
            } 
        } 
    } 
    // str has the form row:family:qualifier:value -- just a simple simulation
    private KeyValue createKeyValue(String str) 
    { 
        String[] strs = str.split(":"); 
        if (strs.length < 4) 
            return null; 
        String row=strs[0]; 
        String family=strs[1]; 
        String qualifier=strs[2]; 
        String value=strs[3]; 
        return new KeyValue(Bytes.toBytes(row),Bytes.toBytes(family),Bytes.toBytes(qualifier),System.currentTimeMillis(), Bytes.toBytes(value)); 
    } 
} 
HbaseHFileDriver.java

package com.test.hfile; 
import java.io.IOException; 
import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.hbase.HBaseConfiguration; 
import org.apache.hadoop.hbase.client.HTable; 
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat; 
import org.apache.hadoop.io.Text; 
import org.apache.hadoop.mapreduce.Job; 
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
import org.apache.hadoop.util.GenericOptionsParser; 
 
public class HbaseHFileDriver { 
    public static void main(String[] args) throws IOException, 
            InterruptedException, ClassNotFoundException { 
         
        Configuration conf = new Configuration(); 
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 
 
        Job job = new Job(conf, "testhbasehfile"); 
        job.setJarByClass(HbaseHFileDriver.class); // main driver class
 
        job.setMapperClass(com.test.hfile.HBaseHFileMapper.class); 
        job.setReducerClass(com.test.hfile.HBaseHFileReducer.class); 
 
        job.setMapOutputKeyClass(ImmutableBytesWritable.class); 
        job.setMapOutputValueClass(Text.class); 

        // Being lazy here: the paths are hard-coded. In a real application they should come from the command line (otherArgs).
        FileInputFormat.addInputPath(job, new Path("/home/yinjie/input")); 
        FileOutputFormat.setOutputPath(job, new Path("/home/yinjie/output")); 
 
        Configuration HBASE_CONFIG = new Configuration(); 
        HBASE_CONFIG.set("hbase.zookeeper.quorum", "localhost"); 
        HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181"); 
        HBaseConfiguration cfg = new HBaseConfiguration(HBASE_CONFIG); 
        String tableName = "t1"; 
        HTable htable = new HTable(cfg, tableName); 
        HFileOutputFormat.configureIncrementalLoad(job, htable); 
 
        System.exit(job.waitForCompletion(true) ? 0 : 1); 
    } 
} 
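
A side note on the driver: wrapping a plain Configuration in new HBaseConfiguration(...) is the old, deprecated pattern. On HBase 0.90+ the configuration is usually built with HBaseConfiguration.create(); a minimal sketch of how the configuration part of main() could look under that assumption (only these lines change, the rest of the driver stays the same):

        // Hypothetical replacement for the configuration section of main(), not from the original post.
        Configuration hbaseConf = HBaseConfiguration.create();        // picks up hbase-site.xml from the classpath
        hbaseConf.set("hbase.zookeeper.quorum", "localhost");         // explicit overrides still work
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181");
        HTable htable = new HTable(hbaseConf, "t1");
        HFileOutputFormat.configureIncrementalLoad(job, htable);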
The directory /home/yinjie/input contains a file hbasedata.txt with the following content:
[root@localhost input]# cat hbasedata.txt  
r1:f1:c1:value1 
r2:f1:c2:value2 
r3:f1:c3:value3 
Package the job into a jar; my exported jar is /home/yinjie/job/hbasetest.jar.
Submit the job to Hadoop:

[root@localhost job]# hadoop jar /home/yinjie/job/hbasetest.jar com.test.hfile.HbaseHFileDriver -libjars /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar
After the job finishes, check the output directory:

[root@localhost input]# hadoop fs -ls /home/yinjie/output 
Found 2 items 
drwxr-xr-x   - root supergroup          0 2011-08-28 21:02 /home/yinjie/output/_logs 
drwxr-xr-x   - root supergroup          0 2011-08-28 21:03 /home/yinjie/output/f1 
OK, a directory named after the column family f1 has been generated.

Next, use Bulk Load to import the data into HBase.

[root@localhost job]# hadoop jar /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar completebulkload /home/yinjie/output t1
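
The same bulk load can also be triggered from Java instead of the command line. A minimal sketch, assuming the 0.90-era org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles API (the class name BulkLoadRunner is mine):

package com.test.hfile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadRunner {
    public static void main(String[] args) throws Exception {
        // Point the client at the same cluster the driver used.
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "t1");
        // Move the HFiles generated under /home/yinjie/output into the regions of t1.
        new LoadIncrementalHFiles(conf).doBulkLoad(new Path("/home/yinjie/output"), table);
    }
}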

After the import completes, scan table t1 to verify:

hbase(main):166:0> scan 't1' 
ROW                              COLUMN+CELL                                                                                  
 r1                              column=f1:c1, timestamp=1314591150788, value=value1                                          
 r2                              column=f1:c2, timestamp=1314591150814, value=value2                                          
 r3                              column=f1:c3, timestamp=1314591150815, value=value3                                          
3 row(s) in 0.0210 seconds 

The data has been imported!

The example in the second half of this article comes from the "炽天使" blog; please keep this attribution: http://3199782.blog.51cto.com/3189782/652244

