HBase handles random reads and writes over huge data sets well, but getting that volume of data into HBase in the first place is a challenge, for example loading a Hive table into HBase as quickly as possible. There are three ways to approach this; the code below takes the bulk-load route: a MapReduce job turns the Hive table's underlying RCFiles into HFiles, which are then loaded into the target HBase table.
public class Driver extends Configured implements Tool{
    private static Configuration conf = new Configuration();
    private static Configuration hconf = null;
    private static HBaseAdmin hadmin = null;

    public static void connectHBase(){
        final String HBASE_CONFIG_ZOOKEEPER_CLIENT = "hbase.zookeeper.property.clientPort";
        final String HBASE_ZOOKEEPER_CLIENT_PORT = "2181";
        final String HBASE_CONFIG_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
        final String HBASE_ZOOKEEPER_SERVER = "hbase38,hbase43,hbase00";
        conf.set(HBASE_CONFIG_ZOOKEEPER_CLIENT, HBASE_ZOOKEEPER_CLIENT_PORT);
        conf.set(HBASE_CONFIG_ZOOKEEPER_QUORUM, HBASE_ZOOKEEPER_SERVER);
        hconf = HBaseConfiguration.create(conf);
        try{
            hadmin = new HBaseAdmin(hconf);
        } catch (Exception e){
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception{
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 4){
            System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
            System.exit(1);
        }

        // Build the "schema" property (column names joined by ":") from the Hive DDL file.
        String path = System.getProperty("user.dir") + otherArgs[2];
        List<String> fieldNames = HiveTableUtils.getFieldName(path);
        StringBuilder sb = new StringBuilder(fieldNames.get(0));
        int size = fieldNames.size();
        for(int i = 1; i < size; i++){
            sb.append(":").append(fieldNames.get(i));
        }
        conf.set("schema", sb.toString());

        if(ToolRunner.run(conf, new Driver(), otherArgs) == 0){
            // Import the generated HFiles into the HBase table.
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(new Path(otherArgs[1]), new HTable(conf, otherArgs[3]));
            System.exit(0);
        } else{
            System.exit(1);
        }
    }

    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] strings) throws Exception {
        Configuration config = getConf();
        Driver.connectHBase();
        Job job = new Job(config, "RCFile to HFile");
        job.setJarByClass(Driver.class);
        job.setMapperClass(RCFileToHFile.ParseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        // Reduce task count is set to 0 here; configureIncrementalLoad overrides it to match the table's regions.
        job.setNumReduceTasks(0);
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
//      job.setOutputFormatClass(HFileOutputFormat.class);

        HTable table = new HTable(config, strings[3]);
        HFileOutputFormat.configureIncrementalLoad(job, table);

        RCFileMapReduceInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
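This first driver covers the case where the HFiles can be loaded straight from the machine that runs the job: once the MapReduce job succeeds, main() immediately pushes the generated HFiles into the target table through LoadIncrementalHFiles. The second variant below is identical except for main(): it only generates the HFiles and exits, leaving the actual load to a separate distcp plus LoadIncrementalHFiles step on the HBase cluster, which is the flow used further down when Hive and HBase live on different clusters.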
public class Driver extends Configured implements Tool{
    private static Configuration conf = new Configuration();
    private static Configuration hconf = null;
    private static HBaseAdmin hadmin = null;

    public static void connectHBase(){
        final String HBASE_CONFIG_ZOOKEEPER_CLIENT = "hbase.zookeeper.property.clientPort";
        final String HBASE_ZOOKEEPER_CLIENT_PORT = "2181";
        final String HBASE_CONFIG_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
        final String HBASE_ZOOKEEPER_SERVER = "hbase38,hbase43,hbase00";
        conf.set(HBASE_CONFIG_ZOOKEEPER_CLIENT, HBASE_ZOOKEEPER_CLIENT_PORT);
        conf.set(HBASE_CONFIG_ZOOKEEPER_QUORUM, HBASE_ZOOKEEPER_SERVER);
        hconf = HBaseConfiguration.create(conf);
        try{
            hadmin = new HBaseAdmin(hconf);
        } catch (Exception e){
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception{
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 4){
            System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
            System.exit(1);
        }

        String path = System.getProperty("user.dir") + otherArgs[2];
        List<String> fieldNames = HiveTableUtils.getFieldName(path);
        StringBuilder sb = new StringBuilder(fieldNames.get(0));
        int size = fieldNames.size();
        for(int i = 1; i < size; i++){
            sb.append(":").append(fieldNames.get(i));
        }
        conf.set("schema", sb.toString());

        System.exit(ToolRunner.run(conf, new Driver(), otherArgs));
    }

    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] strings) throws Exception {
        Configuration config = getConf();
        Driver.connectHBase();
        Job job = new Job(config, "RCFile to HFile");
        job.setJarByClass(Driver.class);
        job.setMapperClass(RCFileToHFile.ParseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        //Reduce's number is 0.
        job.setNumReduceTasks(0);
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
//      job.setOutputFormatClass(HFileOutputFormat.class);

        HTable table = new HTable(config, strings[3]);
        HFileOutputFormat.configureIncrementalLoad(job, table);

        RCFileMapReduceInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
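Both drivers assume that the target table already exists: HFileOutputFormat.configureIncrementalLoad reads the table's region boundaries to set up its reducers and partitioner, and new HTable(config, strings[3]) fails otherwise. The hadmin handle opened in connectHBase() is never actually used; here is a minimal sketch of how it could pre-create the table, assuming the pre-1.0 admin API used throughout this post and the column family "cf" that the mapper hard-codes (createTableIfMissing is a hypothetical helper, not part of the original code):

    // Hypothetical helper: create the target table with column family "cf"
    // if it does not exist yet, using the hadmin handle from connectHBase().
    public static void createTableIfMissing(String tableName) throws IOException {
        if (!hadmin.tableExists(tableName)) {
            HTableDescriptor desc = new HTableDescriptor(tableName);
            desc.addFamily(new HColumnDescriptor("cf"));
            hadmin.createTable(desc);
        }
    }

For a table of any real size it would also pay to pre-split it at this point, since configureIncrementalLoad creates one reduce task per region.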
# Distributed copy HFile to mycluster-hbase.
hadoop distcp hdfs://mycluster-hive/hfile/hbase hdfs://mycluster-hbase/hbase/test

# BulkLoad HFile into hbase table on mycluster-hbase.
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /hbase/test hbase_table
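After the load it is worth reading something back to confirm the data is visible. A minimal sketch using the same pre-1.0 HTable client API as the rest of the post, assuming an HBase-aware Configuration such as the driver's hconf; the row key "1" is only an example:

    // Hypothetical check: fetch one row from the freshly loaded table.
    HTable verify = new HTable(hconf, "hbase_table");
    Result result = verify.get(new Get(Bytes.toBytes("1")));
    System.out.println(result.isEmpty() ? "row not found" : result);
    verify.close();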
public class HiveTableUtils {

    // Obtain the Hive table's column names by parsing the saved DDL file.
    public static List<String> getFieldName(String filePath){
        File file = new File(filePath);
        BufferedReader reader = null;
        List<String> fieldName = new ArrayList<String>();
        try {
            if (file.exists()) {
                reader = new BufferedReader(new FileReader(file));
                String tmp = null;
                while ((tmp = reader.readLine()) != null) {
                    // Column definition lines wrap the column name in back-quotes
                    // and carry a COMMENT clause.
                    if (tmp.contains("`") && tmp.contains("COMMENT")) {
                        int start = tmp.indexOf("`");
                        int end = tmp.lastIndexOf("`");
                        fieldName.add(tmp.substring(start + 1, end));
                    }
                }
            } else {
                System.err.println("The file doesn't exist!");
                System.exit(1);
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return fieldName;
    }
}
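getFieldName expects the file passed as <schemafile> to contain the table's DDL (for example a saved SHOW CREATE TABLE dump); note that only lines containing both a back-quoted column name and a COMMENT clause are picked up, so columns without comments would be silently skipped. A small usage sketch, with a hypothetical file name:

    // Hypothetical usage: extract column names from a saved Hive DDL file.
    // A line such as   `id` bigint COMMENT 'primary key',   contributes "id".
    List<String> columns = HiveTableUtils.getFieldName("./schema/test_table.sql");
    System.out.println(columns);   // e.g. [id, name, age]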
public class RCFileToHFile {
    public static class ParseMapper extends Mapper<LongWritable, BytesRefArrayWritable, ImmutableBytesWritable, KeyValue>{
//      private List<String> fieldName = null;
        private String[] fieldName = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // Recover the column names passed in by the driver via the "schema" property.
            Configuration conf = context.getConfiguration();
            String schema = conf.get("schema");
            fieldName = schema.split(":");

//          fieldName = new ArrayList<String>();
//          fieldName.add("id");
//          fieldName.add("name");
//          fieldName.add("age");
        }

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable values, Context context)
                throws IOException, InterruptedException {
            // Decode each RCFile column of the current row into a string field.
            Text line = new Text();
            List<String> fields = new ArrayList<String>();
            int size = values.size();
            for(int i = 0; i < size; i++){
                BytesRefWritable value = values.get(i);
                line.set(value.getData(), value.getStart(), value.getLength());
                fields.add(line.toString());
            }

            // The first column becomes the row key; the rest become cells in family "cf".
            String rowKey = fields.get(0);
            String columnFamily = "cf";
            int length = fieldName.length;
            ImmutableBytesWritable hKey = new ImmutableBytesWritable();
            hKey.set(rowKey.getBytes());
            KeyValue kv = null;
            for(int i = 1; i < length; i++){
                kv = new KeyValue(hKey.get(), columnFamily.getBytes(), fieldName[i].getBytes(), fields.get(i).getBytes());
                context.write(hKey, kv);
            }
        }
    }
}
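To make the mapping concrete: with the schema id:name:age (the hard-coded example left commented out in setup()), an input row such as (1, Tom, 25) is emitted as two KeyValues under row key 1, namely cf:name=Tom and cf:age=25. The first field only serves as the row key and is not written as a cell of its own, because the loop over fieldName starts at index 1.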