A first small exercise: saving MapReduce output into HBase, the large-scale distributed database. The example computes the access PV count for each URL. Because the input is in RCFile format, the job needs the hive-exec jar, and it also needs the hbase jar; if the cluster admins have already dropped both jars into hadoop/lib on every node, that step can be skipped. Enough talk, here is the code:
package test.hbase;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.sohu.tv.dm.common.RCFileInputFormat;

public class URLCountHbase {

    // Mapper: reads one RCFile row and emits (url, 1); the url sits in column 4 of the row.
    public static class HBaseMap extends Mapper<LongWritable, BytesRefArrayWritable, Text, IntWritable> {
        private IntWritable i = new IntWritable(1);

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable value, Context context)
                throws IOException, InterruptedException {
            byte[] url = value.get(4).getBytesCopy();
            context.write(new Text(url), i);
        }
    }

    // Reducer: sums the counts for each url and writes a Put into the HBase table
    // (row key = url, column family "type", qualifier "count").
    public static class HBaseReduce extends TableReducer<Text, IntWritable, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.add(Bytes.toBytes("type"), Bytes.toBytes("count"), Bytes.toBytes(String.valueOf(sum)));
            context.write(NullWritable.get(), put);
        }
    }

    // (Re)create the output table with a single column family "type".
    public static void createHbaseTable(String tablename) throws IOException {
        HTableDescriptor htd = new HTableDescriptor(tablename);
        HColumnDescriptor col = new HColumnDescriptor("type");
        htd.addFamily(col);
        HBaseConfiguration config = new HBaseConfiguration();
        HBaseAdmin admin = new HBaseAdmin(config);
        if (admin.tableExists(tablename)) {
            System.out.println("table exists, trying recreate table");
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        System.out.println("create new table:" + tablename);
        admin.createTable(htd);
    }

    public static void main(String args[]) throws Exception {
        String tablename = "urlcount";
        Configuration conf = new Configuration();

        // Ship the hive-exec and hbase jars to the cluster via the distributed cache
        // ("tmpjars"); unnecessary if the admins already put them in hadoop/lib on every node.
        final FileSystem fs = FileSystem.getLocal(conf);
        final HashSet<String> localfiles = new HashSet<String>();
        localfiles.add("/opt/hadoop/hive-0.8.1/lib/hive-exec-0.8.1.jar");
        localfiles.add("/opt/hadoop/hbase/hbase-0.92.1.jar");
        final HashSet<String> files = new HashSet<String>();
        for (String s : localfiles) {
            files.add(URLCountHbase.convertPath(s, fs));
        }
        URLCountHbase.cacheJars(conf, files);

        conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
        createHbaseTable(tablename);

        Job job = new Job(conf, "WordCount table with " + args[0]);
        job.setJarByClass(URLCountHbase.class);
        job.setNumReduceTasks(3);
        job.setReducerClass(HBaseReduce.class);
        job.setMapperClass(HBaseMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setInputFormatClass(RCFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Turn a local path into a fully qualified file:// URI so tmpjars can resolve it.
    private static String convertPath(String path, FileSystem fs) {
        final Path p = new Path(path);
        return p.makeQualified(fs).toString();
    }

    // Append the given jar URIs to the job's "tmpjars" property, which Hadoop
    // reads to place the jars on the distributed cache and task classpaths.
    private static void cacheJars(Configuration job, Set<String> localUrls) throws IOException {
        if (localUrls.isEmpty()) {
            return;
        }
        final String tmpjars = job.get("tmpjars");
        final StringBuilder sb = new StringBuilder();
        if (null != tmpjars) {
            sb.append(tmpjars);
            sb.append(",");
        }
        sb.append(org.apache.hadoop.util.StringUtils.arrayToString(localUrls.toArray(new String[0])));
        job.set("tmpjars", sb.toString());
    }
}
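After the job finishes, the per-URL PV counts sit in the "urlcount" table under family "type", qualifier "count". A minimal sketch of a scan to spot-check the result with the 0.92 client API might look like the following; the class name URLCountScan is made up here, and it assumes the table and column names used by the job above.

package test.hbase;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class URLCountScan {
    public static void main(String[] args) throws Exception {
        // Hypothetical verification sketch: scan "urlcount" and print url -> pv.
        HTable table = new HTable(HBaseConfiguration.create(), "urlcount");
        try {
            Scan scan = new Scan();
            scan.addColumn(Bytes.toBytes("type"), Bytes.toBytes("count"));
            ResultScanner scanner = table.getScanner(scan);
            for (Result r : scanner) {
                String url = Bytes.toString(r.getRow());
                String pv = Bytes.toString(r.getValue(Bytes.toBytes("type"), Bytes.toBytes("count")));
                System.out.println(url + "\t" + pv);
            }
            scanner.close();
        } finally {
            table.close();
        }
    }
}

The same check can of course be done interactively with a scan from the hbase shell; the Java version is shown only to stay consistent with the code above.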