Programming HBase with MapReduce

A first stab at it: save MapReduce output into HBase, the big distributed database. The example computes per-URL PV (page view) counts. Since the input is in RCFile format, the hive-exec jar has to be shipped with the job, and so does the HBase jar; if the cluster admins have already dropped both jars into hadoop/lib on every node, you can skip that step. Enough talk, straight to the code:

package test.hbase;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.sohu.tv.dm.common.RCFileInputFormat;

public class URLCountHbase {
	public static class HBaseMap extends
			Mapper<LongWritable, BytesRefArrayWritable, Text, IntWritable> {

		// Reused constant 1: each input record counts as one page view.
		private IntWritable one = new IntWritable(1);

		@Override
		protected void map(LongWritable key, BytesRefArrayWritable value,
				Context context) throws IOException, InterruptedException {
			// Column 5 (index 4) of the RCFile row holds the URL; emit (url, 1).
			byte[] url = value.get(4).getBytesCopy();
			context.write(new Text(url), one);
		}

	}

	public static class HBaseReduce extends
			TableReducer<Text, IntWritable, NullWritable> {

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			// Sum the 1s emitted by the mapper to get the URL's PV count.
			int sum = 0;
			for (IntWritable i : values) {
				sum += i.get();
			}
			// Row key = URL; store the count as a string under type:count.
			// TableOutputFormat ignores the output key, so NullWritable is fine.
			Put put = new Put(Bytes.toBytes(key.toString()));
			put.add(Bytes.toBytes("type"), Bytes.toBytes("count"),
					Bytes.toBytes(String.valueOf(sum)));
			context.write(NullWritable.get(), put);
		}

	}

	public static void createHbaseTable(String tablename) throws IOException {
		// One column family, "type", matching what the reducer writes to.
		HTableDescriptor htd = new HTableDescriptor(tablename);
		HColumnDescriptor col = new HColumnDescriptor("type");
		htd.addFamily(col);
		// HBaseConfiguration.create() picks up hbase-site.xml from the classpath.
		Configuration config = HBaseConfiguration.create();
		HBaseAdmin admin = new HBaseAdmin(config);
		// Drop and recreate the table if it already exists.
		if (admin.tableExists(tablename)) {
			System.out.println("table exists, recreating it");
			admin.disableTable(tablename);
			admin.deleteTable(tablename);
		}
		System.out.println("create new table:" + tablename);
		admin.createTable(htd);

	}

	public static void main(String args[]) throws Exception {
		String tablename = "urlcount";
		Configuration conf = new Configuration();
		// Ship the hive-exec and hbase jars with the job via tmpjars, in case
		// they are not already under hadoop/lib on the task nodes.
		final FileSystem fs = FileSystem.getLocal(conf);
		final HashSet<String> localfiles = new HashSet<String>();
		localfiles.add("/opt/hadoop/hive-0.8.1/lib/hive-exec-0.8.1.jar");
		localfiles.add("/opt/hadoop/hbase/hbase-0.92.1.jar");
		final HashSet<String> files = new HashSet<String>();
		for (String s : localfiles) {
			files.add(URLCountHbase.convertPath(s, fs));
		}
		URLCountHbase.cacheJars(conf, files);
		// Tell TableOutputFormat which table to write to, and make sure the
		// table exists before the job starts.
		conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
		createHbaseTable(tablename);
		Job job = new Job(conf, "URLCount table with " + args[0]);
		job.setJarByClass(URLCountHbase.class);
		job.setNumReduceTasks(3);
		job.setReducerClass(HBaseReduce.class);
		job.setMapperClass(HBaseMap.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		// Read RCFile input, write to HBase through TableOutputFormat.
		job.setOutputFormatClass(TableOutputFormat.class);
		job.setInputFormatClass(RCFileInputFormat.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}
	
	// Qualify a local path with the file system's scheme and authority so it
	// can be listed in tmpjars.
	private static String convertPath(String path, FileSystem fs) {
		final Path p = new Path(path);
		return p.makeQualified(fs).toString();
	}

	// Append the given jar paths to the job's "tmpjars" property; the
	// framework ships everything listed there to the task nodes.
	private static void cacheJars(Configuration job, Set<String> localUrls) throws IOException {
		if (localUrls.isEmpty()) {
			return;
		}
		final String tmpjars = job.get("tmpjars");
		final StringBuilder sb = new StringBuilder();
		if (null != tmpjars) {
			sb.append(tmpjars);
			sb.append(",");
		}
		sb.append(org.apache.hadoop.util.StringUtils.arrayToString(localUrls.toArray(new String[0])));
		job.set("tmpjars", sb.toString());
	}
}
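
Once the job finishes, the result can be checked straight from the HBase client API. Here is a minimal sketch; the URL in the Get is a made-up placeholder, so swap in any URL that actually appears in column 5 of your input (or just run scan 'urlcount' in the hbase shell):

package test.hbase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class URLCountCheck {
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		HTable table = new HTable(conf, "urlcount");
		// Row key is the URL itself; this one is only a placeholder.
		Get get = new Get(Bytes.toBytes("http://example.com/index.html"));
		Result result = table.get(get);
		// The reducer stored the PV count as a string under type:count.
		byte[] raw = result.getValue(Bytes.toBytes("type"), Bytes.toBytes("count"));
		System.out.println(raw == null ? "no such row" : Bytes.toString(raw));
		table.close();
	}
}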

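One more note: HBase's own TableMapReduceUtil can do the reducer-side wiring and, as far as I can tell, also adds the HBase jars to the job classpath for you, so it could replace the OUTPUT_TABLE/setReducerClass/setOutputFormatClass lines plus the hbase half of the tmpjars trick above (hive-exec you would still ship yourself). A sketch:

import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;

// Sets TableOutputFormat, the output table name, and the reducer class,
// and ships HBase's dependency jars with the job.
TableMapReduceUtil.initTableReducerJob(tablename, HBaseReduce.class, job);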

 
