Querying OpenTSDB with HBase's MultiRowRangeFilter and MapReduce

Problem description
For the past few days I have been working on a MapReduce job against OpenTSDB. The requirement: OpenTSDB currently holds a large amount of time-series data, and all data within a given day has to be queried and then processed with MapReduce. A full table scan would be far too slow. I first considered narrowing the scan with setStartRow and setStopRow, but the OpenTSDB rowkey layout here is metric (6 bytes) + timestamp (4 bytes) + tags (3 × 6 bytes). Because the rowkey begins with the metric UID rather than the timestamp, the rows for any one day are not contiguous in the table: setStartRow/setStopRow only works when the query covers a single metric, and this query spans multiple metrics, so HBase's MultiRowRangeFilter is the better fit. For more background on OpenTSDB see: http://www.jianshu.com/p/0bafd0168647
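For reference, the single-metric case mentioned above would look roughly like the sketch below. It is only an illustration, not part of the final solution; the 6-byte metric UID, 4-byte base timestamps and 18 zeroed tag bytes follow the rowkey layout described above.

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class SingleMetricScanSketch {
    // Restrict a scan to one metric and one day: the row range is simply
    // metricUid + base timestamp + zeroed tag bytes at both ends.
    public static Scan dayScan(byte[] metricUid, byte[] startBaseTime, byte[] stopBaseTime) {
        byte[] zeroTags = new byte[18];                                    // tag bytes can stay zero
        Scan scan = new Scan();
        scan.setStartRow(Bytes.add(metricUid, startBaseTime, zeroTags));   // inclusive start row
        scan.setStopRow(Bytes.add(metricUid, stopBaseTime, zeroTags));     // exclusive stop row
        return scan;
    }
}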
Implementation approach
1. Read the UID of every metric from the tsdb-uid table.
2. Combine each UID with the timestamps to assemble the start and stop rowkeys of the target range. (The tag portion of the key does not matter here; the tag bytes can simply be left as 0.) For example, for the UID {-41, -20, -82, -34, -75, -107} and the date 2017-09-01, the start and stop rowkeys are {-41, -20, -82, -34, -75, -107, 89, -88, 50, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} and {-41, -20, -82, -34, -75, -107, 89, -87, -124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}. Put the start/stop rowkeys of every metric for the target date range into a List and build a MultiRowRangeFilter from it.
3. Use a TableMapper to read the data from the tsdb table (so that it can then be processed with MapReduce).
Implementation code

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter;
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter.RowRange;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// HBaseUID, UIDConfig, TSDBParser, TSDBMapper and TimeReducer are project-local classes
// that are not shown in this post; their imports are omitted.
public class TSDBMRRange {
    private static List<RowRange> ranges = new ArrayList<RowRange>();
    private static byte[] dummyTags = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // 18 zeroed tag bytes (3 tags x 6 bytes, matching the layout above)
    private static HBaseUID uidclient;
    private static TSDBParser parser;
    // Read the UID of every metric from the tsdb-uid table and build a rowkey range per metric from the start/stop time
    public static void buildRowRangeList(Configuration config, Date startTime, Date endTime) throws Exception {
        Connection connection = ConnectionFactory.createConnection(config);
        Table table = connection.getTable(TableName.valueOf("tsdb-uid"));
        Scan s = new Scan();
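        // in the tsdb-uid table, rows of the "name" family are keyed by UID, so the name:metrics rows yield every metric UID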
        s.addColumn("name".getBytes(), "metrics".getBytes());
        s.setCaching(500);
        ResultScanner scanner = table.getScanner(s);
        try {
            for (Result rr = scanner.next(); rr != null; rr = scanner.next()) {
                List<Cell> cells = rr.listCells();
                for (Cell cell : cells) {
                    final byte[] uid = CellUtil.cloneRow(cell);
                    RowRange range = createRow(uid, startTime.getTime(), endTime.getTime());
                    ranges.add(range);
                }
            }
        } catch(Exception ex) {
            ex.printStackTrace();
        } finally {
            scanner.close();
            if (table != null) table.close();
            connection.close();
        }
    }
    // Assemble the rowkey range for one metric UID from the start/stop time
    public static RowRange createRow(byte[] uid, long startTime, long endTime) {
        assert(parser != null);
        assert(uidclient != null);

        int row_size = (uid.length + TSDBParser.TIMESTAMP_BYTES 
                    + dummyTags.length);

        final long base_start_time = TSDBParser.baseTime(startTime);
        final long base_end_time = TSDBParser.baseTime(endTime);
        final byte[] startRow = new byte[row_size];
        final byte[] stopRow = new byte[row_size];
        int pos = 0;
        System.arraycopy(uid, 0, startRow, pos, uid.length);
        pos += uid.length;

        TSDBParser.setInt(startRow, (int) base_start_time, pos); 
        pos += TSDBParser.TIMESTAMP_BYTES;
        System.arraycopy(dummyTags, 0, startRow, pos, dummyTags.length);

        pos = 0;
        System.arraycopy(uid, 0, stopRow, pos, uid.length);
        pos += uid.length;

        TSDBParser.setInt(stopRow, (int) base_end_time, pos); 
        pos += TSDBParser.TIMESTAMP_BYTES;
        System.arraycopy(dummyTags, 0, stopRow, pos, dummyTags.length);

        return new RowRange(startRow, true, stopRow, false); // start inclusive, stop exclusive
    }
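    // TSDBParser (with the TIMESTAMP_BYTES, baseTime and setInt used above) is a project-local
    // helper that is not shown in this post. A rough sketch of what it presumably does, based on
    // how OpenTSDB lays out its rowkeys (an assumption, not the project's actual code):
    //
    //     public static final int TIMESTAMP_BYTES = 4;       // 4-byte base timestamp in the rowkey
    //
    //     // OpenTSDB base timestamps are in seconds, rounded down to a 1-hour boundary
    //     public static long baseTime(long millis) {
    //         long seconds = millis / 1000L;
    //         return seconds - (seconds % 3600L);
    //     }
    //
    //     // write an int as 4 big-endian bytes into the rowkey at the given offset
    //     public static void setInt(byte[] b, int n, int off) {
    //         b[off]     = (byte) (n >>> 24);
    //         b[off + 1] = (byte) (n >>> 16);
    //         b[off + 2] = (byte) (n >>> 8);
    //         b[off + 3] = (byte) n;
    //     }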
    // Create the MapReduce job. The mapper and reducer implementation classes are omitted here; a minimal sketch follows this listing.
    public static void main(String[] args) throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.property.clientPort", "2181");
        config.set("zookeeper.znode.parent", "/hbase");
        config.set("hbase.zookeeper.quorum", zkquorum);

        SimpleDateFormat simpleDateFormat=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date startTime = simpleDateFormat.parse("2017-09-01 00:00:00");
        Date endTime = simpleDateFormat.parse("2017-09-02 00:00:00");

        String tsdbTableName = "tsdb";

        Connection connection = ConnectionFactory.createConnection(config);
        uidclient = new HBaseUID(connection, new UIDConfig("tsdb-uid", 6, 3, 3));
        parser = new TSDBParser(uidclient);
        buildRowRangeList(config, startTime, endTime);


        // build map reduce job
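        // pass the exact query window to the mapper/reducer through the job configuration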
        config.setLong("job.starttime", startTime.getTime());
        config.setLong("job.endtime", endTime.getTime());
        Job job = Job.getInstance(config, "test");
        job.setJarByClass(TSDBMRRange.class);

        Scan scan = new Scan();
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        // MultiRowRangeFilter skips, on the region server, every row that falls outside all of the per-metric [start, stop) ranges built above
        MultiRowRangeFilter filter = new MultiRowRangeFilter(ranges);
        scan.setFilter(filter);

        TableMapReduceUtil.initTableMapperJob(
                tsdbTableName,          // input HBase table name
                scan,                   // Scan instance to control CF and attribute selection
                TSDBMapper.class,       // mapper
                Text.class,             // mapper output key
                LongWritable.class,     // mapper output value
                job);
        job.setReducerClass(TimeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("test"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
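
The TSDBMapper and TimeReducer used above are omitted from this post. As a point of reference, a minimal sketch that matches the job's key/value types (Text / LongWritable) could look like the following; the counting logic is purely illustrative and is not the original project's code.

// TSDBMapper.java -- emits (metric uid, 1) for every row the MultiRowRangeFilter lets through
import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class TSDBMapper extends TableMapper<Text, LongWritable> {
    private static final int METRIC_UID_BYTES = 6;              // matches the 6-byte metric uid above
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
            throws IOException, InterruptedException {
        // the first 6 bytes of the rowkey are the metric uid
        String metricUid = Bytes.toStringBinary(rowKey.get(), rowKey.getOffset(), METRIC_UID_BYTES);
        context.write(new Text(metricUid), ONE);
    }
}

// TimeReducer.java -- sums the per-metric counts emitted by the mapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TimeReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}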
