Problem Description
For the past few days I have been working on a project based on MapReduce and OpenTSDB. The requirement: OpenTSDB holds a large amount of time-series data, and all data within a given day must be queried and processed with MapReduce. A full table scan would be far too slow, so I first considered narrowing the scan with setStartRow and setStopRow. However, OpenTSDB's rowkey layout is metric (6 bytes) + timestamp (4 bytes) + tags (3 × 6 bytes). Because the rowkey begins with the metric rather than the timestamp, rows belonging to the same day are not stored contiguously in HBase. The start/stop-row approach therefore only works when the query covers a single metric; since my query spans multiple metrics, HBase's MultiRowRangeFilter is the better fit. For background on OpenTSDB, see: http://www.jianshu.com/p/0bafd0168647
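To make the non-contiguity concrete: HBase sorts rowkeys as unsigned byte arrays, so every row of one metric (across all dates) sorts before any row of the next metric. The snippet below is a minimal sketch with made-up 6-byte UIDs; it only demonstrates the sort order and uses no OpenTSDB code.

import org.apache.hadoop.hbase.util.Bytes;

public class RowKeyOrderDemo {
    // metric UID (6 bytes) + base timestamp (4 bytes, big-endian); tag bytes omitted for brevity
    static byte[] rowKey(byte[] uid, int baseTime) {
        byte[] key = new byte[uid.length + 4];
        System.arraycopy(uid, 0, key, 0, uid.length);
        Bytes.putInt(key, uid.length, baseTime);
        return key;
    }

    public static void main(String[] args) {
        byte[] metricA = {0, 0, 0, 0, 0, 1}; // hypothetical UID
        byte[] metricB = {0, 0, 0, 0, 0, 2}; // hypothetical UID
        int sep01 = 1504195200;              // 2017-09-01 00:00:00 UTC+8
        int oct01 = 1506787200;              // 2017-10-01 00:00:00 UTC+8
        // metricA's October rows still sort before metricB's September rows,
        // so one start/stop range per day cannot cover both metrics.
        System.out.println(
            Bytes.compareTo(rowKey(metricA, oct01), rowKey(metricB, sep01)) < 0); // prints true
    }
}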
Implementation Approach
1. Read the UIDs corresponding to all metrics from the tsdb-uid table.
2. For each UID, concatenate the UID and the timestamps into the start and stop rowkeys of the target range. (The query places no constraint on the tags, so the tag bytes are simply set to 0.) For example, for the UID {-41, -20, -82, -34, -75, -107} and the date 2017-09-01, the start and stop rowkeys are {-41, -20, -82, -34, -75, -107, 89, -88, 50, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} to {-41, -20, -82, -34, -75, -107, 89, -87, -124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}. Store the start/stop rowkeys of every metric for the target date range in a List and create a MultiRowRangeFilter from it. (A worked example of how the four timestamp bytes are derived follows this list.)
3. Read the data from the tsdb table with a TableMapper, which makes it easy to process with MapReduce afterwards.
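As referenced in step 2, the four timestamp bytes come from the epoch second of the day's start, written big-endian: 2017-09-01 00:00:00 (UTC+8) is 1504195200 = 0x59A83280, i.e. the signed bytes {89, -88, 50, -128}. A minimal sketch of the conversion in plain Java:

import java.util.Arrays;

public class BaseTimeDemo {
    public static void main(String[] args) {
        long epochSeconds = 1504195200L; // 2017-09-01 00:00:00 UTC+8
        // OpenTSDB aligns row base times to the hour; midnight is already aligned
        int baseTime = (int) (epochSeconds - (epochSeconds % 3600));
        byte[] b = new byte[4];
        b[0] = (byte) (baseTime >>> 24);
        b[1] = (byte) (baseTime >>> 16);
        b[2] = (byte) (baseTime >>> 8);
        b[3] = (byte) baseTime;
        System.out.println(Arrays.toString(b)); // [89, -88, 50, -128]
    }
}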
Implementation Code
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter;
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter.RowRange;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// HBaseUID, TSDBParser, UIDConfig, TSDBMapper, and TimeReducer are the author's
// own helper classes and are not shown here.
public class TSDBMRRange {
private static List<RowRange> ranges = new ArrayList<>();
// 3 tags × (3-byte tag key UID + 3-byte tag value UID) = 18 zero bytes
private static byte[] dummyTags = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
private static HBaseUID uidclient;
private static TSDBParser parser;
// Fetch the UID of every metric from the tsdb-uid table and build a rowkey range from the start/stop times
public static void buildRowRangeList(Configuration config, Date startTime, Date endTime) throws Exception {
Connection connection = ConnectionFactory.createConnection(config);
Table table = connection.getTable(TableName.valueOf("tsdb-uid"));
Scan s = new Scan();
// In tsdb-uid, cells in family "name" with qualifier "metrics" are keyed by the metric UID
s.addColumn("name".getBytes(), "metrics".getBytes());
s.setCaching(500);
ResultScanner scanner = table.getScanner(s);
try {
for (Result rr = scanner.next(); rr != null; rr = scanner.next()) {
List<Cell> cells = rr.listCells();
for (Cell cell : cells) {
final byte[] uid = CellUtil.cloneRow(cell);
RowRange range = createRow(uid, startTime.getTime(), endTime.getTime());
ranges.add(range);
}
}
} catch(Exception ex) {
ex.printStackTrace();
} finally {
scanner.close();
if (table != null) table.close();
connection.close();
}
}
// Concatenate a UID with the start/stop times into a rowkey range
public static RowRange createRow(byte[] uid, long startTime, long endTime) {
assert(parser != null);
assert(uidclient != null);
int row_size = (uid.length + TSDBParser.TIMESTAMP_BYTES
+ dummyTags.length);
final long base_start_time = TSDBParser.baseTime(startTime);
final long base_end_time = TSDBParser.baseTime(endTime);
final byte[] startRow = new byte[row_size];
final byte[] stopRow = new byte[row_size];
int pos = 0;
System.arraycopy(uid, 0, startRow, pos, uid.length);
pos += uid.length;
TSDBParser.setInt(startRow, (int) base_start_time, pos);
pos += TSDBParser.TIMESTAMP_BYTES;
System.arraycopy(dummyTags, 0, startRow, pos, dummyTags.length);
pos = 0;
System.arraycopy(uid, 0, stopRow, pos, uid.length);
pos += uid.length;
TSDBParser.setInt(stopRow, (int) base_end_time, pos);
pos += TSDBParser.TIMESTAMP_BYTES;
System.arraycopy(dummyTags, 0, stopRow, pos, dummyTags.length);
return new RowRange(startRow, true, stopRow, false); // start inclusive, stop exclusive
}
// Build the MapReduce job; the mapper and reducer classes are omitted here (see the sketch after the code)
public static void main(String[] args) throws Exception {
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.property.clientPort", "2181");
config.set("zookeeper.znode.parent", "/hbase");
config.set("hbase.zookeeper.quorum", zkquorum);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date startTime = simpleDateFormat.parse("2017-09-01 00:00:00");
Date endTime = simpleDateFormat.parse("2017-09-02 00:00:00");
String tsdbTableName = "tsdb";
Connection connection = ConnectionFactory.createConnection(config);
// Author's helper classes; (6, 3, 3) presumably matches the rowkey layout:
// 6-byte metric UID, 3-byte tag key UID, 3-byte tag value UID
uidclient = new HBaseUID(connection, new UIDConfig("tsdb-uid", 6, 3, 3));
parser = new TSDBParser(uidclient);
buildRowRangeList(config, startTime, endTime);
// build map reduce job
config.setLong("job.starttime", startTime.getTime());
config.setLong("job.endtime", endTime.getTime());
Job job = Job.getInstance(config, "test");
job.setJarByClass(TSDBMRRange.class);
Scan scan = new Scan();
scan.setCaching(500);
scan.setCacheBlocks(false);
MultiRowRangeFilter filter = new MultiRowRangeFilter(ranges);
scan.setFilter(filter);
TableMapReduceUtil.initTableMapperJob(
tsdbTableName, // input HBase table name
scan, // Scan instance to control CF and attribute selection
TSDBMapper.class, // mapper
Text.class, // mapper output key
LongWritable.class, // mapper output value
job);
job.setReducerClass(TimeReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path("test"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
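The TSDBMapper and TimeReducer referenced above are omitted from this post. For completeness, here is a minimal sketch of what they could look like; only the generic signatures are dictated by the job setup above, while the per-metric counting logic is a hypothetical example.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical mapper: emits (metric UID, 1) for every row that passed the filter
public class TSDBMapper extends TableMapper<Text, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
            throws IOException, InterruptedException {
        // The first 6 bytes of the rowkey are the metric UID (per the layout above)
        byte[] uid = Arrays.copyOfRange(rowKey.get(), 0, 6);
        context.write(new Text(Bytes.toStringBinary(uid)), ONE);
    }
}

// Hypothetical reducer: sums the per-metric counts
class TimeReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) sum += v.get();
        context.write(key, new LongWritable(sum));
    }
}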