HBase为筛选数据提供了一组过滤器,通过这些过滤器可以在HBase数据的多个维度(行,列,数据版本)上对数据进行筛选,也就是说,过滤器最终能够筛选的数据可以细化到具体的一个存储单元格上(由行键,列名,时间戳定位)。通常来说,通过行键,值来筛选数据的应用场景较多。
1. RowFilter:筛选出匹配的所有的行,对于这个过滤器的应用场景,是非常直观的:使用BinaryComparator可以筛选出具有某个行键的行,或者通过改变比较运算符(下面的例子中是CompareFilter.CompareOp.EQUAL)来筛选出符合某一条件的多条数据,以下就是筛选出行键为row1的一行数据:
Filter rf = new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("row1"))); // OK: selects every row whose key matches the comparison
Filter pf = new PrefixFilter(Bytes.toBytes("row")); // OK: selects the rows whose key starts with the given prefix
Filter kof = new KeyOnlyFilter(); // OK: returns all rows, but every value is empty
Filter rrf = new RandomRowFilter((float) 0.8); // OK: randomly selects a subset of the rows
Filter isf = new InclusiveStopFilter(Bytes.toBytes("row1")); // OK: includes the scan's upper bound (the stop row) in the result
Filter fkof = new FirstKeyOnlyFilter(); // OK: selects only the first cell of each row
Filter cpf = new ColumnPrefixFilter(Bytes.toBytes("qual1")); // OK: selects the columns whose qualifier matches the prefix
Filter vf = new ValueFilter(CompareFilter.CompareOp.EQUAL, new SubstringComparator("ROW2_QUAL1")); // OK: selects the individual cells whose value satisfies the condition
Filter ccf = new ColumnCountGetFilter(2); // OK: when a row turns out to have more columns than the configured maximum, the whole scan stops
// Tests column colfam1:qual2 with NOT_EQUAL against the substring "BOGUS";
// filterIfMissing is switched off and only the latest version is examined.
SingleColumnValueFilter scvf = new SingleColumnValueFilter( Bytes.toBytes("colfam1"), Bytes.toBytes("qual2"), CompareFilter.CompareOp.NOT_EQUAL, new SubstringComparator("BOGUS")); scvf.setFilterIfMissing(false); scvf.setLatestVersionOnly(true); // OK
12. SkipFilter:这是一种附加过滤器,其与ValueFilter结合使用,如果发现一行中的某一列不符合条件,那么整行就会被过滤掉:
Filter skf = new SkipFilter(vf); // OK: as soon as one column of a row has to be filtered out, the entire row is dropped
Filter wmf = new WhileMatchFilter(rf); // OK: similar to takewhile in Python's itertools
List<Filter> filters = new ArrayList<Filter>(); filters.add(rf); filters.add(vf); FilterList fl = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters); // OK: combines several filters; AND and OR are the two available relations
package com.reyun.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Loads the sample data used by the filter examples: two rows ("row1" and
 * "row2") in the table "testtable", each with the columns colfam1:qual1 and
 * colfam1:qual2.
 */
public class HBaseDataFeeding {

    private static final byte[] ROW1 = Bytes.toBytes("row1");
    private static final byte[] ROW2 = Bytes.toBytes("row2");
    private static final byte[] COLFAM1 = Bytes.toBytes("colfam1");
    private static final byte[] COLFAM2 = Bytes.toBytes("colfam2");
    private static final byte[] QUAL1 = Bytes.toBytes("qual1");
    private static final byte[] QUAL2 = Bytes.toBytes("qual2");

    /**
     * Writes the two sample rows into "testtable" and closes the table.
     *
     * @param args ignored
     * @throws IOException if the table cannot be opened, written to or closed
     */
    public static void main(String[] args) throws IOException {
        Configuration configuration = HBaseConfiguration.create();
        HTable sampleTable = new HTable(configuration, "testtable");
        // Auto-flush is switched off; presumably the puts stay in the client
        // buffer until the table is closed -- confirm against the HTable docs.
        sampleTable.setAutoFlushTo(false);

        Put firstRow = sampleRow(ROW1, "ROW1_QUAL1_VAL", "ROW1_QUAL2_VAL");
        Put secondRow = sampleRow(ROW2, "ROW2_QUAL1_VAL", "ROW2_QUAL2_VAL");

        try {
            sampleTable.put(firstRow);
            sampleTable.put(secondRow);
        } finally {
            // Release the table even when one of the puts fails.
            sampleTable.close();
        }
    }

    /** Builds a Put for {@code rowKey} that sets colfam1:qual1 and colfam1:qual2. */
    private static Put sampleRow(byte[] rowKey, String qual1Value, String qual2Value) {
        Put put = new Put(rowKey);
        put.add(COLFAM1, QUAL1, Bytes.toBytes(qual1Value));
        put.add(COLFAM1, QUAL2, Bytes.toBytes(qual2Value));
        return put;
    }
}
package com.reyun.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ColumnCountGetFilter;
import org.apache.hadoop.hbase.filter.ColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.filter.InclusiveStopFilter;
import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SkipFilter;
import org.apache.hadoop.hbase.filter.ValueFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import org.apache.hadoop.hbase.filter.WhileMatchFilter;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Scans the rows from "row1" (inclusive start) up to the stop row "row3" of
 * the table "testtable" with one filter installed and prints every returned
 * cell.
 *
 * <p>All filters discussed in the article are constructed here so that any of
 * them can be passed to {@code setFilter} in place of {@code scvf}; only
 * {@code scvf} is actually installed in this version.
 */
public class HBaseScannerTest {

    /**
     * Runs the filtered scan and prints the result to standard output.
     *
     * @param args ignored
     * @throws IOException if opening, scanning or closing the table fails
     * @throws IllegalAccessException declared by the original signature; nothing
     *         in this method throws it explicitly
     */
    public static void main(String[] args) throws IOException, IllegalAccessException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "testtable");
        try {
            // Kept from the data-feeding example; this program issues no writes.
            table.setAutoFlushTo(false);

            Scan scan1 = new Scan();

            // Tests column colfam1:qual2 with NOT_EQUAL against the substring
            // "BOGUS"; filterIfMissing is off and only the latest version is examined.
            SingleColumnValueFilter scvf = new SingleColumnValueFilter(
                    Bytes.toBytes("colfam1"),
                    Bytes.toBytes("qual2"),
                    CompareFilter.CompareOp.NOT_EQUAL,
                    new SubstringComparator("BOGUS"));
            scvf.setFilterIfMissing(false);
            scvf.setLatestVersionOnly(true); // OK

            // OK: when a row turns out to have more columns than the configured
            // maximum, the whole scan stops.
            Filter ccf = new ColumnCountGetFilter(2);

            // OK: selects the individual cells whose value satisfies the condition.
            Filter vf = new ValueFilter(CompareFilter.CompareOp.EQUAL,
                    new SubstringComparator("ROW2_QUAL1"));

            // OK: selects the columns whose qualifier matches the prefix.
            Filter cpf = new ColumnPrefixFilter(Bytes.toBytes("qual2"));

            // OK: selects only the first cell of each row.
            Filter fkof = new FirstKeyOnlyFilter();

            // OK: includes the scan's upper bound (the stop row) in the result.
            Filter isf = new InclusiveStopFilter(Bytes.toBytes("row1"));

            // OK: randomly selects a subset of the rows.
            Filter rrf = new RandomRowFilter((float) 0.8);

            // OK: returns all rows, but every value is empty.
            Filter kof = new KeyOnlyFilter();

            // OK: selects the rows whose key starts with the given prefix.
            Filter pf = new PrefixFilter(Bytes.toBytes("row"));

            // OK: row-key comparison; with NOT_EQUAL this keeps the rows whose key is not "row1".
            Filter rf = new RowFilter(CompareFilter.CompareOp.NOT_EQUAL,
                    new BinaryComparator(Bytes.toBytes("row1")));

            // OK: similar to takewhile in Python's itertools.
            Filter wmf = new WhileMatchFilter(rf);

            // OK: as soon as one column of a row has to be filtered out, the entire row is dropped.
            Filter skf = new SkipFilter(vf);

            // OK: combines several filters; AND and OR are the two available relations.
            List<Filter> filters = new ArrayList<Filter>();
            filters.add(rf);
            filters.add(vf);
            FilterList fl = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters);

            // Swap scvf for any of the filters above to try it out.
            scan1.setStartRow(Bytes.toBytes("row1"))
                    .setStopRow(Bytes.toBytes("row3"))
                    .setFilter(scvf);

            ResultScanner scanner1 = table.getScanner(scan1);
            try {
                for (Result res : scanner1) {
                    for (Cell cell : res.rawCells()) {
                        System.out.println("KV: " + cell + ", Value: "
                                + Bytes.toString(CellUtil.cloneValue(cell)));
                    }
                    System.out.println("------------------------------------------------------------");
                }
            } finally {
                // Close the scanner even when iteration or printing fails.
                scanner1.close();
            }
        } finally {
            // Close the table on every path, including failures above.
            table.close();
        }
    }
}