We know HBase cannot query rows by a column's value the way a relational database can, so we can build a secondary index table to get the same effect.
HBase secondary indexes are normally maintained with a coprocessor, but to make testing easier I first use the Java API to insert into the secondary index table at the same time as the main table.
The original table layout I designed:
rowkey: to be determined
Column family 1: jiben; columns: dbopt, probeid, type, size
Column family 2: disk; columns: rate, total, temperature
Column family 3: cpu1; columns: rate, num, temperature
Column family 4: cpu2; columns: rate, num, temperature
Column family 5: memory; columns: totoal, rate
The secondary index table layout:
rowkey: &lt;value from the original table&gt;-&lt;original column family&gt;-&lt;original column&gt;-&lt;UUID&gt;
e.g. 20095555-disk-total-fe7f4fe8d3ef4e60b1056e9c46a25e88
The index table's column family and column names can be anything.
First, create the two tables described above: the original data table emp and the secondary index table hui.
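For reference, the two tables can be created in the HBase shell roughly like this (a minimal sketch; pre-splitting and column-family tuning are omitted):
hbase(main):001:0> create 'emp', 'jiben', 'disk', 'cpu1', 'cpu2', 'memory'
hbase(main):002:0> create 'hui', 'cf'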
While inserting data into the original table, we simultaneously insert into the secondary index table, using each column value as the leading part of the index rowkey:
import java.io.IOException;
import java.util.Random;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class PutData {
    public static void main(String[] args) throws MasterNotRunningException,
            ZooKeeperConnectionException, IOException {
        String tableName = "emp";
        String columnFamily1 = "jiben";
        String columnFamily2 = "disk";
        String columnFamily3 = "cpu1";
        String columnFamily4 = "cpu2";
        String columnFamily5 = "memory";
        String tableName2 = "hui";
        Random random = new Random();
        for (int i = 10028; i <= 500000; i++) {
            int a = random.nextInt(100);
            int b = random.nextInt(100);
            int c = random.nextInt(1000);
            // Generate a fresh UUID for every row and strip its hyphens. It must be
            // generated inside the loop: with one shared UUID, index rows for columns
            // with a constant value (e.g. dbopt=insert) would all collide on one rowkey.
            String temp = UUID.randomUUID().toString().replace("-", "");
            put(tableName, String.valueOf(i), columnFamily1, "dbopt", "insert");
            put(tableName2, "insert" + "-" + columnFamily1 + "-" + "dbopt" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily1, "probeid", String.valueOf(i));
            put(tableName2, String.valueOf(i) + "-" + columnFamily1 + "-" + "probeid" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily1, "type", "type");
            // the index rowkey starts with the cell value ("type"), per the value-family-column-UUID design
            put(tableName2, "type" + "-" + columnFamily1 + "-" + "type" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily1, "size", String.valueOf(c));
            put(tableName2, String.valueOf(c) + "-" + columnFamily1 + "-" + "size" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily2, "rate", String.valueOf(b));
            put(tableName2, String.valueOf(b) + "-" + columnFamily2 + "-" + "rate" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily2, "total", "2009" + String.valueOf(i));
            put(tableName2, "2009" + String.valueOf(i) + "-" + columnFamily2 + "-" + "total" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily2, "temperature", String.valueOf(a));
            put(tableName2, String.valueOf(a) + "-" + columnFamily2 + "-" + "temperature" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily3, "rate", "0.0");
            put(tableName2, "0.0" + "-" + columnFamily3 + "-" + "rate" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily3, "num", String.valueOf(b));
            put(tableName2, String.valueOf(b) + "-" + columnFamily3 + "-" + "num" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily3, "temperature", String.valueOf(a));
            put(tableName2, String.valueOf(a) + "-" + columnFamily3 + "-" + "temperature" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily4, "rate", "1.0");
            put(tableName2, "1.0" + "-" + columnFamily4 + "-" + "rate" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily4, "num", String.valueOf(a));
            put(tableName2, String.valueOf(a) + "-" + columnFamily4 + "-" + "num" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily4, "temperature", String.valueOf(b));
            put(tableName2, String.valueOf(b) + "-" + columnFamily4 + "-" + "temperature" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily5, "totoal", String.valueOf(i));
            put(tableName2, String.valueOf(i) + "-" + columnFamily5 + "-" + "totoal" + "-" + temp, "cf", "rowkey", String.valueOf(i));
            put(tableName, String.valueOf(i), columnFamily5, "rate", String.valueOf(a));
            put(tableName2, String.valueOf(a) + "-" + columnFamily5 + "-" + "rate" + "-" + temp, "cf", "rowkey", String.valueOf(i));
        }
    }

    public static Configuration getConfiguration() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.205.153"); // ZooKeeper quorum address
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        return conf;
    }

    public static void put(String tableName, String row, String columnFamily,
            String column, String data) throws IOException {
        // Opening a new HTable for every single cell is what makes this so slow.
        HTable table = new HTable(getConfiguration(), tableName);
        Put put = new Put(Bytes.toBytes(row));
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes(column), Bytes.toBytes(data));
        table.put(put);
        table.close();
    }
}
Note: inserting this way is very slow. After running for nearly a day I had only inserted 52,327 rows into the original table and 214,392 rows into the secondary index table.
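The slowness comes from put() opening a fresh HTable and issuing one RPC per cell. A rough sketch of the obvious fix (my own helper, not part of the original code; BatchedPutHelper and BATCH_SIZE are made-up names) is to reuse one HTable per table and send Puts in batches:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

// Sketch (not from the original code): buffer Puts and flush them in batches
// through a single long-lived HTable instead of one HTable + RPC per cell.
public class BatchedPutHelper {
    private static final int BATCH_SIZE = 1000;
    private final HTable table;
    private final List<Put> buffer = new ArrayList<Put>();

    public BatchedPutHelper(HTable table) {
        this.table = table;
    }

    public void put(String row, String family, String qualifier, String value) throws IOException {
        Put put = new Put(Bytes.toBytes(row));
        put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value));
        buffer.add(put);
        if (buffer.size() >= BATCH_SIZE) {
            flush();
        }
    }

    public void flush() throws IOException {
        if (!buffer.isEmpty()) {
            table.put(buffer); // one RPC round-trip for the whole batch
            buffer.clear();
        }
    }

    public void close() throws IOException {
        flush();
        table.close();
    }
}
main() would then create one helper for emp and one for hui, call put(...) inside the loop exactly as before, and call close() on both helpers at the end.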
When using the coprocessor instead, modify PutData above so that it only inserts into the original table emp; the coprocessor maintains the index table. The coprocessor code:
import java.io.IOException;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;

public class TestCoprocessor4 extends BaseRegionObserver {

    // Every indexed (family, qualifier) pair of the emp table, in the same
    // order as the original if-else chain.
    private static final String[][] INDEXED_COLUMNS = {
        {"jiben", "dbopt"}, {"jiben", "probeid"}, {"jiben", "type"}, {"jiben", "size"},
        {"disk", "rate"}, {"disk", "total"}, {"disk", "temperature"},
        {"cpu1", "rate"}, {"cpu1", "num"}, {"cpu1", "temperature"},
        {"cpu2", "rate"}, {"cpu2", "num"}, {"cpu2", "temperature"},
        {"memory", "totoal"}, {"memory", "rate"}
    };

    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> e,
            Put put, WALEdit edit, Durability durability) throws IOException {
        // HBaseConfiguration.create() picks up the region server's hbase-site.xml;
        // a plain new Configuration() would not know the ZooKeeper quorum.
        Configuration config = HBaseConfiguration.create();
        @SuppressWarnings("deprecation")
        HTable table = new HTable(config, "hui");
        try {
            String temp = UUID.randomUUID().toString().replace("-", "");
            byte[] row = put.getRow();
            // Index the first matching column of this Put. The modified PutData
            // writes one column per Put, so at most one pair matches, which
            // preserves the original else-if semantics.
            for (String[] fq : INDEXED_COLUMNS) {
                byte[] family = Bytes.toBytes(fq[0]);
                byte[] qualifier = Bytes.toBytes(fq[1]);
                if (put.has(family, qualifier)) {
                    Cell cell = put.get(family, qualifier).get(0);
                    String value = new String(cell.getValueArray(),
                            cell.getValueOffset(), cell.getValueLength());
                    // Index rowkey: value-family-qualifier-UUID
                    Put putIndex = new Put(Bytes.toBytes(
                            value + "-" + fq[0] + "-" + fq[1] + "-" + temp));
                    // Store the original table's rowkey as the index cell's value.
                    putIndex.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("hehe"), row);
                    table.put(putIndex);
                    break;
                }
            }
        } finally {
            table.close();
        }
    }
}
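Before the observer fires, it has to be packaged into a jar and attached to the emp table. In the HBase shell that looks roughly like this (the HDFS jar path is a placeholder for wherever you upload the jar, and 1001 is an arbitrary priority):
hbase(main):001:0> disable 'emp'
hbase(main):002:0> alter 'emp', METHOD => 'table_att', 'coprocessor' => 'hdfs:///user/hbase/TestCoprocessor4.jar|TestCoprocessor4|1001|'
hbase(main):003:0> enable 'emp'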
A test that queries the original table directly with HBase's get:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class GetData {
    public static void main(String[] args) throws IOException {
        String tableName = "emp";
        long b = System.currentTimeMillis();
        // get(tableName, "5555"); // fetch the whole row directly by the original table's rowkey
        getResultByColumn(tableName, "5555", "disk", "total"); // go straight to one column
        long e = System.currentTimeMillis();
        System.out.println("spend:" + (e - b) + "ms");
    }

    public static Configuration getConfiguration() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.205.153"); // ZooKeeper quorum address
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        return conf;
    }

    public static void get(String tableName, String rowkey) throws IOException {
        HTable table = new HTable(getConfiguration(), tableName);
        Get get = new Get(Bytes.toBytes(rowkey));
        Result result = table.get(get);
        for (KeyValue kv : result.list()) {
            System.out.println("family:" + Bytes.toString(kv.getFamily()));
            System.out.println("qualifier:" + Bytes.toString(kv.getQualifier()));
            System.out.println("value:" + Bytes.toString(kv.getValue()));
            System.out.println("Timestamp:" + kv.getTimestamp());
            System.out.println("-------------------------------------------");
        }
        table.close();
    }

    public static void getResultByColumn(String tableName, String rowKey,
            String familyName, String columnName) throws IOException {
        HTable table = new HTable(getConfiguration(), Bytes.toBytes(tableName));
        Get get = new Get(Bytes.toBytes(rowKey));
        get.addColumn(Bytes.toBytes(familyName), Bytes.toBytes(columnName)); // restrict to the given family and qualifier
        Result result = table.get(get);
        for (KeyValue kv : result.list()) {
            System.out.println("family:" + Bytes.toString(kv.getFamily()));
            System.out.println("qualifier:" + Bytes.toString(kv.getQualifier()));
            System.out.println("value:" + Bytes.toString(kv.getValue()));
            System.out.println("Timestamp:" + kv.getTimestamp());
            System.out.println("-------------------------------------------");
        }
        table.close();
    }
}
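As an aside, the HTable constructor used above is deprecated in newer client versions. A sketch of the same single-column get through the Connection API that ScanData below already uses (the class name GetDataNewApi is mine, not from the original code):
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Sketch: fetch emp row "5555", column disk:total, via the non-deprecated API.
public class GetDataNewApi {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.205.153");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("emp"))) {
            Get get = new Get(Bytes.toBytes("5555"));
            get.addColumn(Bytes.toBytes("disk"), Bytes.toBytes("total"));
            Result result = table.get(get);
            for (Cell cell : result.rawCells()) {
                System.out.println("family:" + Bytes.toString(CellUtil.cloneFamily(cell)));
                System.out.println("qualifier:" + Bytes.toString(CellUtil.cloneQualifier(cell)));
                System.out.println("value:" + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}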
Time needed to run GetData above in MyEclipse:
Locally on Linux:
spend:1746ms
spend:1605ms
spend:1644ms
The scan command combined with a filter:
hbase(main):067:0> scan 'emp', {FILTER => "RowFilter(=,'binary:5555')"}
ROW COLUMN+CELL
5555 column=cpu1:num, timestamp=1511203589032, value=80
5555 column=cpu1:rate, timestamp=1511203589016, value=0.0
5555 column=cpu1:temperature, timestamp=1511203589056, value=13
5555 column=cpu2:num, timestamp=1511203589093, value=13
5555 column=cpu2:rate, timestamp=1511203589075, value=1.0
5555 column=cpu2:temperature, timestamp=1511203589113, value=80
5555 column=disk:rate, timestamp=1511203588960, value=80
5555 column=disk:temperature, timestamp=1511203588996, value=13
5555 column=disk:total, timestamp=1511203588978, value=20095555
5555 column=jiben:dbopt, timestamp=1511203588888, value=insert
5555 column=jiben:probeid, timestamp=1511203588908, value=5555
5555 column=jiben:size, timestamp=1511203588943, value=374
5555 column=jiben:type, timestamp=1511203588925, value=type
5555 column=memory:rate, timestamp=1511203589149, value=13
5555 column=memory:totoal, timestamp=1511203589131, value=5555
1 row(s) in 0.7600 seconds
The same lookup via the Java API:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.util.Bytes;

public class ScanData {
    public static Admin admin = null;
    public static Connection conn = null;

    public ScanData() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.205.153"); // ZooKeeper quorum address
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try {
            conn = ConnectionFactory.createConnection(conf);
            admin = conn.getAdmin();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        ScanData hbase = new ScanData();
        // long b = System.currentTimeMillis();
        // hbase.RowFilter("emp", "5555"); // scan the original table directly
        // long e = System.currentTimeMillis();
        // System.out.println("spend:" + (e - b) + "ms");
        long c = System.currentTimeMillis();
        hbase.PrefixFilter("hui", "20095555-disk-total"); // index lookup by rowkey prefix
        // hbase.RowFilter1("hui", "20095555-disk-total-fe7f4fe8d3ef4e60b1056e9c46a25e88"); // index lookup by full rowkey
        long d = System.currentTimeMillis();
        System.out.println("spend:" + (d - c) + "ms");
    }

    // Scan a table for the row whose key equals reg and print its cells.
    public void RowFilter(String tableName, String reg) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Scan scan = new Scan();
        Filter filter = new RowFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(reg)));
        scan.setFilter(filter);
        ResultScanner scanner = table.getScanner(scan);
        for (Result r : scanner) {
            for (Cell cell : r.rawCells()) {
                System.out.println(
                        "Rowkey-->" + Bytes.toString(r.getRow()) + " " +
                        "Family:Qualifier-->" + Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
                                + Bytes.toString(CellUtil.cloneQualifier(cell)) + " " +
                        "Value-->" + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }

    // Look up the index table by its full rowkey, then fetch the matching row from emp.
    public void RowFilter1(String tableName, String reg) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Scan scan = new Scan();
        Filter filter = new RowFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(reg)));
        scan.setFilter(filter);
        ResultScanner scanner = table.getScanner(scan);
        for (Result r : scanner) {
            for (Cell cell : r.rawCells()) {
                System.out.println(
                        "Rowkey-->" + Bytes.toString(r.getRow()) + " " +
                        "Family:Qualifier-->" + Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
                                + Bytes.toString(CellUtil.cloneQualifier(cell)) + " " +
                        "Value-->" + Bytes.toString(CellUtil.cloneValue(cell)));
                // The index cell's value is the original table's rowkey; reuse this
                // instance's connection instead of opening a new one per result.
                RowFilter("emp", Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }

    // Look up the index table by rowkey prefix, then fetch the matching rows from emp.
    public void PrefixFilter(String tableName, String reg) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Scan scan = new Scan();
        Filter filter = new PrefixFilter(Bytes.toBytes(reg));
        scan.setFilter(filter);
        ResultScanner scanner = table.getScanner(scan);
        for (Result r : scanner) {
            for (Cell cell : r.rawCells()) {
                System.out.println(
                        "Rowkey-->" + Bytes.toString(r.getRow()) + " " +
                        "Family:Qualifier-->" + Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
                                + Bytes.toString(CellUtil.cloneQualifier(cell)) + " " +
                        "Value-->" + Bytes.toString(CellUtil.cloneValue(cell)));
                RowFilter("emp", Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}
In MyEclipse:
Locally on Linux:
spend:1014ms
spend:1036ms
Looking up the secondary index table by its full rowkey (20095555-disk-total-fe7f4fe8d3ef4e60b1056e9c46a25e88) and then querying the original table:
In MyEclipse:
spend:6136ms
spend:6064ms
Locally on Linux:
spend:1549ms
spend:1333ms
spend:1260ms
Looking up the secondary index table by rowkey prefix (20095555-disk-total) and then querying the original table:
In MyEclipse:
spend:5574ms
spend:5573ms
Locally on Linux:
spend:1067ms
spend:1197ms
spend:1083ms
This approach merely gets the functionality working; the efficiency does not look great, and with a large dataset the secondary index table itself wastes a lot of resources.