布隆过滤器可以高效地判断元素 w 是否在集合 A 之中。
布隆过滤器由一个长度为 N 的 01 数组 array 组成。首先将数组 array 的每个元素设置为 0;然后对集合 A 中的每个元素 w 做 k 次哈希,第 i 次哈希后对 N 取模得到一个下标 index(i),即 index(i) = HASH_i(w) % N,并将 array[index(i)] 设置为 1。最终 array 变为一个部分元素为 1 的 01 数组。
(此处原有一张示意图,图片在转存时丢失。)
例如集合 {x, y, z} 有三个元素,k = 3,因此一共会计算出 9 个 index(不同元素的 index 可能重合),每个 index 对应的位置被置为 1,最终数组中存储的结果就由 0 和 1 组成。
package org.apache.minibase;
import java.io.IOException;
/**
 * Low-level helpers for converting between primitives, strings and byte arrays.
 *
 * <p>Multi-byte integers are encoded big-endian (most significant byte first).
 */
public class Bytes {

  /** Zero-length byte array constant. */
  public static final byte[] EMPTY_BYTES = new byte[0];

  /** Digit alphabet used when rendering bytes as hexadecimal. */
  public static final String HEX_TMP = "0123456789ABCDEF";

  /**
   * Wraps a single byte in a one-element array.
   *
   * @param b the byte to wrap
   * @return a new array containing only {@code b}
   */
  public static byte[] toBytes(byte b) {
    return new byte[] { b };
  }

  /**
   * Encodes a string as UTF-8.
   *
   * @param s the string to encode; may be {@code null}
   * @return the UTF-8 bytes of {@code s}, or a new empty array when {@code s} is {@code null}
   * @throws IOException if the UTF-8 charset is reported as unsupported
   */
  public static byte[] toBytes(String s) throws IOException {
    if (s == null) {
      return new byte[0];
    }
    return s.getBytes("UTF-8");
  }

  /**
   * Encodes an {@code int} as 4 big-endian bytes.
   *
   * @param x the value to encode
   * @return a new 4-byte array
   */
  public static byte[] toBytes(int x) {
    byte[] b = new byte[4]; // an int occupies 4 bytes
    b[3] = (byte) (x & 0xFF);
    b[2] = (byte) ((x >> 8) & 0xFF);
    b[1] = (byte) ((x >> 16) & 0xFF);
    b[0] = (byte) ((x >> 24) & 0xFF);
    return b;
  }

  /**
   * Encodes a {@code long} as 8 big-endian bytes.
   *
   * @param x the value to encode
   * @return a new 8-byte array
   */
  public static byte[] toBytes(long x) {
    byte[] b = new byte[8];
    for (int i = 7; i >= 0; i--) {
      // Byte i holds the bits starting at bit offset (7 - i) * 8.
      int shift = (7 - i) << 3;
      b[i] = (byte) ((x >> shift) & 0xFF);
    }
    return b;
  }

  /**
   * Renders a whole buffer with {@link #toHex(byte[], int, int)}.
   *
   * @param buf the buffer to render; must not be {@code null}
   * @return the rendered string
   */
  public static String toHex(byte[] buf) {
    return toHex(buf, 0, buf.length);
  }

  /**
   * Renders {@code len} bytes starting at {@code offset}. Bytes whose value is strictly between
   * 32 and 127 are appended as the corresponding character; every other byte is appended as
   * {@code \xHH} with two upper-case hexadecimal digits.
   *
   * @param buf    the buffer to render
   * @param offset index of the first byte to render
   * @param len    number of bytes to render
   * @return the rendered string
   */
  public static String toHex(byte[] buf, int offset, int len) {
    StringBuilder sb = new StringBuilder();
    for (int i = offset; i < offset + len; i++) {
      int x = buf[i];
      if (x > 32 && x < 127) {
        sb.append((char) x);
      } else {
        // Masking after the shift makes this correct for negative (sign-extended) bytes too.
        sb.append("\\x").append(HEX_TMP.charAt((x >> 4) & 0x0F)).append(HEX_TMP.charAt(x & 0x0F));
      }
    }
    return sb.toString();
  }

  /**
   * Concatenates two byte arrays.
   *
   * @param a the first array; may be {@code null}
   * @param b the second array; may be {@code null}
   * @return {@code b} itself when {@code a} is {@code null}, {@code a} itself when {@code b} is
   *         {@code null}, otherwise a new array holding {@code a} followed by {@code b}
   */
  public static byte[] toBytes(byte[] a, byte[] b) {
    if (a == null) {
      return b;
    }
    if (b == null) {
      return a;
    }
    byte[] result = new byte[a.length + b.length];
    System.arraycopy(a, 0, result, 0, a.length);
    System.arraycopy(b, 0, result, a.length, b.length);
    return result;
  }

  /**
   * Decodes the first 4 bytes of {@code a} as a big-endian {@code int}.
   *
   * @param a the source array; must contain at least 4 bytes
   * @return the decoded value
   */
  public static int toInt(byte[] a) {
    return (a[0] << 24) & 0xFF000000
        | (a[1] << 16) & 0x00FF0000
        | (a[2] << 8) & 0x0000FF00
        | (a[3] << 0) & 0x000000FF;
  }

  /**
   * Decodes the first 8 bytes of {@code a} as a big-endian {@code long}.
   *
   * @param a the source array; must contain at least 8 bytes
   * @return the decoded value
   */
  public static long toLong(byte[] a) {
    long x = 0;
    for (int i = 0; i < 8; i++) {
      int shift = (7 - i) << 3;
      // The mask strips the sign extension introduced by widening a[i] to long.
      x |= ((0xFFL << shift) & ((long) a[i] << shift));
    }
    return x;
  }

  /**
   * Copies {@code len} bytes of {@code buf} starting at {@code offset} into a new array.
   *
   * @param buf    the source buffer
   * @param offset index of the first byte to copy; must be non-negative
   * @param len    number of bytes to copy; must be non-negative
   * @return a new array of length {@code len}
   * @throws IOException if {@code buf} is {@code null}, if {@code offset} or {@code len} is
   *                     negative, or if the requested range extends past the end of {@code buf}
   */
  public static byte[] slice(byte[] buf, int offset, int len) throws IOException {
    if (buf == null) {
      throw new IOException("buffer is null");
    }
    if (offset < 0 || len < 0) {
      throw new IOException("Invalid offset: " + offset + " or len: " + len);
    }
    // Compare against the remaining space instead of computing offset + len, which can
    // overflow int and wrap to a negative number that would slip past the bounds check.
    if (len > buf.length - offset) {
      throw new IOException("Buffer overflow, offset: " + offset + ", len: " + len
          + ", buf.length:" + buf.length);
    }
    byte[] result = new byte[len];
    System.arraycopy(buf, offset, result, 0, len);
    return result;
  }

  /**
   * Computes a multiplicative hash of {@code key}: starting from 1, each step is
   * {@code h = h * 33 + key[i]} with int overflow wrapping.
   *
   * @param key the bytes to hash; may be {@code null}
   * @return the hash, or 0 when {@code key} is {@code null}
   */
  public static int hash(byte[] key) {
    if (key == null) {
      return 0;
    }
    int h = 1;
    for (int i = 0; i < key.length; i++) {
      h = (h << 5) + h + key[i];
    }
    return h;
  }

  /**
   * Compares two byte arrays lexicographically, treating each byte as unsigned. When one array
   * is a prefix of the other, the shorter one sorts first. {@code null} sorts before any
   * non-null array.
   *
   * @param a the first array; may be {@code null}
   * @param b the second array; may be {@code null}
   * @return a negative value, zero, or a positive value when {@code a} is respectively less
   *         than, equal to, or greater than {@code b}
   */
  public static int compare(byte[] a, byte[] b) {
    if (a == b) {
      return 0;
    }
    if (a == null) {
      return -1;
    }
    if (b == null) {
      return 1;
    }
    int common = Math.min(a.length, b.length);
    for (int i = 0; i < common; i++) {
      int x = a[i] & 0xFF;
      int y = b[i] & 0xFF;
      if (x != y) {
        // Both operands are in [0, 255], so the subtraction cannot overflow.
        return x - y;
      }
    }
    return a.length - b.length;
  }
}
package org.apache.minibase;
/**
 * Bloom filter used to test whether a key may belong to a set of keys.
 *
 * <p>{@link #generate(byte[][])} builds a bit array in which every key sets {@code k} bit
 * positions; {@link #contains(byte[])} probes the same positions. A key that was passed to the
 * most recent {@code generate} call is therefore always reported as present, while a key that
 * was not may still be reported as present if all of its positions were set by other keys.
 */
public class BloomFilter {

  /** Number of bit positions set (and later probed) per key. */
  private final int k;

  /** Bits budgeted per key; for x keys the bit array holds roughly x * bitsPerKey bits. */
  private final int bitsPerKey;

  /** Length of the current filter in bits; 0 until {@link #generate(byte[][])} has run. */
  private int bitLen;

  /** Backing bit array of the current filter; {@code null} until generated. */
  private byte[] result;

  /**
   * @param k          number of bit positions derived from each key
   * @param bitsPerKey number of bits reserved in the filter for each key
   */
  public BloomFilter(int k, int bitsPerKey) {
    this.k = k;
    this.bitsPerKey = bitsPerKey;
  }

  /**
   * Builds the filter for the given keys, replacing any previously generated filter.
   *
   * @param keys the keys to add; neither the array nor any element may be {@code null}
   * @return the backing byte array of the filter (the internal array, not a copy)
   * @throws NullPointerException if {@code keys} or any of its elements is {@code null}
   */
  public byte[] generate(byte[][] keys) {
    if (keys == null) {
      throw new NullPointerException("keys is null");
    }
    // Validate everything up front so a bad key cannot leave a half-built filter behind.
    for (int i = 0; i < keys.length; i++) {
      if (keys[i] == null) {
        throw new NullPointerException("keys[" + i + "] is null");
      }
    }

    int bits = keys.length * bitsPerKey;
    bits = ((bits + 7) / 8) << 3; // round up to a multiple of 8 so it maps onto whole bytes
    bits = bits < 64 ? 64 : bits; // never go below 64 bits
    byte[] bitmap = new byte[bits >> 3];

    for (int i = 0; i < keys.length; i++) {
      int h = Bytes.hash(keys[i]);
      for (int t = 0; t < k; t++) {
        int idx = bitIndex(h, bits);
        bitmap[idx / 8] |= (1 << (idx % 8));
        h = nextHash(h);
      }
    }

    bitLen = bits;
    result = bitmap;
    return result;
  }

  /**
   * Tests whether {@code key} may have been among the keys of the last
   * {@link #generate(byte[][])} call.
   *
   * @param key the key to look up
   * @return {@code false} if at least one of the key's bit positions is unset, meaning the key
   *         was not added; {@code true} if all of them are set
   * @throws IllegalStateException if {@link #generate(byte[][])} has not been called yet
   */
  public boolean contains(byte[] key) {
    if (result == null) {
      throw new IllegalStateException("generate() must be called before contains()");
    }
    int h = Bytes.hash(key);
    for (int t = 0; t < k; t++) {
      int idx = bitIndex(h, bitLen);
      if ((result[idx / 8] & (1 << (idx % 8))) == 0) {
        return false;
      }
      h = nextHash(h);
    }
    return true; // every probed bit is set
  }

  /** Maps a possibly negative hash onto a bit index in {@code [0, bits)}. */
  private static int bitIndex(int h, int bits) {
    return (h % bits + bits) % bits;
  }

  /**
   * Derives the hash used for the next probe from the current one. The right shift is signed,
   * so for negative {@code h} this is not a pure bit rotation; the formula is kept unchanged so
   * the positions set by generate() and probed by contains() stay the same as before.
   */
  private static int nextHash(int h) {
    int delta = (h >> 17) | (h << 15);
    return h + delta;
  }
}
package org.apache.minibase;
import org.junit.Assert;
import org.junit.Test;
import java.io.IOException;
/** JUnit test for {@code BloomFilter}: builds a filter from a fixed key set and probes it. */
public class TestBloomFilter {

  @Test
  public void testBloomFilter() throws IOException {
    String[] keys = { "hello world", "hi", "bloom", "filter", "key", "value", "1", "value" };
    BloomFilter bf = new BloomFilter(3, 10);

    // Encode inserted keys the same way the lookups below do (Bytes.toBytes uses UTF-8),
    // rather than relying on the platform default charset of String.getBytes().
    byte[][] keyBytes = new byte[keys.length][];
    for (int i = 0; i < keys.length; i++) {
      keyBytes[i] = Bytes.toBytes(keys[i]);
    }

    // Build the filter from the encoded keys.
    bf.generate(keyBytes);

    // Keys that were added must be reported as present.
    Assert.assertTrue(bf.contains(Bytes.toBytes("hi")));
    // Keys that were never added; expected to be reported as absent for this key set.
    Assert.assertFalse(bf.contains(Bytes.toBytes("h")));
    Assert.assertFalse(bf.contains(Bytes.toBytes("he")));
    Assert.assertTrue(bf.contains(Bytes.toBytes("hello world")));
    Assert.assertTrue(bf.contains(Bytes.toBytes("bloom")));
    Assert.assertTrue(bf.contains(Bytes.toBytes("key")));
  }
}