布隆过滤器

文章目录

  • 布隆过滤器
    • 简介
      • 适用场景
      • 布隆过滤器原理
        • 举例
    • code
      • Bytes
      • BloomFilter
      • 测试用例Test

布隆过滤器

简介

适用场景

可以高效地判断元素w是否在集合A之中。需要注意:布隆过滤器判定“不存在”时,元素一定不在集合中;判定“存在”时,元素只是可能在集合中(存在一定的误判率)。

布隆过滤器原理

布隆过滤器由一个长度为N的01数组array组成。首先将数组array的每个元素设置为0;然后对集合A中的每个元素w做k次哈希,每一次哈希后对N取模得到一个下标index(i),即 index(i) = HASH_i(w) % N,并将array[index(i)]设置为1。最终array变为一个部分元素为1的01数组。查询元素w时,用同样的k个哈希函数计算出k个下标:只要其中任意一个位置为0,w一定不在集合A中;如果全部为1,则w可能在集合A中(存在误判的可能)。

举例

(原文此处为布隆过滤器的示意图,图片未能成功加载)
集合{x,y,z}有三个元素,k=3,因此一共会计算出9个index(不同元素计算出的index可能重合),将每个index对应位置的值设置为1,最终得到一个由0和1组成的数组。

code

Bytes

package org.apache.minibase;

import java.io.IOException;


/**
 * Bytes类,比较底层的方法
 */
/**
 * Low-level helpers for converting primitives and strings to byte arrays and back, plus a few
 * byte-array utilities (hex dump, concatenation, slicing, hashing, comparison).
 *
 * <p>Multi-byte integers are encoded big-endian: the most significant byte is at index 0.
 */
public class Bytes {
  /** A zero-length byte array. */
  public static final byte[] EMPTY_BYTES = new byte[0];
  /** Hexadecimal digits, indexed by nibble value. */
  public static final String HEX_TMP = "0123456789ABCDEF";

  /**
   * Wraps a single byte in a one-element array.
   *
   * @param b the byte to wrap
   * @return a new array containing only {@code b}
   */
  public static byte[] toBytes(byte b) {
    return new byte[] { b };
  }

  /**
   * Encodes a string as UTF-8.
   *
   * @param s the string to encode; may be {@code null}
   * @return the UTF-8 bytes of {@code s}, or a new empty array when {@code s} is {@code null}
   * @throws IOException if the UTF-8 charset is reported as unsupported
   */
  public static byte[] toBytes(String s) throws IOException {
    if (s == null) {
      return new byte[0];
    }
    return s.getBytes("UTF-8");
  }

  /**
   * Encodes an int as 4 big-endian bytes.
   *
   * @param x the value to encode
   * @return a new 4-byte array
   */
  public static byte[] toBytes(int x) {
    byte[] b = new byte[4]; // an int occupies 4 bytes
    b[0] = (byte) ((x >> 24) & 0xFF);
    b[1] = (byte) ((x >> 16) & 0xFF);
    b[2] = (byte) ((x >> 8) & 0xFF);
    b[3] = (byte) (x & 0xFF);
    return b;
  }

  /**
   * Encodes a long as 8 big-endian bytes.
   *
   * @param x the value to encode
   * @return a new 8-byte array
   */
  public static byte[] toBytes(long x) {
    byte[] b = new byte[8];
    for (int i = 7; i >= 0; i--) {
      int shift = (7 - i) << 3; // byte i holds bits [shift, shift + 8)
      b[i] = (byte) ((x >> shift) & 0xFF);
    }
    return b;
  }

  /**
   * Renders the whole buffer with {@link #toHex(byte[], int, int)}.
   *
   * @param buf the bytes to render
   * @return the rendered string
   */
  public static String toHex(byte[] buf) {
    return toHex(buf, 0, buf.length);
  }

  /**
   * Renders a byte range as a readable string: bytes with values 33..126 are appended as the
   * corresponding character, every other byte is written as {@code \xHH} with upper-case hex
   * digits.
   *
   * @param buf the source bytes
   * @param offset index of the first byte to render
   * @param len number of bytes to render
   * @return the rendered string
   */
  public static String toHex(byte[] buf, int offset, int len) {
    StringBuilder sb = new StringBuilder();
    int end = offset + len;
    for (int i = offset; i < end; i++) {
      int x = buf[i];
      if (x > 32 && x < 127) {
        sb.append((char) x);
      } else {
        // Masking with 0x0F after the shift makes the sign extension of negative bytes harmless.
        sb.append("\\x").append(HEX_TMP.charAt((x >> 4) & 0x0F)).append(HEX_TMP.charAt(x & 0x0F));
      }
    }
    return sb.toString();
  }

  /**
   * Concatenates two byte arrays.
   *
   * @param a the first array; may be {@code null}
   * @param b the second array; may be {@code null}
   * @return {@code b} itself when {@code a} is {@code null}, {@code a} itself when {@code b} is
   *     {@code null}, otherwise a new array holding the bytes of {@code a} followed by those of
   *     {@code b}
   */
  public static byte[] toBytes(byte[] a, byte[] b) {
    if (a == null) {
      return b;
    }
    if (b == null) {
      return a;
    }
    byte[] result = new byte[a.length + b.length];
    System.arraycopy(a, 0, result, 0, a.length);
    System.arraycopy(b, 0, result, a.length, b.length);
    return result;
  }

  /**
   * Decodes the first 4 bytes of {@code a} as a big-endian int.
   *
   * @param a an array with at least 4 bytes
   * @return the decoded value
   */
  public static int toInt(byte[] a) {
    return ((a[0] & 0xFF) << 24)
        | ((a[1] & 0xFF) << 16)
        | ((a[2] & 0xFF) << 8)
        | (a[3] & 0xFF);
  }

  /**
   * Decodes the first 8 bytes of {@code a} as a big-endian long.
   *
   * @param a an array with at least 8 bytes
   * @return the decoded value
   */
  public static long toLong(byte[] a) {
    long x = 0;
    for (int i = 0; i < 8; i++) {
      int shift = (7 - i) << 3;
      // Mask to 8 bits before widening so a negative byte does not sign-extend.
      x |= (a[i] & 0xFFL) << shift;
    }
    return x;
  }

  /**
   * Copies {@code len} bytes of {@code buf}, starting at {@code offset}, into a new array.
   *
   * @param buf the source buffer
   * @param offset index of the first byte to copy
   * @param len number of bytes to copy
   * @return a new array of length {@code len}
   * @throws IOException if {@code buf} is {@code null}, if {@code offset} or {@code len} is
   *     negative, or if the requested range extends past the end of {@code buf}
   */
  public static byte[] slice(byte[] buf, int offset, int len) throws IOException {
    if (buf == null) {
      throw new IOException("buffer is null");
    }
    if (offset < 0 || len < 0) {
      throw new IOException("Invalid offset: " + offset + " or len: " + len);
    }
    // Compare against the remaining space rather than computing offset + len, which can wrap
    // around to a negative int and slip past the range check.
    if (len > buf.length - offset) {
      throw new IOException("Buffer overflow, offset: " + offset + ", len: " + len
          + ", buf.length:" + buf.length);
    }
    byte[] result = new byte[len];
    System.arraycopy(buf, offset, result, 0, len);
    return result;
  }

  /**
   * Computes a multiplicative hash of the key: starting from 1, each step does
   * {@code h = h * 33 + byte}.
   *
   * @param key the bytes to hash; may be {@code null}
   * @return the hash, or 0 when {@code key} is {@code null}
   */
  public static int hash(byte[] key) {
    if (key == null) {
      return 0;
    }
    int h = 1;
    for (byte b : key) {
      h = (h << 5) + h + b;
    }
    return h;
  }

  /**
   * Compares two byte arrays lexicographically, treating each byte as unsigned. When one array
   * is a prefix of the other, the shorter one sorts first. {@code null} sorts before any
   * non-null array.
   *
   * @param a the first array; may be {@code null}
   * @param b the second array; may be {@code null}
   * @return a negative value, zero, or a positive value when {@code a} is less than, equal to,
   *     or greater than {@code b}
   */
  public static int compare(byte[] a, byte[] b) {
    if (a == b) {
      return 0;
    }
    if (a == null) {
      return -1;
    }
    if (b == null) {
      return 1;
    }
    int common = Math.min(a.length, b.length);
    for (int i = 0; i < common; i++) {
      int x = a[i] & 0xFF;
      int y = b[i] & 0xFF;
      if (x != y) {
        return x - y; // both are in 0..255, so the subtraction cannot overflow
      }
    }
    return a.length - b.length;
  }
}

BloomFilter

package org.apache.minibase;

/**
 * 布隆过滤器,查看某个元素是否在集合中,这种场景可以使用布隆过滤器
 */
/**
 * Bloom filter: a compact bit array used to test whether a key may belong to a set.
 *
 * <p>{@link #generate(byte[][])} builds the bit array from a set of keys, and
 * {@link #contains(byte[])} queries it. A {@code false} answer means the key was not among the
 * keys passed to {@code generate}; a {@code true} answer means it may have been (false positives
 * are possible because different keys can set the same bits).
 */
public class BloomFilter {
  /** Number of bit positions probed (and set) per key. */
  private final int k;
  /** Bits budgeted per key; for x keys the array has roughly x * bitsPerKey bits. */
  private final int bitsPerKey;

  /** Length in bits of the array built by the last call to {@code generate}. */
  private int bitLen;
  /** Bit array built by the last call to {@code generate}; {@code null} until then. */
  private byte[] result;

  /**
   * @param k number of bit positions probed per key
   * @param bitsPerKey number of bits budgeted per key
   */
  public BloomFilter(int k, int bitsPerKey) {
    this.k = k;
    this.bitsPerKey = bitsPerKey;
  }

  /**
   * Builds the filter's bit array from the given keys, replacing any previously generated one.
   *
   * @param keys the keys to add; neither the array nor any element may be {@code null}
   * @return the bit array backing this filter (the internal array, not a copy)
   * @throws NullPointerException if {@code keys} or one of its elements is {@code null}
   */
  public byte[] generate(byte[][] keys) {
    // Explicit checks instead of `assert`: assertions are disabled by default at runtime.
    if (keys == null) {
      throw new NullPointerException("keys is null");
    }
    int bits = keys.length * bitsPerKey;
    bits = ((bits + 7) / 8) << 3; // round up to a multiple of 8 so it maps onto whole bytes
    bits = bits < 64 ? 64 : bits; // never use fewer than 64 bits
    byte[] bitmap = new byte[bits >> 3];
    for (int i = 0; i < keys.length; i++) {
      if (keys[i] == null) {
        throw new NullPointerException("keys[" + i + "] is null");
      }
      int h = Bytes.hash(keys[i]);
      // Set k bit positions for this key, all derived from the single base hash.
      for (int t = 0; t < k; t++) {
        int idx = bitIndex(h, bits);
        bitmap[idx / 8] |= (1 << (idx % 8));
        h = nextProbe(h);
      }
    }
    // Publish only after the whole array is built so a failure above leaves the previous
    // state untouched.
    bitLen = bits;
    result = bitmap;
    return result;
  }

  /**
   * Tests whether the key may be among the keys passed to {@link #generate(byte[][])}.
   *
   * @param key the key to look up; it is hashed with {@code Bytes.hash}
   * @return {@code false} if at least one of the key's k bit positions is unset, {@code true}
   *     if all of them are set
   * @throws IllegalStateException if {@code generate} has not been called yet
   */
  public boolean contains(byte[] key) {
    if (result == null) {
      // Without this guard bitLen is still 0 and the index computation would fail with an
      // unhelpful "/ by zero" ArithmeticException.
      throw new IllegalStateException("generate() must be called before contains()");
    }
    int h = Bytes.hash(key);
    for (int t = 0; t < k; t++) {
      int idx = bitIndex(h, bitLen);
      if ((result[idx / 8] & (1 << (idx % 8))) == 0) {
        return false;
      }
      h = nextProbe(h);
    }
    return true; // every probed bit is set
  }

  /** Maps a (possibly negative) hash onto a bit position in {@code [0, bitLen)}. */
  private static int bitIndex(int h, int bitLen) {
    return Math.floorMod(h, bitLen);
  }

  /**
   * Derives the hash for the next probe from the current one.
   *
   * <p>NOTE(review): {@code >>} is an arithmetic shift, so for negative {@code h} this is not a
   * true 17-bit rotation. It is kept as-is because changing it would change the bit layout of
   * generated filters.
   */
  private static int nextProbe(int h) {
    int delta = (h >> 17) | (h << 15);
    return h + delta;
  }
}

测试用例Test

package org.apache.minibase;

import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;

/** Unit test for {@code BloomFilter}: builds a filter from a few string keys and queries it. */
public class TestBloomFilter {

  @Test
  public void testBloomFilter() throws IOException {
    String[] keys = { "hello world", "hi", "bloom", "filter", "key", "value", "1", "value" };
    BloomFilter bf = new BloomFilter(3, 10);

    // Encode the keys with the same helper used for the lookups below, so insertion and query
    // never depend on the platform default charset.
    byte[][] keyBytes = new byte[keys.length][];
    for (int i = 0; i < keys.length; i++) {
      keyBytes[i] = Bytes.toBytes(keys[i]);
    }

    // Build the filter's bit array from the encoded keys.
    bf.generate(keyBytes);

    // Every inserted key must be reported as present.
    for (String key : keys) {
      Assert.assertTrue("missing key: " + key, bf.contains(Bytes.toBytes(key)));
    }

    // These keys were never inserted and are expected to be reported as absent for this key set.
    Assert.assertFalse(bf.contains(Bytes.toBytes("h")));
    Assert.assertFalse(bf.contains(Bytes.toBytes("he")));
  }
}

你可能感兴趣的:(布隆过滤器,bloomfilter,Java,Hbase,leetcode)