




/*
  • 介绍:布隆过滤器可以被认为是一个很长的二进制数组,可以用来代表一个大规模集合,判断某个元素是否在该集合中,但是有一定错误率;
  • 错误率:布隆过滤器的判断是有一定几率失误的,但只可能出现“某个元素不在集合中,却被判断为在集合中”这种类型的错误(即假阳性)。
  • 用途:常用于URL过滤、网页黑名单、邮件黑名单等;
  • 基本原理和运行步骤:
    ① 布隆过滤器可以看作一个长度为 m 的比特数组,初始所有位置都置为0,有 n 个相互独立的Hash函数;
    ② 针对每个集合中的元素 v ,用这 n 个相互独立的Hash函数求得 n 个 Hash值:H1、H2、…、Hn;
    ③ 对于某个求得的Hash值Hi,求:index = Hi % m(取值范围为 0 ~ m-1),然后把比特数组中的 index 位置置为1,这样对于每个元素 v ,就能在比特数组中的最多 n 个位置置为1;
    ④ 当我们把比特数组初始化成功后,对于任何一个待查元素q,也用那 n 个Hash函数求得它的所有位置,判断这些位置是否为1,如果全为1则判断为在集合中(可能是误判),否则判断为不在集合中;
 */



import java.io.Serializable;
import java.util.*;

// @author kidd
/**
 * A fixed-size Bloom filter over strings, backed by a {@link BitSet}.
 *
 * <p>A Bloom filter answers "has this value been added?" with possible false
 * positives but no false negatives: a value that was added is always reported
 * as present, while a value that was never added may occasionally be reported
 * as present too.
 *
 * <p>The filter is sized from an expected number of elements {@code n} and a
 * target false-positive rate {@code p} using the standard formulas
 * {@code m = -n * ln(p) / (ln 2)^2} bits and {@code k = (m / n) * ln 2} hash
 * functions. Each hash function is MurmurHash3 (x86, 32-bit) with a distinct
 * seed taken from a fixed seed table.
 *
 * <p>This class performs no synchronization of its own.
 */
public class BloomFilter implements Serializable {

    private static final long serialVersionUID = -881375780720891535L;

    /** Seed table; the first {@code hashCount} entries are used, one per hash function. */
    private static final int[] SEEDS = {543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
            344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
            465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
            481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
            63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518};

    /** Total number of bits in the filter. */
    private final int bitCount;

    /** The bit array. */
    private final BitSet bitSet;

    /** Number of hash values generated for each string. */
    private final int hashCount;

    /** Seeds currently in use, one per hash function. */
    private final int[] seeds;

    /** False-positive rate computed from the actual bit count and hash count. */
    private final double realErrorRate;

    /**
     * Creates a filter for 100,000,000 expected elements with a target
     * false-positive rate of 1e-8.
     *
     * <p>The ideal bit count for these parameters exceeds what an int-indexed
     * {@link BitSet} can address, so the bit count is capped at
     * {@link Integer#MAX_VALUE}; {@link #getRealErrorRate()} reports the rate
     * that is actually achievable.
     */
    public BloomFilter() {
        this(100000000, 0.00000001);
    }

    /**
     * Creates a filter sized for the given capacity and target error rate.
     *
     * @param capacity  expected number of distinct elements; must be positive
     * @param errorRate target false-positive rate; must be strictly between 0 and 1
     * @throws IllegalArgumentException if either argument is out of range
     */
    public BloomFilter(int capacity, double errorRate) {
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        // Written as a positive condition so that NaN is rejected as well.
        if (!(errorRate > 0.0 && errorRate < 1.0)) {
            throw new IllegalArgumentException("errorRate must be in (0, 1): " + errorRate);
        }

        final double ln2 = Math.log(2);

        // m = -n * ln(p) / (ln 2)^2, capped to the largest size a BitSet can address.
        double idealBits = Math.ceil(-capacity * Math.log(errorRate) / (ln2 * ln2));
        this.bitCount = (int) Math.min((double) Integer.MAX_VALUE, Math.max(1.0, idealBits));

        // k = (m / n) * ln 2, at least one and never more than the seeds available,
        // so that every hash function gets a distinct seed.
        long idealHashes = Math.round(ln2 * this.bitCount / capacity);
        this.hashCount = (int) Math.max(1L, Math.min((long) SEEDS.length, idealHashes));

        // p = (1 - e^(-k * n / m))^k for the parameters actually chosen.
        this.realErrorRate = Math.pow(1 - Math.exp(-(double) capacity * hashCount / bitCount), hashCount);
        this.seeds = Arrays.copyOf(SEEDS, hashCount);
        this.bitSet = new BitSet(bitCount);
    }

    /**
     * Returns the false-positive rate computed from the bit count and hash count
     * this filter actually uses, assuming it holds the capacity given at construction.
     *
     * @return the computed false-positive rate
     */
    public double getRealErrorRate() {
        return this.realErrorRate;
    }

    /**
     * Records the value if it is not already present.
     *
     * @param value the string to test and record
     * @return {@code false} if the value was not present (it has now been recorded),
     *         {@code true} if every bit for the value was already set
     */
    public boolean addIfNotExist(String value) {
        boolean exists = true;
        for (int seed : seeds) {
            int index = bitIndex(value, seed);
            if (!bitSet.get(index)) {
                bitSet.set(index);
                exists = false;
            }
        }
        return exists;
    }

    /**
     * Records the value in the filter.
     *
     * @param value the string to record
     */
    public void add(String value) {
        for (int seed : seeds) {
            bitSet.set(bitIndex(value, seed));
        }
    }

    /**
     * Tests whether the value may have been recorded.
     *
     * @param value the string to look up
     * @return {@code false} if the value was definitely never recorded,
     *         {@code true} if all of its bits are set (possibly a false positive)
     */
    public boolean isExisted(String value) {
        for (int seed : seeds) {
            if (!bitSet.get(bitIndex(value, seed))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes one non-negative hash per active seed for the given value.
     *
     * @param value the string to hash
     * @return a list of {@code hashCount} non-negative hash values, in seed order
     */
    public List<Integer> getHashList(String value) {
        List<Integer> hashList = new ArrayList<>(hashCount);
        for (int seed : seeds) {
            hashList.add(nonNegativeHash(value, seed));
        }
        return hashList;
    }

    /** Hashes the value with the seed and clears the sign bit so the result is never negative. */
    private int nonNegativeHash(String value, int seed) {
        // Masking (rather than adding Integer.MAX_VALUE) also handles Integer.MIN_VALUE.
        return murmurhash3(value, seed) & Integer.MAX_VALUE;
    }

    /** Maps the value to a bit position in {@code [0, bitCount)} for the given seed. */
    private int bitIndex(String value, int seed) {
        return nonNegativeHash(value, seed) % bitCount;
    }

    /**
     * Computes the 32-bit MurmurHash3 (x86 variant) of a character sequence,
     * encoding the characters to UTF-8 bytes on the fly instead of allocating
     * a byte array.
     *
     * @param data the characters to hash
     * @param seed the hash seed
     * @return the 32-bit hash value (may be negative)
     */
    public int murmurhash3(CharSequence data, int seed) {

        final int c1 = 0xcc9e2d51;
        final int c2 = 0x1b873593;

        int h1 = seed;

        int pos = 0;
        int end = data.length();
        int k1 = 0;     // word being assembled from encoded bytes
        int k2 = 0;     // UTF-8 bytes of the current code point, little-endian packed
        int shift = 0;  // number of bits already filled in k1
        int bits = 0;   // number of bits the current code point occupies in k2
        // length in UTF8 bytes
        int nBytes = 0;

        while (pos < end) {
            int code = data.charAt(pos++);
            if (code < 0x80) {
                k2 = code;
                bits = 8;
            } else if (code < 0x800) {
                k2 = (0xC0 | (code >> 6))
                        | ((0x80 | (code & 0x3F)) << 8);
                bits = 16;
            } else if (code < 0xD800 || code > 0xDFFF || pos >= end) {
                // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
                k2 = (0xE0 | (code >> 12))
                        | ((0x80 | ((code >> 6) & 0x3F)) << 8)
                        | ((0x80 | (code & 0x3F)) << 16);
                bits = 24;
            } else {
                // surrogate pair
                int utf32 = (int) data.charAt(pos++);
                utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
                k2 = (0xff & (0xF0 | (utf32 >> 18)))
                        | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
                        | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
                        |  (0x80 | (utf32 & 0x3F)) << 24;
                bits = 32;
            }

            k1 |= k2 << shift;

            shift += bits;
            if (shift >= 32) {
                // mix after we have a complete word

                k1 *= c1;
                k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
                k1 *= c2;

                h1 ^= k1;
                h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
                h1 = h1 * 5 + 0xe6546b64;

                shift -= 32;
                // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
                if (shift != 0) {
                    k1 = k2 >>> (bits - shift);   // carry over the bits of k2 not yet consumed
                } else {
                    k1 = 0;
                }
                nBytes += 4;
            }
        }

        // handle tail
        if (shift > 0) {
            nBytes += shift >> 3;
            k1 *= c1;
            k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
            k1 *= c2;
            h1 ^= k1;
        }

        // finalization
        h1 ^= nBytes;

        // fmix(h1);
        h1 ^= h1 >>> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >>> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >>> 16;

        return h1;
    }

    /**
     * Smoke-test entry point: builds a filter with the default parameters and
     * prints the false-positive rate it computes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BloomFilter filter = new BloomFilter();
        System.out.println(filter.getRealErrorRate());
    }
}

