1. 布隆过滤器
1.1 布隆过滤器设计思想
布隆过滤器(Bloom Filter,下文简称BF)是专门用来检测集合中是否存在特定元素的数据结构。它是由长度为m比特的位数组和k个哈希函数组成的数据结构。位数组均初始化为0,哈希函数可以将输入数据尽量的均匀散列。
- 当插入一个元素时,将元素数据分别输入到k个哈希函数,产生k个哈希值。以k个哈希值作为位数组的下标,将其值置为1.
- 当查询一个元素是否存在,将元素映射为k个哈希值,判断数组中各个哈希值对应值是否为1,若均为1,那么表示该元素很可能在集合中。
存在假阳性(将不在集合中的元素误判为在集合中),不存在假阴性(将在集合中的元素误判为不在集合中)
为什么不是一定在集合中?
因为一个比特位被置为1有可能会受到其他元素的影响,产生“误差率”的情况。
1.2 布隆过滤器优缺点
布隆过滤器优点:
- 不需要存储数据本身,只使用比特表示,因此空间占用少;
- 时间效率高,插入和查询的时间复杂度为O(k)。k为哈希值;
- 哈希函数之间可以相互独立,可以在硬件指令层次并行计算;
布隆过滤器缺点:
- 存在误差率,不适合任何要求100%准确性的场景;
- 只能插入和查询元素,不能删除元素。
所以布隆过滤器适合查询准确度要求没那么苛刻,但是对时间、空间效率要求比较高的场景。
2. 布隆过滤器的实现
2.1 Guava中的实现
引入依赖
com.google.guava
guava
27.0.1-jre
使用方式
public class TestRedisBloomFilter {
public static void main(String[] args) {
//创建布隆过滤器
BloomFilter bloomFilter = BloomFilter.create(
//Funnel接口实现类的实例,它用于将任意类型T的输入数据转化为Java基本类型的数据(byte、int、char等等)。这里是会转化为byte。
Funnels.stringFunnel(Charset.forName("utf-8")),
//期望插入元素总个数n
1000,
//误差率p
0.01);
//填充数据
bloomFilter.put("112");
bloomFilter.put("113");
bloomFilter.put("114");
//判断元素是否存在
System.out.println(bloomFilter.mightContain("114"));
System.out.println(bloomFilter.mightContain("111"));
}
}
源码分析
- 生成布隆过滤器
//默认使用的策略BloomFilterStrategies.MURMUR128_MITZ_64
@VisibleForTesting
static BloomFilter create(
Funnel super T> funnel, long expectedInsertions, double fpp, Strategy strategy) {
//对参数的校验
checkNotNull(funnel);
checkArgument(
expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", expectedInsertions);
checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp);
checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp);
checkNotNull(strategy);
if (expectedInsertions == 0) {
expectedInsertions = 1;
}
//获取到位数组的长度m(由期望插入元素个数&误差率确定)
long numBits = optimalNumOfBits(expectedInsertions, fpp);
//获取到哈希函数的个数k(由期望插入元素的个数&位数组长度m确定)
int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
try {
return new BloomFilter(new LockFreeBitArray(numBits), numHashFunctions, funnel, strategy);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
}
}
- 获取到位数组长度m的源码
@VisibleForTesting
static long optimalNumOfBits(long n, double p) {
if (p == 0) {
p = Double.MIN_VALUE;
}
return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
- 获取到哈希函数的个数
@VisibleForTesting
static int optimalNumOfHashFunctions(long n, long m) {
// (m / n) * log(2), but avoid truncation due to division!
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
- BloomFilterStrategies.MURMUR128_MITZ_64策略进行处理
MURMUR128_MITZ_64() {
@Override
public boolean put(
T object, Funnel super T> funnel, int numHashFunctions, LockFreeBitArray bits) {
long bitSize = bits.bitSize();
byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
long hash1 = lowerEight(bytes);
long hash2 = upperEight(bytes);
boolean bitsChanged = false;
long combinedHash = hash1;
for (int i = 0; i < numHashFunctions; i++) {
// Make the combined hash positive and indexable
bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
combinedHash += hash2;
}
return bitsChanged;
}
@Override
public boolean mightContain(
T object, Funnel super T> funnel, int numHashFunctions, LockFreeBitArray bits) {
//位数组的长度m
long bitSize = bits.bitSize();
//元素转换为字节数组
byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
long hash1 = lowerEight(bytes);
long hash2 = upperEight(bytes);
long combinedHash = hash1;
for (int i = 0; i < numHashFunctions; i++) {
// combinedHash & Long.MAX_VALUE) % bitSize来生成哈希值
if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
return false;
}
combinedHash += hash2;
}
return true;
}
private /* static */ long lowerEight(byte[] bytes) {
return Longs.fromBytes(
bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
}
private /* static */ long upperEight(byte[] bytes) {
return Longs.fromBytes(
bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
}
};
2.2 Redis的bitmap实现
参考Guava算法,为元素生成k个哈希值。存储到Redis的bitmap结构中。
使用Pipelined管道批量的操作Redis的命令
import com.google.common.hash.Funnels;
import com.google.common.hash.Hashing;
import com.google.common.primitives.Longs;
import com.tellme.utils.SpringUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.dao.DataAccessException;
import org.springframework.data.redis.connection.RedisConnection;
import org.springframework.data.redis.core.RedisCallback;
import org.springframework.data.redis.core.StringRedisTemplate;
import java.nio.charset.Charset;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
* 工具类:布隆过滤器
*/
@Slf4j
public class RedisBloomFilter {
/**
* 获取到Spring容器的stringRedisTemplate
*/
private StringRedisTemplate stringRedisTemplate= SpringUtil.getBean(StringRedisTemplate.class);
/**
* 保存到Redis的key的前缀
*/
private static final String BF_KEY_PREFIX = "bf:";
/**
* 预计元素的数量n
*/
private int numApproxElements;
/**
* 误差率p
*/
private double fpp;
/**
* 哈希函数的个数k
*/
private int numHashFunctions;
/**
* 位数组的长度m
*/
private int bitmapLength;
/**
* 构造布隆过滤器。注意:在同一业务场景下,三个参数务必相同
*
* @param numApproxElements 预估元素数量
* @param fpp 可接受的最大误差(假阳性率)
*/
public RedisBloomFilter(int numApproxElements, double fpp) {
//获取预估数量n
this.numApproxElements = numApproxElements;
//获取误差率p
this.fpp = fpp;
//获取到位数组长度m
bitmapLength = (int) (-numApproxElements * Math.log(fpp) / (Math.log(2) * Math.log(2)));
//获取哈希函数个数k
numHashFunctions = Math.max(1, (int) Math.round((double) bitmapLength / numApproxElements * Math.log(2)));
}
/**
* 取得自动计算的最优哈希函数个数
*/
public int getNumHashFunctions() {
return numHashFunctions;
}
/**
* 取得自动计算的最优Bitmap长度
*/
public int getBitmapLength() {
return bitmapLength;
}
public int getNumApproxElements() {
return numApproxElements;
}
public double getFpp() {
return fpp;
}
/**
* 计算一个元素值哈希后映射到Bitmap的哪些bit上。
*
* @param element 元素值
* @return bit下标的数组
*/
private long[] getBitIndices(String element) {
long[] indices = new long[numHashFunctions];
byte[] bytes = Hashing.murmur3_128()
.hashObject(element, Funnels.stringFunnel(Charset.forName("UTF-8")))
.asBytes();
long lowerHash = Longs.fromBytes(
bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]
);
long upperHash = Longs.fromBytes(
bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]
);
long combinedHash = lowerHash;
for (int i = 0; i < numHashFunctions; i++) {
indices[i] = (combinedHash & Long.MAX_VALUE) % bitmapLength;
combinedHash += upperHash;
}
return indices;
}
/**
* 插入元素
*
* @param key 原始Redis键,会自动加上'bf:'前缀
* @param element 元素值,字符串类型
* @param expireDate 失效时间,在expireDate时间失效
*/
public void insert(String key, String element, Date expireDate) {
if (key == null || element == null) {
throw new RuntimeException("键值均不能为空");
}
String actualKey = BF_KEY_PREFIX.concat(key);
long[] bitIndices = getBitIndices(element);
stringRedisTemplate.executePipelined(new RedisCallback() {
@Override
public Object doInRedis(RedisConnection connection) throws DataAccessException {
for (int i = 0; i < bitIndices.length; i++) {
long index = bitIndices[i];
connection.setBit(actualKey.getBytes(), index, true);
}
return null;
}
});
//设置失效时间
stringRedisTemplate.expireAt(actualKey,expireDate);
}
/**
* 获取当天23点59分59秒毫秒数
*
* @return
*/
public static Date getTwelveTime() {
Calendar calendar = Calendar.getInstance();
calendar.set(calendar.get(Calendar.YEAR),
calendar.get(Calendar.MONTH),
calendar.get(Calendar.DAY_OF_MONTH),
23,
59,
59);
return calendar.getTime();
}
/**
* 检查元素在集合中是否(可能)存在
*
* @param key 原始Redis键,会自动加上'bf:'前缀
* @param element 元素值,字符串类型
*/
public boolean mayExist(String key, String element) {
if (key == null || element == null) {
throw new RuntimeException("键值均不能为空");
}
String actualKey = BF_KEY_PREFIX.concat(key);
long[] bitIndices = getBitIndices(element);
List list = stringRedisTemplate.executePipelined(new RedisCallback() {
@Override
public Boolean doInRedis(RedisConnection connection) throws DataAccessException {
for (int i = 0; i < bitIndices.length; i++) {
long index = bitIndices[i];
connection.getBit(actualKey.getBytes(), index);
}
return null;
}
});
return !list.contains(Boolean.valueOf(false));
}
}
测试方法:
@RestController
public class TestRedisBloomFilter {
@RequestMapping("/bloom")
public void insertUserT() {
//大概3百万数据,误差率在10%作用。
RedisBloomFilter redisBloomFilter = new RedisBloomFilter(3000000, 0.1);
redisBloomFilter.insert("topic_read:20200812", "76930242", RedisBloomFilter.getTwelveTime());
redisBloomFilter.insert("topic_read:20200812", "76930243", RedisBloomFilter.getTwelveTime());
redisBloomFilter.insert("topic_read:20200812", "76930244", RedisBloomFilter.getTwelveTime());
redisBloomFilter.insert("topic_read:20200812", "76930245", RedisBloomFilter.getTwelveTime());
redisBloomFilter.insert("topic_read:20200812", "76930246", RedisBloomFilter.getTwelveTime());
System.out.println(redisBloomFilter.mayExist("topic_read:20200812", "76930242"));
System.out.println(redisBloomFilter.mayExist("topic_read:20200812", "76930244"));
System.out.println(redisBloomFilter.mayExist("topic_read:20200812", "76930246"));
System.out.println(redisBloomFilter.mayExist("topic_read:20200812", "76930248"));
}
}