布隆过滤器(Bloom Filter)
是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都比一般的算法要好得多,缺点是有一定的误识别率和删除困难。
布隆过滤器(Bloom Filter)
优缺点
优点:
布隆过滤器(Bloom Filter)
公式
误识别率公式:
$$p \approx \left(1 - e^{-\frac{kn}{m}}\right)^{k}$$
公式变换:
$$m = -\frac{n \ln p}{(\ln 2)^{2}}$$
$$k = \frac{m}{n} \ln 2$$
p
误报率
k
哈希的次数
m
布隆过滤器的长度(如比特数组的大小)
n
是已经添加元素的数量
布隆过滤器(Bloom Filter)
应用场景
guava由谷歌公司提供,里面提供了布隆过滤器的实现。
# 添加依赖
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>30.1.1-jre</version>
</dependency>
# guava过滤器实现
/**
 * Measures the empirical false-positive rate of a Guava {@code BloomFilter}.
 *
 * <p>The filter is created for {@code n} expected insertions with a target
 * false-positive probability of 0.01. The decimal strings {@code "0"} to
 * {@code "n-1"} are inserted, then {@code "0"} to {@code "2n-1"} are probed.
 * The printed value is {@code (hits - n) / n}, i.e. the share of the
 * {@code n} never-inserted probes that the filter reported as present.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final int n = 1000000;
    BloomFilter<CharSequence> bloomFilter =
            BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), n, 0.01);

    // Populate the filter with "0" .. "n-1".
    int next = 0;
    while (next < n) {
        bloomFilter.put(String.valueOf(next));
        next++;
    }

    // Probe twice the inserted range; the upper half was never inserted.
    int hits = 0;
    for (int probe = 0; probe < n * 2; probe++) {
        hits += bloomFilter.mightContain(String.valueOf(probe)) ? 1 : 0;
    }

    double rate = (hits - n) / Double.valueOf(n);
    System.out.println("过滤器误判率:" + rate);
}
# 与上述设定的误判率0.01相吻合
过滤器误判率:0.010039
Redis实现布隆过滤器的底层是通过bitmap数据结构。
#添加依赖
<dependency>
<groupId>org.redisson</groupId>
<artifactId>redisson</artifactId>
<version>3.17.4</version>
</dependency>
/**
 * Measures the empirical false-positive rate of a Redisson {@code RBloomFilter}
 * stored under the Redis key {@code "bloomnumber"}.
 *
 * <p>The decimal strings {@code "0"} to {@code "n-1"} are added, then
 * {@code "0"} to {@code "2n-1"} are probed. The printed value is
 * {@code (hits - n) / n}, i.e. the share of the {@code n} never-added probes
 * that the filter reported as present.
 *
 * @param args unused
 */
public static void main(String[] args) {
    Config config = new Config();
    config.useSingleServer().setAddress("redis://127.0.0.1:26379");
    config.useSingleServer().setPassword("myredis");
    config.useSingleServer().setDatabase(0);

    RedissonClient client = Redisson.create(config);
    try {
        RBloomFilter<Object> bloomFilter = client.getBloomFilter("bloomnumber");

        // One constant drives both the filter sizing and the test loops so they cannot drift apart.
        final int n = 1000000;
        // Size the filter for n expected insertions at a 1% false-positive probability.
        // NOTE(review): the return value of tryInit is ignored, as before. If the key already
        // exists from an earlier run with different sizing, the old configuration presumably
        // stays in effect, which would skew the measured rate. Confirm against the Redisson docs.
        bloomFilter.tryInit((long) n, 0.01);

        for (int i = 0; i < n; i++) {
            bloomFilter.add(String.valueOf(i));
        }

        int count = 0;
        for (int i = 0; i < (n * 2); i++) {
            if (bloomFilter.contains(String.valueOf(i))) {
                count++;
            }
        }
        System.out.println("过滤器误判率:" + (count - n) / Double.valueOf(n));
    } finally {
        // Release the client's connections and threads; the original never shut the client
        // down, which leaks them and can keep the JVM from exiting.
        client.shutdown();
    }
}
# 不知是否我配置问题,redisson的误判率比预设误判率高了不少
过滤器误判率:0.023091
pom依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>30.1.1-jre</version>
</dependency>
redis配置
/**
 * Spring configuration that registers a {@code RedisTemplate<String, Object>} bean
 * with string serialization for keys and hash keys, and JSON serialization for
 * values and hash values.
 */
@Configuration
public class RedisConfig {

    /**
     * Builds the {@code RedisTemplate} bean.
     *
     * @param factory connection factory the template uses to talk to Redis
     * @return the configured and initialized template
     */
    @Bean
    public RedisTemplate<String, Object> redisTemplate(RedisConnectionFactory factory) {
        RedisTemplate<String, Object> redisTemplate = new RedisTemplate<>();
        redisTemplate.setConnectionFactory(factory);

        // Plain keys and hash keys are written as strings.
        redisTemplate.setKeySerializer(RedisSerializer.string());
        redisTemplate.setHashKeySerializer(RedisSerializer.string());

        // Plain values and hash values are written as JSON.
        redisTemplate.setValueSerializer(RedisSerializer.json());
        redisTemplate.setHashValueSerializer(RedisSerializer.json());

        // Apply the settings above before handing the template out.
        redisTemplate.afterPropertiesSet();
        return redisTemplate;
    }
}
自定义布隆过滤器内置计算相关方法
/**
 * Computes Bloom-filter sizing parameters and the bit offsets that represent a value.
 *
 * <p>This class does not hold the bit array itself; it only derives the array length,
 * the number of hash functions, and, per value, the list of bit positions to set or test.
 *
 * @param <T> type of the values being hashed
 */
public class CustomBloomFilterHelper<T> {
    /** Number of bit positions derived for each value (k). */
    private final int numHashFunctions;
    /** Length of the bit array in bits (m); always at least 1. */
    private final long bitSize;
    /** Converts a value into the bytes fed to the hash function. */
    private final Funnel<T> funnel;

    /**
     * Creates a helper sized for the given load and false-positive probability.
     *
     * @param funnel             funnel used to hash values; must not be null
     * @param expectedInsertions expected number of inserted values; must be &gt;= 0
     *                           (0 is treated as 1 so the sizing formulas stay defined)
     * @param fpp                desired false-positive probability, in {@code [0.0, 1.0)};
     *                           0.0 is replaced by the smallest positive double
     * @throws IllegalArgumentException if any argument is out of range
     */
    public CustomBloomFilterHelper(Funnel<T> funnel, int expectedInsertions, double fpp) {
        Preconditions.checkArgument(funnel != null, "funnel不能为空");
        // Out-of-range inputs used to produce bitSize <= 0, which only surfaced later as a
        // divide-by-zero (or meaningless offsets) inside murmurHashOffset. Reject them here.
        Preconditions.checkArgument(expectedInsertions >= 0, "expectedInsertions不能小于0");
        // The comparison is also false for NaN, so NaN is rejected as well.
        Preconditions.checkArgument(fpp >= 0.0D && fpp < 1.0D, "fpp必须在[0.0, 1.0)范围内");
        this.funnel = funnel;
        long insertions = Math.max(1L, (long) expectedInsertions);
        // Clamp to one bit: a tiny n with fpp close to 1 truncates to 0 bits.
        this.bitSize = Math.max(1L, optimalNumOfBits(insertions, fpp));
        this.numHashFunctions = optimalNumOfHashFunctions(insertions, bitSize);
    }

    /**
     * Computes the bit-array length: {@code m = -n * ln(p) / (ln 2)^2}.
     *
     * @param n expected number of inserted values
     * @param p false-positive probability
     * @return the bit-array length, truncated to a long
     */
    private long optimalNumOfBits(long n, double p) {
        if (p == 0.0D) {
            // ln(0) is -infinity; substitute the smallest positive double instead.
            p = Double.MIN_VALUE;
        }
        double ln2 = Math.log(2.0D);
        return (long) ((double) (-n) * Math.log(p) / (ln2 * ln2));
    }

    /**
     * Computes the number of hash functions: {@code k = m / n * ln 2}, at least 1.
     *
     * @param n expected number of inserted values
     * @param m bit-array length
     * @return the number of hash functions
     */
    private int optimalNumOfHashFunctions(long n, long m) {
        return Math.max(1, (int) Math.round((double) m / (double) n * Math.log(2.0D)));
    }

    /**
     * Derives the bit offsets for a value.
     *
     * <p>The value is hashed once with 128-bit Murmur3. The two 64-bit halves are combined
     * as {@code hash1 + i * hash2} to produce {@code numHashFunctions} positions, each
     * reduced into {@code [0, bitSize)}.
     *
     * @param value value to hash
     * @return {@code numHashFunctions} offsets (duplicates are possible)
     */
    public List<Long> murmurHashOffset(T value) {
        List<Long> offsets = new ArrayList<>(numHashFunctions);
        byte[] bytes = Hashing.murmur3_128().hashObject(value, funnel).asBytes();
        long hash1 = lowerEight(bytes);
        long hash2 = upperEight(bytes);
        long combinedHash = hash1;
        for (int i = 0; i < numHashFunctions; i++) {
            // Clear the sign bit so the remainder is never negative.
            offsets.add((combinedHash & Long.MAX_VALUE) % bitSize);
            combinedHash += hash2;
        }
        return offsets;
    }

    /** Assembles bytes 0..7 into a long, with byte 0 as the least significant byte. */
    private long lowerEight(byte[] bytes) {
        return Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
    }

    /** Assembles bytes 8..15 into a long, with byte 8 as the least significant byte. */
    private long upperEight(byte[] bytes) {
        return Longs.fromBytes(bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
    }
}
Lua文件
-- Add data: set every bit offset passed in ARGV to 1 on the bitmap stored at KEYS[1].
-- (Lua comments start with "--"; the original "//" line is a syntax error in Lua.)
for i = 1, #ARGV
do
    redis.call('SETBIT', KEYS[1], ARGV[i], 1)
end
-- Query data: return 1 only if every bit offset passed in ARGV is set on the bitmap
-- stored at KEYS[1]; return 0 as soon as one unset bit is found.
-- (Lua comments start with "--"; the original "//" line is a syntax error in Lua.)
for i = 1, #ARGV
do
    local value = redis.call('GETBIT', KEYS[1], ARGV[i])
    if value == 0
    then return 0
    end
end
return 1
布隆过滤器添加及判断存在方法
/**
 * Bloom filter whose bit array lives in a Redis bitmap.
 *
 * <p>Bit offsets are computed by a {@code CustomBloomFilterHelper}. Bits are written and
 * read through the Lua scripts {@code bloomFilterPut.lua} and
 * {@code bloomFilterMightContain.lua}, resolved from the classpath.
 *
 * @param <T> unused by the methods below, which declare their own type parameter;
 *            kept so existing declarations of this type keep compiling
 */
@Component
public class RedisBloomFilter<T> {
    @Autowired
    private RedisTemplate<String, Object> redisTemplate;

    // Script objects are built once per bean instead of on every call, so the script
    // resource is not re-wrapped and its SHA1 is not recomputed for each operation.
    private final DefaultRedisScript<Boolean> putScript =
            loadScript("bloomFilterPut.lua", Boolean.class);
    private final DefaultRedisScript<Long> mightContainScript =
            loadScript("bloomFilterMightContain.lua", Long.class);

    /**
     * Adds a single value to the filter stored at {@code key}.
     *
     * @param bloomFilter helper that computes the bit offsets; must not be null
     * @param key         Redis key of the bitmap
     * @param value       value to add
     * @throws IllegalArgumentException if {@code bloomFilter} is null
     */
    public <T> void put(CustomBloomFilterHelper<T> bloomFilter, String key, T value) {
        Preconditions.checkArgument(bloomFilter != null, "bloomFilter不能为空");
        List<Long> offset = bloomFilter.murmurHashOffset(value);
        if (CollectionUtils.isEmpty(offset)) {
            return;
        }
        setBits(key, offset.toArray());
    }

    /**
     * Adds several values to the filter stored at {@code key} with one script call.
     *
     * @param bloomFilter helper that computes the bit offsets; must not be null
     * @param key         Redis key of the bitmap
     * @param values      values to add; a null or empty list is a no-op
     * @throws IllegalArgumentException if {@code bloomFilter} is null
     */
    public <T> void batchPut(CustomBloomFilterHelper<T> bloomFilter, String key, List<T> values) {
        Preconditions.checkArgument(bloomFilter != null, "bloomFilter不能为空");
        if (CollectionUtils.isEmpty(values)) {
            return;
        }
        // Collect offsets into a set so each distinct bit is sent to Redis only once.
        Set<Long> offsets = new HashSet<>();
        for (T value : values) {
            offsets.addAll(bloomFilter.murmurHashOffset(value));
        }
        if (offsets.isEmpty()) {
            return;
        }
        setBits(key, offsets.toArray());
    }

    /**
     * Tests whether a value may have been added to the filter stored at {@code key}.
     *
     * @param bloomFilter helper that computes the bit offsets; must not be null
     * @param key         Redis key of the bitmap
     * @param value       value to test
     * @return {@code true} if the script reports every bit as set; {@code false} if any
     *         bit is unset, no offsets were produced, or the script returned no result
     * @throws IllegalArgumentException if {@code bloomFilter} is null
     */
    public <T> boolean mightContain(CustomBloomFilterHelper<T> bloomFilter, String key, T value) {
        Preconditions.checkArgument(bloomFilter != null, "bloomFilter不能为空");
        List<Long> offset = bloomFilter.murmurHashOffset(value);
        if (CollectionUtils.isEmpty(offset)) {
            return false;
        }
        Long result = redisTemplate.execute(mightContainScript, singleKey(key), offset.toArray());
        // Null-safe comparison: unboxing a null result would throw a NullPointerException.
        return result != null && result == 1L;
    }

    /** Runs the put script, setting the given bit offsets on the bitmap at {@code key}. */
    private void setBits(String key, Object[] offsets) {
        redisTemplate.execute(putScript, singleKey(key), offsets);
    }

    /** Wraps a key in the list form that script execution expects. */
    private static List<String> singleKey(String key) {
        List<String> keys = new ArrayList<>(1);
        keys.add(key);
        return keys;
    }

    /** Creates a script definition backed by a classpath resource. */
    private static <R> DefaultRedisScript<R> loadScript(String classpathLocation, Class<R> resultType) {
        DefaultRedisScript<R> script = new DefaultRedisScript<>();
        script.setScriptSource(new ResourceScriptSource(new ClassPathResource(classpathLocation)));
        script.setResultType(resultType);
        return script;
    }
}
测试用例
/**
 * Start-up check that measures the empirical false-positive rate of the Redis-backed
 * Bloom filter.
 *
 * <p>It adds the decimal strings {@code "0"} to {@code "999999"} in batches, probes
 * {@code "0"} to {@code "1999999"}, and prints the share of never-added probes that the
 * filter reported as present.
 */
@Component
public class BloomFilterApplication implements ApplicationRunner {
    /** Number of values inserted, and also the sizing hint given to the helper. */
    private static final int EXPECTED_INSERTIONS = 1000000;
    /** Target false-positive probability. */
    private static final double FPP = 0.01;
    /** Number of values sent to Redis per batch. */
    private static final int BATCH_SIZE = 1000;
    /** Redis key of the bitmap. */
    private static final String KEY = "bloom";

    // Instance state rather than a static written from an instance @PostConstruct method.
    private CustomBloomFilterHelper<CharSequence> bloomFilterHelper;

    // Wildcard instead of the raw type, so calls to the generic methods are type-checked.
    @Autowired
    RedisBloomFilter<?> redisBloomFilter;

    /** Builds the offset helper once the bean has been constructed. */
    @PostConstruct
    public void init() {
        // Use a fixed charset: with the platform default, the same string could hash to
        // different bit offsets on hosts that share this Redis key.
        bloomFilterHelper = new CustomBloomFilterHelper<>(
                Funnels.stringFunnel(java.nio.charset.StandardCharsets.UTF_8), EXPECTED_INSERTIONS, FPP);
    }

    /**
     * Fills the filter, probes twice the inserted range, and prints the false-positive rate.
     *
     * @param args application arguments (unused)
     * @throws Exception declared by {@code ApplicationRunner}
     */
    @Override
    public void run(ApplicationArguments args) throws Exception {
        // Typed as CharSequence to match the helper's type parameter.
        List<CharSequence> data = new ArrayList<>(EXPECTED_INSERTIONS);
        for (int i = 0; i < EXPECTED_INSERTIONS; i++) {
            data.add(String.valueOf(i));
        }
        for (List<CharSequence> batch : Lists.partition(data, BATCH_SIZE)) {
            redisBloomFilter.batchPut(bloomFilterHelper, KEY, batch);
        }

        // The upper half of the probed range was never inserted, so hits beyond
        // EXPECTED_INSERTIONS are false positives.
        int hits = 0;
        for (int i = 0; i < 2 * EXPECTED_INSERTIONS; i++) {
            if (redisBloomFilter.mightContain(bloomFilterHelper, KEY, String.valueOf(i))) {
                hits++;
            }
        }
        System.out.println("误判率:" + ((hits - EXPECTED_INSERTIONS) / (double) EXPECTED_INSERTIONS));
    }
}
// 输出误判率:0.010328