simhash算法分为5个步骤:分词、hash、加权、合并、降维,具体过程如下所述:
计算差异度:
将两文本的simhash进行异或,生成的结果中1的位数即为差异度(海明距离)。
simhash存储:以64位simhash为例。
最大个数2^64;平均查询次数2^63
最大占用内存:2^64*(64*2+12)+24,其中12为char[]对象头,24为8个字节的String对象头、4个字节的char数组引用、3个int变量(offset、hash、count)之和
采用索引 HashMap<索引值, List<simhash>>,如下图
平均查询次数:常数 O(1)
最大占用内存:4*2^16*(16*2+12)+24;
注意:索引的个数一般为差异度+1(这样能确保至少有一个索引相同)
同时要求 simhash位数(hashbits) % 索引个数 == 0;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.HashMultimap;
import java.math.BigInteger;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
/**
 * SimHash fingerprint of a string, used for near-duplicate detection.
 *
 * <p>The fingerprint is built in the usual simhash steps: tokenize, hash every
 * token, add/subtract a weight per bit position, merge the votes, and reduce
 * them to a bit vector. In this implementation every UTF-16 {@code char} of the
 * input is one token with weight 1.
 *
 * <p>Two fingerprints are compared with {@link #hammingDistance(SimHash)}.
 * {@link #storIndex(SimHash, int)} cuts a fingerprint into fixed-width segments
 * and records them in a static index shared by all instances; no
 * synchronization is performed on that index here.
 */
public class SimHash {

    /** Fingerprint width used by {@link #SimHash(String)}. */
    private static final int DEFAULT_HASH_BITS = 64;

    /** Multiplier of the per-token string hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Shared index: segment position -> fingerprint segments stored at that position. */
    private static final HashMultimap<Integer, String> characters = HashMultimap.create();

    private final String tokens;

    private final int hashbits;

    /** Fingerprint as a non-negative integer; bit {@code i} is fingerprint position {@code i}. */
    private final BigInteger intSimHash;

    /** Fingerprint as a '0'/'1' string; character {@code i} is bit {@code i} (least significant first). */
    private String strSimHash;

    /**
     * Builds a 64-bit fingerprint of {@code tokens}.
     *
     * @param tokens text to fingerprint
     */
    public SimHash(String tokens) {
        this(tokens, DEFAULT_HASH_BITS);
    }

    /**
     * Builds a fingerprint of {@code tokens} with the given width.
     *
     * @param tokens   text to fingerprint
     * @param hashbits number of bits in the fingerprint
     */
    public SimHash(String tokens, int hashbits) {
        this.tokens = tokens;
        this.hashbits = hashbits;
        this.intSimHash = this.simHash();
    }

    /**
     * Computes the fingerprint of the stored text and refreshes the string form
     * of the fingerprint as a side effect.
     *
     * @return the fingerprint as a non-negative integer
     */
    public BigInteger simHash() {
        int[] votes = new int[this.hashbits];
        // The tokenizer can be swapped as needed; here each char is one token.
        for (char ch : this.tokens.toCharArray()) {
            BigInteger tokenHash = this.hash(String.valueOf(ch));
            // Weighted merge: +1 where the token hash has a 1 bit, -1 otherwise.
            for (int i = 0; i < this.hashbits; i++) {
                // testBit(i) looks at exactly bit i. The previous code shifted one
                // running mask by i on every iteration, so position i actually
                // tested bit i*(i+1)/2 and all higher positions were always voted 0.
                if (tokenHash.testBit(i)) {
                    votes[i]++;
                } else {
                    votes[i]--;
                }
            }
        }

        // Dimension reduction: a non-negative vote (ties included) yields a 1 bit.
        BigInteger fingerprint = BigInteger.ZERO;
        StringBuilder bits = new StringBuilder(this.hashbits);
        for (int i = 0; i < this.hashbits; i++) {
            if (votes[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
                bits.append('1');
            } else {
                bits.append('0');
            }
        }
        this.strSimHash = bits.toString();
        return fingerprint;
    }

    /**
     * Hashes one token to a non-negative value narrower than {@code hashbits} bits
     * (multiply-and-xor over the characters, then xor with the length).
     *
     * @param source token to hash; {@code null} or empty hashes to zero
     * @return the token hash
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (int i = 0; i < source.length(); i++) {
            x = x.multiply(HASH_MULTIPLIER)
                    .xor(BigInteger.valueOf((long) source.charAt(i)))
                    .and(mask);
        }
        // x is masked and therefore never negative, so no -1 special case is needed.
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Returns the Hamming distance to another fingerprint: the number of 1 bits
     * in the XOR of the two fingerprints.
     *
     * @param other fingerprint to compare with
     * @return number of differing bit positions
     */
    public int hammingDistance(SimHash other) {
        // Both fingerprints are non-negative, so bitCount() is the plain popcount.
        return this.intSimHash.xor(other.intSimHash).bitCount();
    }

    /**
     * Calculates the Hamming distance between two strings by comparing them
     * character by character. Serves as a string-based cross-check of the
     * binary computation.
     *
     * @param str1 the 1st string
     * @param str2 the 2nd string
     * @return Hamming distance between str1 and str2, or -1 when their lengths differ
     */
    public static int getDistance(String str1, String str2) {
        if (str1.length() != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < str1.length(); i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Splits the string form of a fingerprint into consecutive segments of
     * {@code numEach} characters and stores each segment in the shared index
     * under its segment position (0, 1, 2, ...). For example, with a Hamming
     * distance threshold of 3 the fingerprint is split into four segments.
     *
     * <p>{@code hashbits} is expected to be a multiple of {@code numEach}; if it
     * is not, the last segment is simply shorter.
     *
     * @param simHash fingerprint whose segments are stored
     * @param numEach segment width, i.e. hashbits / (Hamming distance + 1)
     * @throws IllegalArgumentException if {@code numEach} is not positive
     */
    public void storIndex(SimHash simHash, int numEach) {
        if (numEach <= 0) {
            // A zero step would never advance the loop below.
            throw new IllegalArgumentException("numEach must be positive: " + numEach);
        }
        String bits = simHash.strSimHash;
        int index = 0;
        for (int start = 0; start < simHash.hashbits; start += numEach) {
            int end = Math.min(start + numEach, bits.length());
            characters.put(index++, bits.substring(start, end));
        }
    }

    /**
     * Demo value factory: logs the call and returns the key with "1" appended.
     *
     * @param key cache key
     * @return {@code key + "1"}
     */
    public static String createObject(String key) {
        System.out.println("createObject");
        return key + "1";
    }

    /**
     * Small demo: fingerprints two strings, prints both fingerprints and their
     * Hamming distance, then indexes both in four 16-bit segments.
     */
    public static void main(String[] args) throws ExecutionException {
        // Disabled Guava LoadingCache experiment, kept for reference:
        // LoadingCache cahceBuilder= CacheBuilder
        // .newBuilder()
        // .maximumSize(10)
        // .expireAfterAccess(100, TimeUnit.SECONDS)// expiry time
        // .expireAfterWrite(11,TimeUnit.SECONDS)// no write access within the given time
        // .refreshAfterWrite(1,TimeUnit.SECONDS)// no read access within the given time
        // .build(new CacheLoader(){
        // @Override
        // public String load(String key) throws Exception {
        // return createObject(key);
        // }
        //
        // });
        // cahceBuilder.put("1","1");
        // cahceBuilder.refresh("1");
        // System.out.println(cahceBuilder.get("1"));
        String s = "哈哈哈";
        SimHash hash1 = new SimHash(s, 64);
        System.out.println(hash1.strSimHash);
        String s1 = "我是哈哈,";
        SimHash hash = new SimHash(s1, 64);
        System.out.println(hash.strSimHash);
        System.out.println(hash.hammingDistance(hash1));
        hash.storIndex(hash, 64 / 4);
        hash1.storIndex(hash1, 64 / 4);
        System.out.println(characters);
    }
}