SimHash 计算

SimHash 计算_第1张图片

#!/usr/bin/python
# coding=utf-8

class simhash:

    #构造函数
    def __init__(self, tokens='', hashbits=128):        
        self.hashbits = hashbits
        self.hash = self.simhash(tokens);

    #toString函数 
    def __str__(self):
        return str(self.hash)

    #生成simhash值 
    def simhash(self, tokens):
        v = [0] * self.hashbits
        for t in [self._string_hash(x) for x in tokens]: #t为token的普通hash值 
            for i in range(self.hashbits):
                '100000.....0【2进制】'
                '2^n【10进制】'
                bitmask = 1 << i
                '第i为的值是否为1'
                if t & bitmask :
                    v[i] += 1 #查看当前bit位是否为1,是的话将该位+1
                else:
                    v[i] -= 1 #否则的话,该位-1
        '数组v保存每个词语的hash值的和(为1则+1,为0则-1)'
        '2进制转为10进制'
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint #整个文档的fingerprint为最终各个位>=0的和

    #求海明距离
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0;
        while x :
            tot += 1
            x &= x - 1
        return tot

    #求相似度
    def similarity (self, other):
        a = float(self.hash)
        b = float(other.hash)
        if a > b : return b / a
        else: return a / b

    #针对source生成hash值 (一个可变长度版本的Python的内置散列)
    #hash函数
    def _string_hash(self, source):        
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x             

if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    # s = 'This is a test string for testing also'
    # hash2 = simhash(s.split())

    # s = 'nai nai ge xiong cao'
    # hash3 = simhash(s.split())

    # print(hash1.hamming_distance(hash2) , "-" , hash1.similarity(hash2))
    # print(hash1.hamming_distance(hash3) , "-" , hash1.similarity(hash3))

你可能感兴趣的:(SimHash 计算)