simhash算法-检测两个txt文本相似性

自己实现

simhash实现

# https://www.cnblogs.com/-wenli/p/11150476.html
# 使用simhash对两个文本进行相似性检测

# simhash实现
class simhash:
    """SimHash fingerprint of a text, built from its top TF-IDF keywords.

    The fingerprint is kept as a string of '0'/'1' characters in the
    ``simhash`` instance attribute.  That attribute keeps its historical name
    and therefore shadows the ``simhash`` method on an instance once
    ``__init__`` has run; the method is only meant to be called once, from
    the constructor.
    """

    def __init__(self, content):
        """Compute and store the fingerprint of *content*.

        Args:
            content: iterable of strings; the pieces are joined with '|'
                before keyword extraction.
        """
        # At this point the instance attribute does not exist yet, so
        # ``self.simhash`` still resolves to the method defined below.
        self.simhash = self.simhash(content)

    def __str__(self):
        return str(self.simhash)

    def simhash(self, content):
        """Return the fingerprint of *content* as a '0'/'1' string.

        The ten highest-weighted keywords reported by
        ``jieba.analyse.extract_tags`` each vote on every bit position with
        their (scaled, truncated) weight.  When no keyword can be extracted
        the placeholder ``'00'`` is returned.

        NOTE: the original author reports having patched jieba's tfidf.py so
        that tags are sorted by ``itemgetter(1, 0)`` (weight first, then
        word) instead of ``itemgetter(1)``.
        """
        keywords = jieba.analyse.extract_tags(
            '|'.join(content), topK=10, withWeight=True, allowPOS=())
        if not keywords:
            # Nothing usable could be extracted (e.g. undecodable input).
            return '00'

        votes = []
        for feature, weight in keywords:
            weight = int(weight * 10)
            bits = self.string_hash(feature)
            # A '1' bit votes +weight for that position, a '0' bit -weight.
            votes.append([weight if bit == '1' else -weight for bit in bits])

        totals = np.sum(np.array(votes), axis=0)
        return ''.join('1' if total > 0 else '0' for total in totals)

    def similarity(self, other):
        """Return the ratio (0.0 to 1.0) of the smaller fingerprint value to the larger.

        Fingerprints are parsed as base-2 integers.  Two equal fingerprints,
        including two all-zero ones, yield 1.0 instead of dividing by zero.
        """
        a = int(self.simhash, 2)
        b = int(other.simhash, 2)
        if a == b:
            return 1.0
        smaller, larger = sorted((a, b))
        return smaller / larger

    def string_hash(self, source):
        """Return a 64-character '0'/'1' hash of the string *source*.

        The empty string hashes to 64 zeros, so the result can always be
        iterated bit by bit.
        """
        if source == "":
            return '0' * 64

        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # Keep the low 64 bits, left-padded with zeros to a fixed width.
        return format(x & (2 ** 64 - 1), '064b')

    def hammingDis(self, com):
        """Return the Hamming distance between this fingerprint and *com*'s."""
        diff = int(self.simhash, 2) ^ int(com.simhash, 2)
        return bin(diff).count('1')

txt两个文本相似性检测

def _load_stopwords(path):
    """Return the set of stop words listed one per line in the UTF-8 file *path*."""
    with open(path, encoding='utf-8') as f:
        return {line.rstrip() for line in f}


def _clean_text(raw, stoplist, punc):
    """Reduce *raw* to the concatenation of its kept jieba tokens, minus *punc* characters."""
    lower, upper = '\u4e00', '\u9fa5'
    text = re.sub(r'[^\w]+', '', raw)
    # Keep tokens longer than one character that compare within
    # [lower, upper] as strings and are not stop words.
    tokens = [tok for tok in jieba.cut(text)
              if len(tok) > 1 and lower <= tok <= upper and tok not in stoplist]
    return re.sub(r"[{}]+".format(punc), "", ''.join(tokens))


def get_line(fr1, fr2, threshold=3, stopwords_path="data/文章停用词.txt"):
    """Decide whether two UTF-8 text files are similar by SimHash distance.

    Both files are read, cleaned and tokenised with jieba, fingerprinted
    with ``simhash``, and compared by Hamming distance.  The text of *fr2*
    with newlines removed is printed, as in the original implementation.

    Args:
        fr1: path of the first text file.
        fr2: path of the second text file.
        threshold: largest Hamming distance still regarded as similar.  The
            default of 3 is the value the original code used; its comments
            mention 18 as a value still to be confirmed.
        stopwords_path: path of the stop-word list, one word per line.

    Returns:
        True when the Hamming distance between the two fingerprints is at
        most *threshold*, otherwise False.
    """
    # Characters removed from the joined tokens (used inside a regex class).
    punc = r'./ <>_ - - = ", 。,?!“”:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    stoplist = _load_stopwords(stopwords_path)

    with open(fr1, encoding='utf-8') as f:
        raw1 = f.read()
    with open(fr2, encoding='utf-8') as f:
        raw2 = f.read()
    print(re.sub(r'\n', '', raw2))

    line1 = _clean_text(raw1, stoplist, punc)
    line2 = _clean_text(raw2, stoplist, punc)

    hash1 = simhash(line1.split())
    hash2 = simhash(line2.split())
    return hash1.hammingDis(hash2) <= threshold
导入

# Similarity check smoke test: the same article is passed as both inputs.
fr1 = 'data/article/当我们对组件二次封装时我们在封装什么.txt'
fr2 = fr1

# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    get_line(fr1, fr2)

你可能感兴趣的:(python,算法,python,开发语言)