1.python(Numpy实现)
具体公式见reference中的论文。
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 09:32:00 2018
@author: wangyao
"""
import jieba
#simhash值直接用包计算,pip install simhash
from simhash import Simhash
import re
import numpy as np
import pandas as pd
#停用词
stopwords = [line.strip() for line in open('Stopwords.txt', 'r', encoding='utf-8').readlines()]
#文本预处理+特征提取
def get_features(s):
width = 3
string = ''
s = re.sub(r'[^\w]+', '', s)
s = jieba.lcut(s)
X,Y = ['\u4e00','\u9fa5']
s = [ i for i in s if len(i) > 1 and X<=i<=Y and i not in stopwords]
for i in s:
string += i
if string:
return [string[i:i + width] for i in range(max(len(string) - width + 1, 1))]
else:
print ("请输入中文文档")
#list1 = df.content.apply(lambda x: isinstance(x, str))
#文本预处理
def Pre_Processing(s):
string = ''
s = re.sub(r'[^\w]+', '', s)
s = jieba.lcut(s)
X,Y = ['\u4e00','\u9fa5']
s = [i for i in s if len(i) > 1 and X<=i<=Y and i not in stopwords]
string = string.join(s)
if string:
return string
else:
print('请勿输入空字符串或者完全由停用词组成的无意义的句子')
#simhash包自带的汉明距离
def hanming_simhash(s1,s2):
hanmingdistance = Simhash(Pre_Processing(s1)).distance(Simhash(Pre_Processing(s2)))
#return hanming_distance
return 1-hanmingdistance/64
#将字符串转化为hashcode
def ToSimhashcode(s):
if type(s) == str:
return Simhash(get_features(s)).value
else:
print('输入的句子格式需要是字符串')
#自己写的汉明距离
def hanming_distance(s1,s2):
if type(s1) == str and type(s2) == str:
hanmingdistance = bin(int(hex(Simhash(get_features(s1)).value),16)^int(hex(Simhash(get_features(s2)).value),16)).count('1')
elif type(s1) == int and type(s2) == int:
hanmingdistance = bin(int(hex(s1),16)^int(hex(s2),16)).count('1')
else:
print ('s1和s2需要是相同的数据类型!')
#return hanming_distance
return 1-hanmingdistance/64
#直接填入文本列表,生成相似度矩阵(文本较少的时候可以使用此函数)
def simhash_simmatrix_doc(doc_list):
simhash_corpus = []
for i in range(len(doc_list)):
simhash_corpus.append([])
for i in range(len(doc_list)):
print (i)
for j in range(i,len(doc_list)):
simhash_corpus[j].append(hanming_distance(doc_list[i],doc_list[j]))
x = len(simhash_corpus)-1
for i in range(len(simhash_corpus)-1):
simhash_corpus[i].extend(x*[0])
x = x - 1
return np.array(simhash_corpus) + np.array(simhash_corpus).T
#填入文本的hashcode,生成相似度矩阵
def simhash_simmatrix_hashcode(hashcode_list):
simhash_corpus = []
for i in range(len(hashcode_list)):
simhash_corpus.append([])
for i in range(len(hashcode_list)):
print (i)
for j in range(i,len(hashcode_list)):
simhash_corpus[j].append(hanming_distance(hashcode_list[i],hashcode_list[j]))
x = len(simhash_corpus)-1
for i in range(len(simhash_corpus)-1):
simhash_corpus[i].extend(x*[0])
x = x - 1
return np.array(simhash_corpus) + np.array(simhash_corpus).T
#过滤文本生成的相似度矩阵,
def DuplicateContent_filtering_doc(doc_list,sim = 0.8):
if sim>1 or sim<0:
print ('错误的取值范围!!范围应该是[0,1]')
sim_matrix = simhash_simmatrix_doc(doc_list)>sim
return sim_matrix
#过滤hashcode生成的相似度矩阵
def DuplicateContent_label_hashcode(hashcode_array,sim = 0.8):
if sim>1 or sim<0:
print ('错误的取值范围!!范围应该是[0,1]')
sim_matrix = hashcode_array>sim
return sim_matrix
#列表求交集
def list_intersection(list1,list2):
list3 = list(set(list1) & set(list2))
return list3
#清洗重复的文章列表
def clean_Duplicate_List(Duplicate_List):
Duplicate_List = [list(set(i)) for i in Duplicate_List]
Duplicate = []
for i in range(len(Duplicate_List)):
temp = []
for j in range(len(Duplicate_List)):
if list_intersection(Duplicate_List[i],Duplicate_List[j]) != []:
temp.extend(Duplicate_List[i])
temp.extend(Duplicate_List[j])
Duplicate.append(temp)
Duplicate = [list(set(i)) for i in Duplicate]
NoDuplicate = []
for i in Duplicate:
if i in NoDuplicate:
pass
else:
NoDuplicate.append(i)
NoDuplicate = [list(set(i)) for i in NoDuplicate]
return NoDuplicate
#读入相似度矩阵,返回非重复的文章ID和重复的文章ID
def DuplicateContent_filtering_hashcode(matrix):
NoDuplicateList = []
DuplicateList = []
NoDuplicate_List = []
Duplicate_List = []
for i in range(len(matrix)):
DuplicateList.append([])
for i in range(len(matrix)):
NoDuplicateList.append([])
if (matrix[i] == True).sum() <= 1:
NoDuplicateList[i].append(ID[i])
else:
for j in range(len(matrix[i])):
if matrix[i][j]==True and i < j:
DuplicateList[i].extend([i,j])
for i in DuplicateList:
if i != []:
Duplicate_List.append(i)
else:
pass
for i in NoDuplicateList:
if i != []:
NoDuplicate_List.append(i)
else:
pass
return NoDuplicate_List,clean_Duplicate_List(Duplicate_List)
短文本,如果文本很短,可以直接调用simhash_simmatrix_doc,直接读入字符串即可产生结果。
df = pd.read_excel('00.xlsx')
df.content = df.content.apply(Pre_Processing)
content_list = df.content.tolist()
simhash_simmatrix_doc(content_list)
如果需要存进hive表中,需要先把矩阵转换成pandas中的dataframe再转化为spark中的dataframe。
//转化
values=pandas_df.values.tolist()
cols_name=list(pandas_df.columns)
spark_df = spark.createDataFrame(values, cols_name)
turn_pandas_df = spark_df.toPandas()
如果是是直接使用spark,两两比较可以使用reducebykey来聚合,查找的时候才需要使用索引。如果是在python中按照上面的代码两两比较会非常耗时,所以比较的时候就需要引入simhash的索引。
网上的例子:
from simhash import Simhash, SimhashIndex
# 建立索引
data = {
u'1': u'How are you I Am fine . blar blar blar blar blar Thanks .'.lower().split(),
u'2': u'How are you i am fine .'.lower().split(),
u'3': u'This is simhash test .'.lower().split(),
}
objs = [(id, Simhash(sent)) for id, sent in data.items()]
index = SimhashIndex(objs, k=10) # k是容忍度;k越大,检索出的相似文本就越多
# 检索
s1 = Simhash(u'How are you . blar blar blar blar blar Thanks'.lower().split())
print index.get_near_dups(s1)
# 增加新索引
index.add(u'4', s1)
2.scala实现
scala中没有simhash包,需要自己实现FNVHash来给文本转码。
目前用的比较多的是FNV-1和FNV-1a,FNV-0已弃用。
FNV-1:
hash = offset_basis
for each octet_of_data to be hashed
hash = hash * FNV_prime
hash = hash xor octet_of_data
return hash
FNV-1a:
hash = offset_basis
for each octet_of_data to be hashed
hash = hash xor octet_of_data
hash = hash * FNV_prime
return hash
FNV-1a算法只是将相乘和异或运算进行了顺序调换。
import java.math.BigInteger
object FNVHash {
val HASH_BITS = 64
val FNV_64_INIT = new BigInteger("14695981039346656037")
val FNV_64_PRIME = new BigInteger("1099511628211")
val MASK_64: BigInteger = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE)
val str = "电风扇卡机是可敬的搞屎棍的"
val str1 = "电风扇卡机是可敬的搞屎棍的对对对"
/**
* fnv-1 hash算法,将字符串转换为64位hash值
*
* @param str str
* @return
*/
def fnv1Hash64(str: String): BigInteger = {
var hash = FNV_64_INIT
val len = str.length
for (i <- 0 until len) {
hash = hash.multiply(FNV_64_PRIME)
hash = hash.xor(BigInteger.valueOf(str.charAt(i)))
}
hash = hash.and(MASK_64)
hash
}
/**
* fnv-1a hash算法,将字符串转换为64位hash值
*
* @param str str
* @return
*/
def fnv1aHash64(str: String): BigInteger = {
var hash = FNV_64_INIT
val len = str.length
var i = 0
for (i <- 0 until len) {
hash = hash.xor(BigInteger.valueOf(str.charAt(i)))
hash = hash.multiply(FNV_64_PRIME)
}
hash = hash.and(MASK_64)
hash
}
/**
* 返回二进制串hash距离
*
* @param str1 str1
* @param str2 str2
* @return
*/
def getDistance(str1: String, str2: String): Int = {
var distance = 0
if (str1.length != str2.length) distance = -1
else {
distance = 0
val len = str1.length
for (i <- 0 until len) {
if (str1.charAt(i) != str2.charAt(i)) distance += 1
}
}
distance
}
def hashContent_d(content: Array[String]): BigInteger = {
val featureVector: Array[Double] = new Array[Double](FNVHash.HASH_BITS)
val wordInfos = content.map((_, 1)).groupBy(_._1).
map(r => (r._1, r._2.length.toDouble)).
map(r => {
val wordHash = FNVHash.fnv1aHash64(r._1)
for (i <- 0 until FNVHash.HASH_BITS) {
val bitmask: BigInteger = BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1)
if (wordHash.and(bitmask).signum != 0)
featureVector(i) += r._2.toDouble
else
featureVector(i) -= r._2.toDouble
}
})
var signature: BigInteger = BigInteger.ZERO
for (i <- 0 until FNVHash.HASH_BITS) {
if (featureVector(i) >= 0) {
signature = signature.add(BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1))
}
}
signature
}
def hashContent(content: Array[String]): BigInteger = {
val featureVector: Array[Double] = new Array[Double](HASH_BITS)
val wordInfos = content.map((_, 1)).groupBy(_._1).
map(r => (r._1, r._2.length.toDouble)).
map(r => {
val wordHash = fnv1aHash64(r._1)
for (i <- 0 until HASH_BITS) {
val bitmask: BigInteger = BigInteger.ONE.shiftLeft(HASH_BITS - i - 1)
if (wordHash.and(bitmask).signum != 0)
featureVector(i) += r._2.toDouble
else
featureVector(i) -= r._2.toDouble
}
})
var signature: BigInteger = BigInteger.ZERO
for (i <- 0 until HASH_BITS) {
if (featureVector(i) >= 0) {
signature = signature.add(BigInteger.ONE.shiftLeft(HASH_BITS - i - 1))
}
}
signature
}
def hashContentStr(content: String): BigInteger = {
hashContent(content.split(" "))
}
// 判断汉明距离是否大于kBits
def distance(hash1: Long, hash2: Long, kBits: Int): Boolean = {
var n = hash1 ^ hash2
for (i <- 0 until kBits if n > 0) {
n = n & (n - 1)
}
n == 0
}
}
Reference:
《Similarity estimation techniques from rounding algorithms》
https://www.cnblogs.com/maybe2030/p/5203186.html#_label4