相关介绍
- TF-IDF 算法
如果某个词在 给定文档集 的其他文章中很少出现,但是在其中的 某一篇文章 中出现的次数很多,
该词在很大程度上反映了 该文章 的特性,我们称该词为这篇文章的关键词
参考链接:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
- 余弦相似性
参考链接:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
测试案例
- 代码部分
import jieba.posseg as pseg
from gensim import corpora, models, similarities
class Tfidf:
    """Compare text similarity using a TF-IDF model.

    Texts are segmented with ``jieba.posseg``, filtered by ``stop_words``
    and ``stop_flag``, and then compared with gensim's ``TfidfModel`` and
    ``MatrixSimilarity``.
    """

    # Words removed from every segmented text.
    stop_words = ['酒店', '旅馆']
    # Part-of-speech flags (as reported by jieba.posseg) whose words are removed.
    stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

    def text2words(self, text: str) -> list:
        """Segment *text* and drop stop words and stop-flagged tokens.

        :param text: raw text to segment
        :return: the remaining words, in the order the segmenter produced them
        """
        return [
            word
            for word, flag in pseg.cut(text)
            if word not in self.stop_words and flag not in self.stop_flag
        ]

    def similarity_compare(self, compare_doc: str, refer_doc: list) -> tuple:
        """Find the reference document most similar to *compare_doc*.

        :param compare_doc: the document to compare
        :param refer_doc: the reference documents (a list of strings)
        :return: ``(index, similarity)`` where ``index`` is the position in
            ``refer_doc`` of the best match and ``similarity`` is its score;
            on ties the earliest reference wins
        :raises ValueError: if ``refer_doc`` is empty
        """
        if not refer_doc:
            # Without this guard the failure surfaces later as an opaque
            # error from the similarity index or from indexing an empty list.
            raise ValueError('refer_doc must contain at least one document')

        refer_words = []
        placeholder_count = 0
        for refer_text in refer_doc:
            words = self.text2words(refer_text)
            if not words:
                # Every reference must keep its slot so that result indices
                # line up with ``refer_doc``; substitute a placeholder token
                # for references that are empty after filtering.
                placeholder_count += 1
                words = self.text2words('placeholder' + str(placeholder_count))
            refer_words.append(words)

        dictionary = corpora.Dictionary(refer_words)
        doc_vectors = [dictionary.doc2bow(words) for words in refer_words]
        tf_idf = models.TfidfModel(doc_vectors)
        tf_idf_vectors = tf_idf[doc_vectors]

        # NOTE(review): the query is the raw bag-of-words vector and is not
        # passed through ``tf_idf`` before the lookup, while the indexed
        # documents are TF-IDF weighted. Confirm whether that is intended;
        # it is left unchanged here to keep the returned scores stable.
        compare_vectors = dictionary.doc2bow(self.text2words(compare_doc))
        index = similarities.MatrixSimilarity(
            tf_idf_vectors, num_features=len(dictionary))
        sims = index[compare_vectors]

        # Only the best match is needed, so take the maximum instead of
        # sorting the whole list. ``max`` keeps the first of equal scores,
        # which is the same element a stable descending sort puts first.
        return max(enumerate(sims), key=lambda item: item[1])
if __name__ == '__main__':
    # Demo: match one hotel name against a small set of reference hotels.
    # Each reference title maps to a list of (hotel_id, hotel_name) tuples.
    refers = {
        '普吉岛断点酒店': [(1, '普吉岛断点酒店')],
        '月亮海滩酒店': [(10386, '月亮海滩酒店')],
        '月亮海酒店': [(1564, '月亮海酒店')],
        '清莱海滩酒店': [(3467, '清莱艾美度假酒店')]
    }
    test = '月亮海滩旅馆'
    titles = list(refers)

    tfIdf = Tfidf()
    best_index, best_score = tfIdf.similarity_compare(test, titles)

    # Resolve the winning index back to its title and the first hotel ID
    # registered under that title.
    best_title = titles[best_index]
    hotel_id = refers[best_title][0][0]

    msg = "测试酒店 '%s' 和参照酒店中的 '%s' 最相似,相似度为 %f,对应酒店ID为:%s" \
        % (test, best_title, best_score, hotel_id)
    print(msg)
- 结果展示