# 参考知乎上一二三冲鸭同学的博客,根据自己理解对他的方法再实现一遍~
"""
计算两句话的余弦相似度
源文: 使用余弦相似度算法计算文本相似度 - 一二三冲鸭的文章 - 知乎
https://zhuanlan.zhihu.com/p/43396514
"""
import math
from typing import List, Dict, AnyStr
import jieba
def separate(text_data: AnyStr) -> List:
    """Tokenize *text_data* with jieba in precise (non-full) mode.

    CJK comma and full stop tokens are filtered out; all other tokens are
    returned in segmentation order.
    """
    punctuation = {',', '。'}
    tokens = jieba.cut(text_data, cut_all=False)
    return [token for token in tokens if token not in punctuation]
def text_to_num(all_data: Dict, sep: List) -> List:
for index in range(0, len(sep)):
sep[index] = all_data[sep[index]]
return sep
def summary(sep1: List, sep2: List) -> Dict:
all_data = {}
data = list(set(sep1 + sep2))
for index in range(0, len(data)):
all_data[data[index]] = index
return all_data
def to_feq(all_data: Dict, sep: List) -> List:
temp = [0 for i in range(0, len(all_data))]
for num in sep:
temp[num] += 1
return temp
def compute_cosine_similarity(params1: List, params2: List) -> float:
# 计算余弦相似度
denominator_l = 0
denominator_r = 0
molecular = 0
for index in range(0, len(params1)):
molecular += (params1[index] * params2[index])
denominator_l += params1[index] ** 2
denominator_r += params2[index] ** 2
return molecular / (math.sqrt(denominator_l) * math.sqrt(denominator_r))
def run():
    """End-to-end demo: segment two sentences, vectorize them, and print
    each intermediate stage plus the final cosine similarity."""
    text1 = "飓风跳跃星球"
    text2 = "飓风跳过星球"

    # Stage 1: word segmentation.
    words1 = separate(text1)
    print('seg_list1分词: ' + '/'.join(words1))
    words2 = separate(text2)
    print('seg_list2分词: ' + '/'.join(words2))

    # Stage 2: joint vocabulary over both sentences.
    vocabulary = summary(words1, words2)
    print(f'all_list: {vocabulary}')

    # Stage 3: words -> integer ids.
    ids1 = text_to_num(vocabulary, words1)
    print(f'seg_list1转为数字: {ids1}')
    ids2 = text_to_num(vocabulary, words2)
    print(f'seg_list2转为数字: {ids2}')

    # Stage 4: ids -> term-frequency vectors.
    vector1 = to_feq(vocabulary, ids1)
    print(f'seg_list1词频统计: {vector1}')
    vector2 = to_feq(vocabulary, ids2)
    print(f'seg_list2词频统计: {vector2}')

    # Stage 5: the similarity score itself.
    similarity = compute_cosine_similarity(vector1, vector2)
    print(f'余弦相似度为: {similarity}')
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    run()
# 参考博客:
# 使用余弦相似度算法计算文本相似度 - 一二三冲鸭的文章 - 知乎
# https://zhuanlan.zhihu.com/p/43396514