通过实验,使学生掌握向量相似度的基本实现方法。
向量空间模型是信息检索中最重要的形式化模型之一,向量相似度是对向量空间模型评分的重要依据。本实验需要编程实现向量相似度的基本算法。P78
输入:查询(如“best car insurance”)、文档(如“car insurance auto insurance”)、文档总数N的值(如1000000),以及文档中每个词的文档频率df
输出:向量相似度的值
编程语言:Python
# 使用nnc.ltn方法
import math
import re
def Vector_similarity(query=None, doc=None, terms=None, df=None, N=None):
    """Score a document against a query with the SMART ``nnc.ltn`` scheme.

    Document weights use the natural term frequency with cosine
    normalisation (``nnc``).  Query weights use the log-scaled term
    frequency ``1 + log10(tf)`` multiplied by ``idf = log10(N / df)`` with no
    normalisation (``ltn``).  The score is the inner product of the two
    weight vectors over ``terms``.  Intermediate values are printed.

    Every argument is optional; anything left as ``None`` is read
    interactively from standard input, so calling the function with no
    arguments keeps the prompt-driven workflow.

    Args:
        query: Query text, e.g. ``"best car insurance"``.
        doc: Document text, e.g. ``"car insurance auto insurance"``.
        terms: Terms that take part in the computation.  When ``None``,
            both the terms and their document frequencies are prompted for.
        df: Document frequency of each term, aligned with ``terms``.
        N: Total number of documents in the collection.

    Returns:
        The similarity score as a float.

    Raises:
        ValueError: If ``terms`` and ``df`` differ in length, or if ``N`` or
            any document frequency is not positive.
    """
    if query is None:
        query = input("请输入查询: ")
    if doc is None:
        doc = input("请输入文档: ")

    if terms is None:
        # Read (term, df) pairs until the user submits an empty term.
        terms, df = [], []
        while True:
            term = input("请输入词项并按下回车,结束直接按下回车")
            if term == '':
                break
            terms.append(term)
            df.append(int(input("请输入{}的df:".format(term))))
    elif df is None:
        df = [int(input("请输入{}的df:".format(term))) for term in terms]

    if N is None:
        N = int(input("输入文档总频率: "))

    if len(terms) != len(df):
        raise ValueError("terms and df must have the same length")
    # idf = log10(N / df) is undefined for non-positive N or df.
    if N <= 0 or any(per_df <= 0 for per_df in df):
        raise ValueError("N and every df must be positive")

    # Tokens are runs of ASCII letters, digits and hyphens.  The pattern
    # deliberately has no '|': inside a character class it is a literal
    # character, which would glue "a|b" into a single token.
    token_pattern = r'[a-zA-Z0-9-]+'
    query_tokens = re.findall(token_pattern, query)
    doc_tokens = re.findall(token_pattern, doc)
    print("查询的所有单词项为: ", query_tokens)
    print("文档的所有单词项为: ", doc_tokens)

    # Raw term frequencies of the selected terms in the query and document.
    query_tf = {term: query_tokens.count(term) for term in terms}
    doc_tf = {term: doc_tokens.count(term) for term in terms}
    print("查询中tf为:", query_tf)
    print("文档中tf为:", doc_tf)

    # log10 is more accurate than log(x, 10) (e.g. exactly 3.0 for 1000).
    idf = [math.log10(N / per_df) for per_df in df]
    print("查询中idf为:", idf)

    # Euclidean length of the document tf vector (cosine normalisation).
    doc_norm = math.sqrt(sum(tf ** 2 for tf in doc_tf.values()))
    print("pow2(即平方和开根号)值为: ", doc_norm)

    W_tq = {}
    W_td = {}
    inner_product = 0.0
    for term, term_idf in zip(terms, idf):
        tf_q = query_tf[term]
        # "l" component of ltn: log-scaled tf, zero for absent terms.
        wf_q = (1 + math.log10(tf_q)) if tf_q > 0 else 0.0
        W_tq[term] = wf_q * term_idf
        # A document containing none of the terms has zero length; every
        # weight is then 0 rather than a division by zero.
        W_td[term] = doc_tf[term] / doc_norm if doc_norm else 0.0
        inner_product += W_tq[term] * W_td[term]

    print("查询的W_tq=wf(查询)*idf值为: ", W_tq)
    print("查询的W_td=wf(文档)/pow2(即平方和开根号)值为: ", W_td)
    print("内积和最终结果为:\n", inner_product)
    return inner_product
# Run the interactive similarity calculation only when executed as a script.
if __name__ == "__main__":
    Vector_similarity()