它们两者都在做搜索、匹配、找相关性的时候会用到。
torch.topk(input, k, dim=None, largest=True, sorted=True, *, out=None)
paddle.topk(x, k, axis=None, largest=True, sorted=True, name=None)
返回 Tensor,支持的数据类型为 float32、float64、int32、int64;axis 默认值为 -1。
import torch  # paddle 适用时须调整参数名
# Demo: torch.topk on 1-D and 2-D tensors.
# Paddle equivalent: paddle.to_tensor / paddle.topk (rename dim -> axis).

# 1-D tensor: top-1 value and its index.
data_1 = torch.tensor([1, 4, 5, 7])  # paddle.to_tensor([1, 4, 5, 7])
value_1, indices_1 = torch.topk(data_1, k=1)  # paddle.topk(data_1, k=1)
print("value_1:", value_1)      # [7]
print("indices_1:", indices_1)  # [3]

# 2-D tensor: by default topk runs along the last dimension.
data_2 = torch.tensor([[1, 4, 5, 7], [2, 6, 2, 5]])
value_2, indices_2 = torch.topk(data_2, k=1)
print("value_2:", value_2)      # [[7], [6]]
print("indices_2:", indices_2)  # [[3], [1]]

# NOTE: torch.topk's keyword is `dim` (paddle uses `axis`); the original
# code passed axis=, which does not match torch.topk's signature.
value_3, indices_3 = torch.topk(data_2, k=1, dim=-1)  # along the last dim
print("value_3:", value_3)      # [[7], [6]]
print("indices_3:", indices_3)  # [[3], [1]]

value_4, indices_4 = torch.topk(data_2, k=1, dim=0)  # along dim 0
print("value_4:", value_4)      # [[2, 6, 5, 7]]  max of every column
print("indices_4:", indices_4)  # [[1, 1, 0, 0]]  row index of each column max
numpy.argpartition(a, kth, axis=-1, kind='introselect', order=None)
import numpy as np

# --- smallest k ---
data = np.array([232, 564, 278, 3, 2, 1, -1, -10, -30, -40])
# kth=4: the 4 smallest values land in the first 4 slots
# (order within each side is unspecified); default axis=-1.
out_index = np.argpartition(data, kth=4)
print(out_index)             # e.g. [7 8 9 6 5 4 1 3 2 0]
print(data[out_index[:4]])   # the 4 smallest values
# Array rearranged by out_index: partitioned, NOT fully sorted.
data2 = np.take_along_axis(data, out_index, axis=-1)
print(data2)

# --- largest k ---
# kth=-4: the 4 largest values land in the last 4 slots.
out_index = np.argpartition(data, kth=-4, axis=-1)
print(out_index)             # e.g. [5 9 8 7 6 4 3 0 2 1]
print(data[out_index[-4:]])  # the 4 LARGEST values (original comment said smallest)
data2 = np.take_along_axis(data, out_index, axis=-1)
print(data2)
有 425 种疾病,有 13426 个症状。疾病和症状的权重 matrix,矩阵的 shape 为 425 x 13426。
需要解决的问题:有一个未知疾病,需要查找症状最相近的 5 种疾病,并显示出来。
ps:好处是可以一次性将所有的 topk 找出来,大大地提高了计算效率。
numpy : argpartition
import numpy as np
def get_similar_tokens(emb_dict, query_embed, k):
    """Print and return the indices of the ``k`` rows of ``emb_dict`` most
    cosine-similar to ``query_embed``.

    Args:
        emb_dict: 2-D array of shape (num_items, dim), one embedding per row.
        query_embed: 1-D array of shape (dim,).
        k: number of top matches to report (must be <= num_items).

    Returns:
        list[int]: row indices ordered by descending cosine similarity.
    """
    M = emb_dict
    X = query_embed
    # Cosine similarity of every row of M against X in one vectorized pass.
    cos = np.tensordot(M, X, axes=1) / (np.linalg.norm(M, axis=1) * np.linalg.norm(X))
    # argpartition is O(n); only the k winners are then fully sorted.
    indices = np.argpartition(cos, -k)[-k:]
    indices = indices[np.argsort(-cos[indices])]
    for i in indices:
        print("疾病index: %s , 相似度: %.3f " % (i, cos[i]))
    # Return the indices so callers can use the result programmatically
    # (previously the function only printed and returned None).
    return [int(i) for i in indices]
if __name__ == '__main__':
    # Fixed seed so the demo output is reproducible.
    np.random.seed(0)
    # Disease-vs-symptom weight matrix: 425 diseases x 13426 symptoms.
    weights = np.random.rand(425, 13426)
    # Symptom vector of the unknown disease to look up.
    query = np.random.rand(13426)
    get_similar_tokens(weights, query, k=5)
torch / paddle : topk
import torch
def get_similar(emb_dict, query_embed, k):
    """Print and return the indices of the ``k`` rows of ``emb_dict`` most
    cosine-similar to ``query_embed`` (torch version).

    Args:
        emb_dict: 2-D tensor of shape (num_items, dim), one embedding per row.
        query_embed: 1-D tensor of shape (dim,).
        k: number of top matches to report (must be <= num_items).

    Returns:
        list[int]: row indices ordered by descending cosine similarity.
    """
    M = emb_dict
    X = query_embed
    # Cosine similarity of every row of M against X.
    cos = torch.tensordot(M, X, dims=1) / (torch.linalg.norm(M, dim=1) * torch.linalg.norm(X))
    # torch.topk already returns results sorted in descending order.
    _, topk = torch.topk(cos, k)
    topk = topk.numpy()
    for i in topk:
        print("疾病index: %s , 相似度: %.3f " % (i, cos[i]))
    # Return the indices so callers can use the result programmatically
    # (consistent with the numpy version; previously returned None).
    return [int(i) for i in topk]
if __name__ == '__main__':
    # Random demo data: 425 diseases x 13426 symptoms, plus one query vector.
    weight_matrix = torch.rand(size=(425, 13426))
    query_vec = torch.rand(size=(13426,))
    get_similar(weight_matrix, query_vec, k=5)
####################################################
import paddle
def get_similar(emb_dict, query_embed, k):
    """Print and return the indices of the ``k`` rows of ``emb_dict`` most
    cosine-similar to ``query_embed`` (paddle version).

    Args:
        emb_dict: 2-D tensor of shape (num_items, dim), one embedding per row.
        query_embed: 1-D tensor of shape (dim,).
        k: number of top matches to report (must be <= num_items).

    Returns:
        list[int]: row indices ordered by descending cosine similarity.
    """
    M = emb_dict
    X = query_embed
    # Cosine similarity of every row of M against X.
    cos = paddle.tensordot(M, X, axes=1) / (paddle.linalg.norm(M, axis=1) * paddle.linalg.norm(X))
    # paddle.topk already returns results sorted in descending order.
    _, topk = paddle.topk(cos, k)
    topk = topk.numpy()
    for i in topk:
        print("疾病index: %s , 相似度: %.3f " % (i, cos[i]))
    # Return the indices so callers can use the result programmatically
    # (consistent with the numpy/torch versions; previously returned None).
    return [int(i) for i in topk]
if __name__ == '__main__':
    # Random demo data: 425 diseases x 13426 symptoms, plus one query vector.
    weight_matrix = paddle.rand(shape=(425, 13426))
    query_vec = paddle.rand(shape=(13426,))
    get_similar(weight_matrix, query_vec, k=5)