Computing Text Matrix Similarity

The sentence-transformers library

from sentence_transformers import SentenceTransformer, util

# Load a pretrained sentence-embedding model (384-dimensional vectors)
model = SentenceTransformer('all-MiniLM-L6-v2')

# train_used / test_used / train_unused / test_unused start out as lists of sentences;
# encode() replaces each list with an embedding tensor of shape (n_sentences, 384)
train_used = model.encode(train_used, convert_to_tensor=True)
test_used = model.encode(test_used, convert_to_tensor=True)
train_unused = model.encode(train_unused, convert_to_tensor=True)
test_unused = model.encode(test_unused, convert_to_tensor=True)

# For each test sentence, keep its highest cosine similarity to any "unused" training sentence
similarities_used2unused = util.cos_sim(test_used, train_unused).max(dim=1).values
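
The same similarity matrices can be thresholded to split the test sentences into "used" and "unused" groups, mirroring the spaCy-based filtering further below. A minimal sketch, assuming test_sentences still holds the raw test strings and using illustrative thresholds:

import torch

# Highest similarity of each test sentence to any "used" training sentence
similarities_used2used = util.cos_sim(test_used, train_used).max(dim=1).values

# Illustrative thresholds: keep sentences close to the "used" set and far from the "unused" set
keep_mask = torch.logical_and(similarities_used2used > 0.80,
                              similarities_used2unused < 0.90)
kept_sentences = [s for s, keep in zip(test_sentences, keep_mask.tolist()) if keep]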

spaCy

import spacy
import torch
import torch.nn.functional as F
import numpy as np

# Load the large Chinese pipeline, which ships with word vectors
nlp = spacy.load('zh_core_web_lg')

def sentence_embed(input_list):
    # Run the pipeline over the sentence list and stack the document vectors into one tensor
    docs = list(nlp.pipe(input_list, n_process=1))
    sentence_vector = torch.tensor(np.array([x.vector for x in docs]))
    return sentence_vector

def cos_sim(test, feature):
    # L2-normalize each row (dim=1), i.e. each sentence vector,
    # so the matrix product below yields pairwise cosine similarities
    test = F.normalize(test, dim=1)
    feature = F.normalize(feature, dim=1)
    out = torch.mm(test, feature.T)
    return out
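
The "used" and "unused" training sets also need to be embedded with this same function before the filtering step, so that cos_sim compares vectors of the same dimensionality. A minimal sketch, where train_used_sentences and train_unused_sentences are hypothetical names for the raw training lists:

# Hypothetical raw training lists; each row of the result is a 300-dim zh_core_web_lg doc vector
train_used = sentence_embed(train_used_sentences)
train_unused = sentence_embed(train_unused_sentences)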

# Process each file in the file list

# Embed the first 500 sentences of the current file and compare them against both training sets
test_embeded = sentence_embed(content["sentence_list"][:500])
unused_val = cos_sim(test_embeded, train_unused).max(dim=1).values
used_val = cos_sim(test_embeded, train_used).max(dim=1).values

# Keep a sentence only if it is close enough to the "used" set and far enough from the "unused" set;
# 0.9408 and 0.8428 are dataset-specific cut-offs
use = torch.logical_and(unused_val < 0.9408, used_val > 0.8428)
unuse = torch.logical_not(use)

# Split the original sentences according to the boolean masks
use_sen = list(np.array(content["sentence_list"][:500])[use.numpy()])
unuse_sen = list(np.array(content["sentence_list"][:500])[unuse.numpy()])
content["use_sen"] = use_sen
content["unuse_sen"] = unuse_sen
