Model used: uer/sbert-base-chinese-nli · Hugging Face
sentence_transformers official docs: SentenceTransformers Documentation — Sentence-Transformers documentation
The code below fine-tunes a sentence_transformers model on your own training data (the model computes sentence similarity):
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, evaluation, losses, util

class SBert:
    def build_train_data(self, o1, o2, n1, n2, train_size):
        # Positive pairs are labeled 1.0, negative pairs 0.0.
        train_data = []
        for i in range(train_size):
            train_data.append(InputExample(texts=[o1[i], o2[i]], label=1.0))
            train_data.append(InputExample(texts=[n1[i], n2[i]], label=0.0))
        return train_data

    def build_evaluation_data(self, o1, o2, n1, n2, train_size, eval_size):
        # Held-out pairs: positives scored 1.0, negatives 0.0.
        s1 = list(o1[train_size:])
        s2 = list(o2[train_size:])
        s1.extend(list(n1[train_size:]))
        s2.extend(list(n2[train_size:]))
        score = [1.0] * eval_size + [0.0] * eval_size
        evaluator = evaluation.EmbeddingSimilarityEvaluator(s1, s2, score)
        return evaluator

    def callback(self, score, epoch, steps):
        print('score:{}, epoch:{}, steps:{}'.format(score, epoch, steps))

    def train(self):
        # 1. Get positive and negative samples; o1 holds the canonical
        #    questions, o2 their paraphrases (get_act_data / get_neg_data
        #    are your own data loaders).
        o1, o2 = self.get_act_data()
        n1, n2 = self.get_neg_data()
        # 2. Define train/eval split sizes and build the training data.
        train_size = int(len(o1) * 0.8)
        eval_size = len(o1) - train_size
        train_data = self.build_train_data(o1, o2, n1, n2, train_size)
        # 3. Build the evaluation data.
        evaluator = self.build_evaluation_data(o1, o2, n1, n2, train_size, eval_size)
        # 4. The model to fine-tune, e.g. 'uer/sbert-base-chinese-nli'.
        model = SentenceTransformer('path/to/base/model')
        # 5. Wrap the examples in a dataset/dataloader and choose the loss.
        train_dataset = SentencesDataset(train_data, model)
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
        train_loss = losses.CosineSimilarityLoss(model)
        # 6. Fine-tune the model.
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=1,
                  warmup_steps=100,
                  evaluator=evaluator,
                  evaluation_steps=100,
                  output_path='path/to/save/finetuned/model',
                  save_best_model=True,
                  callback=self.callback)
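The post never shows get_act_data / get_neg_data, so here is a hypothetical sketch of a runnable entry point; the subclass name, the toy sentences, and the idea of overriding the two loaders are all assumptions added for illustration, and the base-model path inside train() still has to point at a real checkpoint such as uer/sbert-base-chinese-nli.

# Hypothetical sketch: get_act_data / get_neg_data are not defined in the
# original post, so this subclass fakes them with toy pairs just to show
# the expected return shapes (two parallel lists of sentences).
class SBertWithToyData(SBert):
    def get_act_data(self):
        # Positive pairs: (canonical question, paraphrase).
        o1 = ['how do I reset my password', 'where can I see my orders']
        o2 = ['how to change my password', 'show me my order history']
        return o1, o2

    def get_neg_data(self):
        # Negative pairs: canonical question vs. an unrelated sentence.
        n1 = ['how do I reset my password', 'where can I see my orders']
        n2 = ['what is the weather today', 'recommend a good movie']
        return n1, n2

SBertWithToyData().train()

CosineSimilarityLoss fits the cosine of the two sentence embeddings to the pair's label, which is why positive pairs carry 1.0 and negative pairs 0.0 above.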
Using the fine-tuned model with sentence_transformers looks like this:
#1. Load the fine-tuned model.
model = SentenceTransformer('path/to/finetuned/model')
#2. Encode the sentences into vectors (encode expects a list of strings).
o1_emb = model.encode(['the input has to be a list', 'please let me have good luck'])
o2_emb = model.encode(['it must be a list', 'I will definitely have good luck'])
# Compute cosine similarity: cos_sim returns the full matrix of scores between
# every sentence in o1 and every sentence in o2, so the scores for the aligned
# pairs sit on the diagonal.
cosine_score0 = util.cos_sim(o1_emb, o2_emb)
cosine_score = []
for i in range(len(cosine_score0)):
    cosine_score.append(cosine_score0[i][i].numpy().tolist())
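If only the aligned pair scores are needed, the loop above can be collapsed by taking the diagonal of the similarity matrix directly; a minimal sketch, reusing o1_emb and o2_emb from the snippet above:

import torch

# Equivalent to the loop above: pull the diagonal of the score matrix.
pair_scores = torch.diagonal(util.cos_sim(o1_emb, o2_emb)).tolist()
print(pair_scores)  # one cosine score per aligned sentence pair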
Adding layers to the model
from sentence_transformers import SentenceTransformer, models
from torch import nn
# Transformer backbone that produces token embeddings.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
# Pooling layer that aggregates token embeddings into one sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# Extra dense layer on top, projecting the sentence vector down to 256 dims.
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])
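A quick way to confirm the extra Dense layer is active: the stacked model should now report and emit 256-dimensional sentence embeddings. A minimal sketch:

# Sanity check: the Dense head determines the final embedding size.
print(model.get_sentence_embedding_dimension())  # 256
emb = model.encode(['a quick test sentence'])
print(emb.shape)  # (1, 256)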