# tokenize the source text to a fixed length (padded / truncated to source_len)
source = self.tokenizer.batch_encode_plus(
    [source_text],
    max_length=self.source_len,
    truncation=True,
    padding="max_length",        # replaces the deprecated pad_to_max_length=True
    return_tensors="pt",
)
source_ids = source["input_ids"].squeeze()          # squeeze() drops the batch dimension of size 1
source_mask = source["attention_mask"].squeeze()
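This encoding typically lives in a Dataset's __getitem__, together with a matching target-side encoding. A minimal sketch, where target_text and self.target_len are assumed names that do not appear in the original:

# target-side encoding, mirroring the source-side call above (assumed names)
target = self.tokenizer.batch_encode_plus(
    [target_text],
    max_length=self.target_len,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
return {
    "source_ids": source_ids,
    "source_mask": source_mask,
    "target_ids": target["input_ids"].squeeze(),
}

These keys match the data["source_ids"], data["source_mask"], and data["target_ids"] accesses in the training step below.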
# decode each generated id sequence g (e.g. from model.generate) back to text
preds = [
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for g in generated_ids
]
ids = data["source_ids"].to(device, dtype=torch.long)
mask = data["source_mask"].to(device, dtype=torch.long)
y = data["target_ids"].to(device, dtype=torch.long)
y_ids = y[:, :-1].contiguous()           # decoder inputs: target sequence without its last token
lm_labels = y[:, 1:].clone().detach()    # labels: target sequence shifted left by one (next-token prediction)
lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100   # -100 makes the loss ignore padding positions
outputs = model(
    input_ids=ids,
    attention_mask=mask,
    decoder_input_ids=y_ids,
    labels=lm_labels,  # lm_labels [B, T]
)
loss_sentence = outputs[0]   # mean cross-entropy loss over the non-ignored tokens
lm_logits = outputs[1]       # [B=32, T=255, V=32128]
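The loss returned in outputs[0] is just token-level cross-entropy over lm_logits, with the positions labelled -100 excluded. A quick sketch to sanity-check that equivalence, using the variables above:

import torch.nn.functional as F

# recompute the sequence loss from the logits; label value -100 marks padding
# positions that are excluded from the average
manual_loss = F.cross_entropy(
    lm_logits.view(-1, lm_logits.size(-1)),  # [B*T, V]
    lm_labels.view(-1),                      # [B*T]
    ignore_index=-100,
)
# manual_loss should match loss_sentence up to floating-point tolerance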
# beam-search generation for evaluation
generated_ids = model_to_eval.generate(
    input_ids=ids,
    attention_mask=mask,
    max_length=max_length,
    num_beams=2,               # beam search with 2 beams
    repetition_penalty=2.5,    # discourage repeated tokens
    length_penalty=1.0,        # standard length normalisation
    early_stopping=True,       # stop once num_beams finished hypotheses exist
)
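At validation time this call is normally wrapped in eval mode and torch.no_grad(), and the reference ids y are decoded the same way as the predictions. A minimal sketch following the variable names above:

model_to_eval.eval()        # disable dropout for generation
with torch.no_grad():       # no gradients are needed at inference time
    generated_ids = model_to_eval.generate(
        input_ids=ids, attention_mask=mask,
        max_length=max_length, num_beams=2,
        repetition_penalty=2.5, length_penalty=1.0, early_stopping=True,
    )
# decode the reference targets the same way as the predictions
targets = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
           for t in y]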
# sampling-based generation; return_dict_in_generate=True makes the output a
# ModelOutput whose `sequences` field holds the generated ids
out = model.generate(
    **encoding,
    return_dict_in_generate=True,
    output_scores=False,
    max_length=max_length,
    do_sample=True,            # sample instead of pure greedy/beam decoding
    top_p=top_p,               # nucleus-sampling threshold
    top_k=50,                  # keep only the 50 most likely tokens
    num_beams=num_beams,
    length_penalty=length_penalty,
    early_stopping=early_stopping,
    repetition_penalty=repetition_penalty,
)
out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
return out_text[0]
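The encoding expanded with ** above is simply the tokenizer output for the input text; a minimal sketch (how the input string is prepared is an assumption, not shown in the original):

# `encoding` carries input_ids and attention_mask; ** unpacks them into generate()
encoding = tokenizer(source_text, return_tensors="pt").to(device)

Because do_sample=True, repeated calls with the same encoding can return different texts.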
top_k: how many of the highest-probability tokens top-k filtering keeps as candidates; default 50.
top_p: the probabilities of all candidate tokens sum to 1 (the default top_p is 1.0). If top_p < 1, tokens are accumulated from highest to lowest probability until the cumulative probability reaches top_p, and only those top-N tokens are kept as candidates (nucleus sampling).
repetition_penalty: penalty applied to repeated tokens; default 1.0 (no penalty).
length_penalty: length penalty used with beam search; default 1.0 (see the sketch below this list).
length_penalty=1.0: the beam-search score is normalised by the generated sequence length.
length_penalty=0.0: no length penalty.
length_penalty>0.0: encourages the model to generate longer sequences.
length_penalty<0.0: encourages the model to generate shorter sequences.
early_stopping: whether to stop beam search as soon as at least num_beams finished candidates exist; default False.
For detailed parameter documentation see: https://www.yii666.com/blog/378788.html
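To make the length_penalty behaviour concrete, here is a small illustrative sketch of the rescoring beam search applies to finished hypotheses (the log-probabilities are made-up numbers; the score divides the summed log-probability by length ** length_penalty):

# two made-up finished beam hypotheses: (summed log-probability, length)
short_logprob, short_len = -4.0, 5
long_logprob, long_len = -6.0, 12

def beam_score(logprob, length, length_penalty):
    # beam search ranks hypotheses by summed log-prob divided by length ** length_penalty
    return logprob / (length ** length_penalty)

for lp in (0.0, 1.0, 2.0):
    print(lp, beam_score(short_logprob, short_len, lp),
              beam_score(long_logprob, long_len, lp))
# length_penalty=0.0: -4.0  vs -6.0    -> the shorter hypothesis wins (no length normalisation)
# length_penalty=1.0: -0.8  vs -0.5    -> the longer hypothesis wins
# length_penalty=2.0: -0.16 vs ~-0.042 -> longer sequences are favoured even more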
Appendix:
T5 source-code usage: https://zhuanlan.zhihu.com/p/455216504