Add special tokens so that the tokenizer does not split them; they can then be used as markers.
import torch
from transformers import RobertaModel, RobertaConfig, RobertaTokenizer
# RoBERTa model
pretrained_weights = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
model = RobertaModel.from_pretrained(pretrained_weights)
special_tokens_dict = {'additional_special_tokens': ["<#>", "<$>"]}
# Before adding the special tokens
print(tokenizer.encode("<#> this <#>"))
print(tokenizer.encode(" the <#> this <#> a <$> body <$> end "))
print("#"*20)
# tokenizer.SPECIAL_TOKENS_ATTRIBUTES.append("jin_token")
print(tokenizer.SPECIAL_TOKENS_ATTRIBUTES)
# Add the special tokens so the tokenizer does not split them; they serve as markers
tokenizer.add_special_tokens(special_tokens_dict)
print(tokenizer.additional_special_tokens)
print(tokenizer.additional_special_tokens_ids)
print(tokenizer.encode("<#> this <#>"))
print(tokenizer.encode(" the <#> this <#> a <$> body <$> end "))
Output:
[41552, 10431, 15698, 42, 28696, 10431, 15698]
[0, 627, 28696, 10431, 15698, 42, 28696, 10431, 15698, 10, 28696, 1629, 15698, 809, 28696, 1629, 15698, 253, 2]
####################
['bos_token', 'eos_token', 'unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token', 'additional_special_tokens']
['<#>', '<$>']
[50265, 50266]
[50265, 9226, 50265]
[0, 627, 50265, 9226, 50265, 102, 50266, 9773, 50266, 1397, 2]
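To verify that the new markers really are kept whole, you can also look at the token strings rather than the ids. A minimal sketch, reusing the tokenizer from the snippet above (the exact sub-word pieces you would see before adding the tokens depend on the BPE vocabulary):
print(tokenizer.tokenize("<#> this <#>"))               # the markers now come back as single '<#>' tokens
print(tokenizer.convert_tokens_to_ids(["<#>", "<$>"]))  # should match additional_special_tokens_ids, i.e. [50265, 50266]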
After adding the special tokens, you may run into the following error:
Creating MTGP constants failed. at /pytorch/aten/src/THC/THCTensorRandom.cu:33
This happens because the model's embedding matrix was not resized after adding the tokens and before running the RoBERTa model.
Fix it with the following code:
roberta = RobertaModel.from_pretrained(pretrained_weights)
roberta.resize_token_embeddings(len(tokenizer))  # resize the embedding matrix to the new vocabulary size
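Putting both steps together, here is a minimal end-to-end sketch (it only reuses the token names and sample sentence from the snippet above); the forward pass at the end is a sanity check that ids 50265/50266 no longer fall outside the embedding matrix:
import torch
from transformers import RobertaModel, RobertaTokenizer

pretrained_weights = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
roberta = RobertaModel.from_pretrained(pretrained_weights)

tokenizer.add_special_tokens({'additional_special_tokens': ["<#>", "<$>"]})
roberta.resize_token_embeddings(len(tokenizer))  # resize BEFORE the first forward pass with the new ids

input_ids = torch.tensor([tokenizer.encode(" the <#> this <#> a <$> body <$> end ")])
with torch.no_grad():
    outputs = roberta(input_ids)  # would raise an index/CUDA error without the resize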
The example given in the transformers tokenizer documentation:
# Let's see how to add a new classification token to GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
special_tokens_dict = {'cls_token': '<CLS>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
assert tokenizer.cls_token == '<CLS>'
Link: https://huggingface.co/transformers/main_classes/tokenizer.html#