transformers PreTrainedTokenizer类




1、__call__函数

        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,



# text为一个string
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.")
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# text为一个列表
>>> tokenizer(text=["The sailors rode the breeze clear of the rocks."])
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# text为已经分好词的输入(每个句子是一个由string组成的列表,多个句子再组成外层列表),此时需设置参数is_split_into_words=True
>>> tokenizer(text=[["The", "sailors", "rode", "the", "breeze", "clear", "of", "the", "rocks"]], is_split_into_words=True)
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# text_pair和text的格式应一样,text为列表则text_pair也应为列表
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",
                      text_pair="I demand that the more John eat, the more he pays.")
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 1996, 2062, 2002, 12778, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# add_special_tokens=False,不添加特殊标记
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",add_special_tokens=False)
{'input_ids': [1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# 默认为True,添加特殊标记
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.")
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

>>> encodings = tokenizer(text=["The sailors rode the breeze clear of the rocks."],add_special_tokens=True)
>>> tokenizer.batch_decode(encodings["input_ids"])
['[CLS] the sailors rode the breeze clear of the rocks. [SEP]']



# padding为True,则进行填充,每个句子都会被填充到该批次中最长句子的长度
>>> encodings = tokenizer(text=["The sailors rode the breeze clear of the rocks.","I demand that the more John eat, the more he pays."],padding=True)
>>> encodings 
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102, 0, 0, 0], [101, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 1996, 2062, 2002, 12778, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
>>> tokenizer.batch_decode(encodings["input_ids"])
['[CLS] the sailors rode the breeze clear of the rocks. [SEP] [PAD] [PAD] [PAD]', '[CLS] i demand that the more john eat, the more he pays. [SEP]']

# truncation为True,设置截断。未指定max_length时,默认截断到模型可接受的最大输入长度(tokenizer.model_max_length),
# 示例中的句子都没有超过该长度,故不会截断任何句子。此时,可以通过设置max_length的值来指定截断长度。
>>> encodings = tokenizer(text=["The sailors rode the breeze clear of the rocks.","I demand that the more John eat, the more he pays."],truncation=True)
>>> encodings
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], [101, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 1996, 2062, 2002, 12778, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# truncation为True,同时设置max_length=10。
>>> encodings = tokenizer(text=["The sailors rode the breeze clear of the rocks.","I demand that the more John eat, the more he pays."],truncation=True, max_length=10)
>>> encodings
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 102], [101, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


>>> encodings = tokenizer(text=["The sailors rode the breeze clear of the rocks.","I demand that the more John eat, the more he pays."],truncation=True, max_length=10, stride=2, return_overflowing_tokens=True)
>>> encodings
# 此处overflowing_tokens中的[1997,1996]就是重叠的标记,个数为2,与参数stride=2设置的一致
{'overflowing_tokens': [[1997, 1996, 5749, 1012], [4521, 1010, 1996, 2062, 2002, 12778, 1012]], 'num_truncated_tokens': [2, 5], 'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 102], [101, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


>>> encodings = tokenizer(text="The sailors rode the breeze clear of the rocks.",return_tensors="tf")
>>> type(encodings["input_ids"])
<class 'tensorflow.python.framework.ops.EagerTensor'>

>>> encodings = tokenizer(text="The sailors rode the breeze clear of the rocks.",return_tensors="np")
>>> type(encodings["input_ids"])
<class 'numpy.ndarray'>


# 只有一个分句,token_type_ids全为0
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",return_token_type_ids=True)
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# 有两个分句,第二个分句token_type_ids为1
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",text_pair="I demand that the more John eat, the more he pays.",return_token_type_ids=True)
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 1996, 2062, 2002, 12778, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# overflowing_tokens表示截断的单词
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",truncation=True,max_length=10,return_overflowing_tokens=True)
{'overflowing_tokens': [5749, 1012], 'num_truncated_tokens': 2, 'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# special_tokens_mask中,特殊标记字符[CLS]、[SEP]等所在的位置标记为1,其他位置为0
>>> tokenizer(text="The sailors rode the breeze clear of the rocks.",return_special_tokens_mask=True)
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# length表示各个句子的长度
>>> tokenizer(text=["The sailors rode the breeze clear of the rocks.", "I demand that the more John eat, the more he pays."],return_length=True)
{'input_ids': [[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], [101, 1045, 5157, 2008, 1996, 2062, 2198, 4521, 1010, 1996, 2062, 2002, 12778, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'length': [12, 15], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


>>> tokenizer("The sailors rode the breeze clear of the rocks.",return_offsets_mapping=True)
{'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (4, 11), (12, 16), (17, 20), (21, 27), (28, 33), (34, 36), (37, 40), (41, 46), (46, 47), (0, 0)]}



encode(
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
)


# text可以为一个str
>>> encoding = tokenizer.encode(text="The sailors rode the breeze clear of the rocks.")
>>> encoding
[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]
# text也可以为一个List[str],这个列表中的每一个str称为一个token
# 注意:此时每个token会直接到词表中查找id,不再进行小写化等预处理,所以"The"不在词表中,被映射为[UNK](id为100)
>>> encoding = tokenizer.encode(text=["The", "sailors", "rode", "the", "breeze", "clear", "of", "the", "rocks", "."])
>>> encoding
[101, 100, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]
# text还可以为一个List[int],每个int为token所对应的id
>>> encodings = tokenizer.encode(text=[100, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012])
>>> encodings
[101, 100, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]


decode(
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
)


>>> decodings = tokenizer.decode([101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102])
>>> decodings
'[CLS] the sailors rode the breeze clear of the rocks. [SEP]' # 标点符号与单词间没有空格
>>> decodings = tokenizer.decode([101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], clean_up_tokenization_spaces=False)
>>> decodings
'[CLS] the sailors rode the breeze clear of the rocks . [SEP]' # 标点符号与单词间有空格


batch_decode(
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
)


# sequences为一个List[List[int]]
>>> decodings = tokenizer.batch_decode([[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]])
>>> decodings
['[CLS] the sailors rode the breeze clear of the rocks. [SEP]']


convert_ids_to_tokens(ids: Union[int, List[int]], skip_special_tokens: bool = False)


>>> tokens = tokenizer.convert_ids_to_tokens([101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102])
>>> tokens
['[CLS]', 'the', 'sailors', 'rode', 'the', 'breeze', 'clear', 'of', 'the', 'rocks', '.', '[SEP]']


convert_tokens_to_ids(tokens: Union[str, List[str]])


>>> ids = tokenizer.convert_tokens_to_ids(['[CLS]', 'the', 'sailors', 'rode', 'the', 'breeze', 'clear', 'of', 'the', 'rocks', '.', '[SEP]'])
>>> ids
[101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102]


tokenize(text: TextInput, **kwargs)


>>> tokenizer.tokenize("The sailors rode the breeze clear of the rocks.")
['the', 'sailors', 'rode', 'the', 'breeze', 'clear', 'of', 'the', 'rocks', '.']
