# Load a tokenizer from the checkpoint at config.pretrained_model_path;
# AutoTokenizer selects the tokenizer class that matches that checkpoint.
auto_tokenizer = transformers.AutoTokenizer.from_pretrained(config.pretrained_model_path)
input_ids: list[int],
token_type_ids: list[int] if return_token_type_ids is True (default)
attention_mask: list[int] if return_attention_mask is True (default)
overflowing_tokens: list[int] if the tokenizer is a slow tokenizer, else a List[List[int]], if a ``max_length`` is specified and ``return_overflowing_tokens=True``
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and ``return_special_tokens_mask`` is True
# Model-specific tokenizer classes, such as BertTokenizer, are available as well.
from transformers import AutoTokenizer

# Instantiate a tokenizer from the base BERT checkpoint (12 layers, case-sensitive).
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]

# Encode the whole batch at once: pad to a common length, truncate inputs
# that are too long, and return PyTorch tensors.
batch = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
import transformers
import config
# --------------------------------------- Using BertTokenizer ---------------------------------------
# Load the concrete BertTokenizer class from the configured checkpoint path.
origin_tokenizer = transformers.BertTokenizer.from_pretrained(
    config.pretrained_model_path
)

# Encode a single sentence, truncated to at most 13 ids, as PyTorch tensors.
origin_result01 = origin_tokenizer(
    '对比原始的分词和最新的分词器',
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("origin_result01 = ", origin_result01)
print("-" * 100)

# Encode a second, shorter sentence with the same settings.
origin_result02 = origin_tokenizer(
    '展示不同的分词效果',
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("origin_result02 = ", origin_result02)
print("-" * 100)

# Encode both sentences as one batch so that padding becomes visible.
origin_result03 = origin_tokenizer(
    ['对比原始的分词和最新的分词器', '展示不同的分词效果'],
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("origin_result03 = ", origin_result03)
print("-" * 200)

# Two-step path: split into tokens first, then map the tokens to vocabulary ids.
# This path does not go through the padding/truncation options used above.
origin_tokens04 = origin_tokenizer.tokenize('对比原始的分词和最新的分词器')
origin_result04 = origin_tokenizer.convert_tokens_to_ids(origin_tokens04)
origin_tokens05 = origin_tokenizer.tokenize('展示不同的分词效果')
origin_result05 = origin_tokenizer.convert_tokens_to_ids(origin_tokens05)
print("origin_result04 = ", origin_result04)
print("-" * 100)
print("origin_result05 = ", origin_result05)

# Visual divider before the AutoTokenizer half of the comparison.
print('\n', '*' * 400, '\n')
# --------------------------------------- Using AutoTokenizer ---------------------------------------
# Let AutoTokenizer pick the tokenizer class for the configured checkpoint path.
auto_tokenizer = transformers.AutoTokenizer.from_pretrained(
    config.pretrained_model_path
)

# Encode a single sentence, truncated to at most 13 ids, as PyTorch tensors.
auto_result01 = auto_tokenizer(
    '对比原始的分词和最新的分词器',
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("auto_result01 = ", auto_result01)
print("-" * 100)

# Encode a second, shorter sentence with the same settings.
auto_result02 = auto_tokenizer(
    '展示不同的分词效果',
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("auto_result02 = ", auto_result02)
print("-" * 100)

# Encode both sentences as one batch so that padding becomes visible.
auto_result03 = auto_tokenizer(
    ['对比原始的分词和最新的分词器', '展示不同的分词效果'],
    padding=True,
    truncation=True,
    max_length=13,
    return_tensors='pt',
)
print("auto_result03 = ", auto_result03)
print("-" * 200)

# Two-step path: split into tokens first, then map the tokens to vocabulary ids.
# This path does not go through the padding/truncation options used above.
auto_tokens04 = auto_tokenizer.tokenize('对比原始的分词和最新的分词器')
auto_result04 = auto_tokenizer.convert_tokens_to_ids(auto_tokens04)
auto_tokens05 = auto_tokenizer.tokenize('展示不同的分词效果')
auto_result05 = auto_tokenizer.convert_tokens_to_ids(auto_tokens05)
print("auto_result04 = ", auto_result04)
print("-" * 100)
print("auto_result05 = ", auto_result05)
print("-" * 400)
C:\Program_Files_AI\Anaconda3531\python.exe C:/Users/Admin/OneDrive/WorkSpace_AI/0-基于知识库的智能问答系统-华控智加/01-意图识别/models/
origin_result01 = {'input_ids': tensor([[ 101, 2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638,
102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
origin_result02 = {'input_ids': tensor([[ 101, 2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
origin_result03 = {'input_ids': tensor([[ 101, 2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638,
102],
[ 101, 2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362, 102, 0,
0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}
origin_result04 = [2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638, 1146, 6404, 1690]
origin_result05 = [2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362]
Ignored unknown kwarg option direction
auto_result01 = {'input_ids': tensor([[ 101, 2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638,
102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Ignored unknown kwarg option direction
auto_result02 = {'input_ids': tensor([[ 101, 2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Ignored unknown kwarg option direction
auto_result03 = {'input_ids': tensor([[ 101, 2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638,
102],
[ 101, 2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362, 102, 0,
0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}
auto_result04 = [2190, 3683, 1333, 1993, 4638, 1146, 6404, 1469, 3297, 3173, 4638, 1146, 6404, 1690]
auto_result05 = [2245, 4850, 679, 1398, 4638, 1146, 6404, 3126, 3362]
Process finished with exit code 0
【Huggingface Transformers】保姆级使用教程—上
【Huggingface Transformers】保姆级使用教程—上