Hugging Face is best known in the NLP field, and most of the models it provides are Transformer-based. For ease of use, Hugging Face also offers users several components, covered below: Pipeline, Tokenizer, Model, and Datasets.
The pipeline bundles data preprocessing (the tokenizer), the model call (the model), and result post-processing into a single workflow: pipeline(task, model=..., tokenizer=..., device=...), which is then called on the input data.
How it works: the most common usage is to build the model and the tokenizer separately, specify the task type, and pass them into pipeline, as in the sketch below.
(The detailed usage of each pipeline type can be found in the source code of the corresponding Pipeline class.)
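A minimal sketch of this pattern (the task name "text-classification" and the device value here are illustrative assumptions, not part of the original notes):
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
# Build the model and tokenizer separately, then hand them to pipeline together with
# an explicit task type and device (device=0 would select the first GPU, -1 means CPU).
model = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=-1)
pipe("弱小的我也有大梦想!")
# [{'label': ..., 'score': ...}]  (the exact label names depend on the checkpoint)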
The Tokenizer streamlines the previously tedious text-to-token process:
Step 1: Loading and saving
from transformers import AutoTokenizer
# Load from Hugging Face: pass the model name to get the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
"""
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
"""
# Save the tokenizer to a local folder
tokenizer.save_pretrained("./roberta_tokenizer")
''' Files written into the folder:
('./roberta_tokenizer\\tokenizer_config.json',
'./roberta_tokenizer\\special_tokens_map.json',
'./roberta_tokenizer\\vocab.txt',
'./roberta_tokenizer\\added_tokens.json',
'./roberta_tokenizer\\tokenizer.json')
'''
# Load the tokenizer from the local folder
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer")
"""
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
"""
Step 2: Tokenizing a sentence:
sen = "弱小的我也有大梦想!"
tokens = tokenizer.tokenize(sen)
# ['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
Step 3: Inspecting the vocabulary:
tokenizer.vocab
"""
{'湾': 3968,
'訴': 6260,
'##轶': 19824,
'洞': 3822,
' ̄': 8100,
'##劾': 14288,
'##care': 11014,
'asia': 8339,
'##嗑': 14679,
'##鹘': 20965,
'washington': 12262,
'##匕': 14321,
'##樟': 16619,
'癮': 4628,
'day3': 11649,
'##宵': 15213,
'##弧': 15536,
'##do': 8828,
'詭': 6279,
'3500': 9252,
'124': 9377,
'##価': 13957,
'##玄': 17428,
'##積': 18005,
'##肝': 18555,
...
'##维': 18392,
'與': 5645,
'##mark': 9882,
'偽': 984,
...}
"""
tokenizer.vocab_size
# 21128
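The special tokens and their ids can also be read straight off the tokenizer; a quick check (the values match the 101/[CLS], 102/[SEP] and 0/[PAD] ids that show up in the encodings below):
tokenizer.cls_token, tokenizer.cls_token_id
# ('[CLS]', 101)
tokenizer.sep_token_id, tokenizer.pad_token_id
# (102, 0)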
Step 4: Index conversion:
# Convert the token sequence into an id sequence
ids = tokenizer.convert_tokens_to_ids(tokens)
ids
# [2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]
# Convert the id sequence back into tokens
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens
# ['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
# Convert the token sequence back into a string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen
# '弱 小 的 我 也 有 大 梦 想!'
Summary: a more convenient way to do the conversions above:
# Convert the string into an id sequence, also known as encoding
ids = tokenizer.encode(sen, add_special_tokens=True)
ids
# [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]
# Convert the id sequence back into a string, also known as decoding
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen
# '[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'
Step 5: Padding and truncation:
# Padding
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
# [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
# Truncation
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids
# [101, 2483, 2207, 4638, 102]
Step 6: Other model inputs (attention_mask and token_type_ids):
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
# [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids
"""
([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
"""
tokenizer.encode_plus() and tokenizer() produce the same result:
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs
"""
{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}
"""
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs
"""
{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}
"""
sens = ["弱小的我也有大梦想",
"有梦想谁都了不起",
"追逐梦想的心,比梦想本身,更可贵"]
res = tokenizer(sens)
res
"""
{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]],
'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
"""
%%time
# Process sentences one at a time in a loop
for i in range(1000):
    tokenizer(sen)
# CPU times: total: 15.6 ms
# Wall time: 32.5 ms
%%time
# Process the data as one batch
sen_list = [sen] * 1000
res = tokenizer(sen_list)
# CPU times: total: 0 ns
# Wall time: 6 ms
Fast vs. slow tokenizer: the fast (Rust-backed) tokenizer additionally supports features such as return_offsets_mapping and word_ids().
sen = "弱小的我也有大Dreaming!"
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer
# BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs
# {'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}
inputs.word_ids()
# [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
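The offsets make it easy to map a token back to its span in the original string; for example, using the output above (index 8 is the "dream" piece of "Dreaming"):
start, end = inputs["offset_mapping"][8]
sen[start:end]
# 'Dream'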
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer
# BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
Online download (may run into HTTP connection timeouts):
from transformers import AutoConfig, AutoModel, AutoTokenizer
model = AutoModel.from_pretrained("hfl/rbt3", force_download=True)
Offline download: you may need a proxy to reach the site; download the files yourself into a local folder:
!git clone "https://huggingface.co/hfl/rbt3"
!git lfs clone "https://huggingface.co/hfl/rbt3" --include="*.bin"
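Alternatively (assuming the huggingface_hub package is installed), the same files can be fetched programmatically:
from huggingface_hub import snapshot_download
# Download the whole "hfl/rbt3" repository into the local cache and return its path.
local_path = snapshot_download("hfl/rbt3")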
Offline loading:
model = AutoModel.from_pretrained("path/to/local/folder")
Model configuration parameters:
model = AutoModel.from_pretrained("rbt3")
model.config
"""
BertConfig {
"_name_or_path": "rbt3",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"output_past": true,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
...
"transformers_version": "4.28.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
"""
config = AutoConfig.from_pretrained("./rbt3/")
config
"""
BertConfig {
"_name_or_path": "rbt3",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"output_past": true,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
...
"transformers_version": "4.28.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
"""
sen = "弱小的我也有大梦想!"
tokenizer = AutoTokenizer.from_pretrained("rbt3")
inputs = tokenizer(sen, return_tensors="pt")
inputs
"""
{'input_ids': tensor([[ 101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
"""
Calling a model without a model head
model = AutoModel.from_pretrained("rbt3", output_attentions=True)
output = model(**inputs)
output
"""
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6804, 0.6664, 0.7170, ..., -0.4102, 0.7839, -0.0262],
[-0.7378, -0.2748, 0.5034, ..., -0.1359, -0.4331, -0.5874],
[-0.0212, 0.5642, 0.1032, ..., -0.3617, 0.4646, -0.4747],
...,
[ 0.0853, 0.6679, -0.1757, ..., -0.0942, 0.4664, 0.2925],
[ 0.3336, 0.3224, -0.3355, ..., -0.3262, 0.2532, -0.2507],
[ 0.6761, 0.6688, 0.7154, ..., -0.4083, 0.7824, -0.0224]]],
grad_fn=), pooler_output=tensor([[-1.2646e-01, -9.8619e-01, -1.0000e+00, -9.8325e-01, 8.0238e-01,
-6.6268e-02, 6.6919e-02, 1.4784e-01, 9.9451e-01, 9.9995e-01,
-8.3051e-02, -1.0000e+00, -9.8865e-02, 9.9980e-01, -1.0000e+00,
9.9993e-01, 9.8291e-01, 9.5363e-01, -9.9948e-01, -1.3219e-01,
-9.9733e-01, -7.7934e-01, 1.0720e-01, 9.8040e-01, 9.9953e-01,
-9.9939e-01, -9.9997e-01, 1.4967e-01, -8.7627e-01, -9.9996e-01,
-9.9821e-01, -9.9999e-01, 1.9396e-01, -1.1277e-01, 9.9359e-01,
-9.9153e-01, 4.4752e-02, -9.8731e-01, -9.9942e-01, -9.9982e-01,
2.9360e-02, 9.9847e-01, -9.2014e-03, 9.9999e-01, 1.7111e-01,
4.5071e-03, 9.9998e-01, 9.9467e-01, 4.9726e-03, -9.0707e-01,
6.9056e-02, -1.8141e-01, -9.8831e-01, 9.9668e-01, 4.9800e-01,
1.2997e-01, 9.9895e-01, -1.0000e+00, -9.9990e-01, 9.9478e-01,
-9.9989e-01, 9.9906e-01, 9.9820e-01, 9.9990e-01, -6.8953e-01,
9.9990e-01, 9.9987e-01, 9.4563e-01, -3.7660e-01, -1.0000e+00,
1.3151e-01, -9.7371e-01, -9.9997e-01, -1.3228e-02, -2.9801e-01,
-9.9985e-01, 9.9662e-01, -2.0004e-01, 9.9997e-01, 3.6876e-01,
-9.9997e-01, 1.5462e-01, 1.9265e-01, 8.9871e-02, 9.9996e-01,
9.9998e-01, 1.5184e-01, -8.9714e-01, -2.1646e-01, -9.9922e-01,
...
1.7911e-02, 4.8672e-01],
[4.0732e-01, 3.8137e-02, 9.6832e-03, ..., 4.4490e-02,
2.2997e-02, 4.0793e-01],
[1.7047e-01, 3.6989e-02, 2.3646e-02, ..., 4.6833e-02,
2.5233e-01, 1.6721e-01]]]], grad_fn=)), cross_attentions=None)
"""
output.last_hidden_state.size()
# torch.Size([1, 12, 768])
len(inputs["input_ids"][0])
# 12
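Because the model was loaded with output_attentions=True, the attention weights are returned as well; the shapes below follow from rbt3's 3 layers and 12 heads:
len(output.attentions)
# 3  -- one tensor per layer
output.attentions[0].shape
# torch.Size([1, 12, 12, 12])  -- (batch, num_heads, seq_len, seq_len)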
Calling a model with a model head
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
clz_model = AutoModelForSequenceClassification.from_pretrained("rbt3", num_labels=10)
clz_model(**inputs)
# SequenceClassifierOutput(loss=None, logits=tensor([[-0.1776, 0.2208, -0.5060, -0.3938, -0.5837, 1.0171, -0.2616, 0.0495, 0.1728, 0.3047]], grad_fn=), hidden_states=None, attentions=None)
clz_model.config.num_labels
# 10
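If labels are passed in, the sequence-classification head also computes a cross-entropy loss; a minimal sketch (the label id 1 is just a made-up target):
import torch
labels = torch.tensor([1])  # hypothetical target class
outputs = clz_model(**inputs, labels=labels)
outputs.loss
# a scalar loss tensor over the 10 classes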
Loading an online dataset
from datasets import load_dataset
datasets = load_dataset("madao33/new-title-chinese")
datasets
'''
DatasetDict({
train: Dataset({
features: ['title', 'content'],
num_rows: 5850
})
validation: Dataset({
features: ['title', 'content'],
num_rows: 1679
})
})
'''
Loading a single task from a dataset collection
Some datasets look like a single dataset but are actually a collection of datasets, where each subset serves a different task.
boolq_dataset = load_dataset("super_glue", "boolq")
boolq_dataset
'''
DatasetDict({
train: Dataset({
features: ['question', 'passage', 'idx', 'label'],
num_rows: 9427
})
validation: Dataset({
features: ['question', 'passage', 'idx', 'label'],
num_rows: 3270
})
test: Dataset({
features: ['question', 'passage', 'idx', 'label'],
num_rows: 3245
})
})
'''
Loading by dataset split
A dataset usually ships with predefined splits (e.g. train/validation/test). You can choose to load only the train split, or just a slice of it.
dataset = load_dataset("madao33/new-title-chinese", split="train")
dataset
'''
Dataset({
features: ['title', 'content'],
num_rows: 5850
})
'''
dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
dataset
'''
Dataset({
features: ['title', 'content'],
num_rows: 90
})
'''
dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
dataset
'''
Dataset({
features: ['title', 'content'],
num_rows: 2925
})
'''
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])
dataset
'''
[Dataset({
features: ['title', 'content'],
num_rows: 2925
}),
Dataset({
features: ['title', 'content'],
num_rows: 2925
})]
'''
datasets = load_dataset("madao33/new-title-chinese")
datasets
'''
DatasetDict({
train: Dataset({
features: ['title', 'content'],
num_rows: 5850
})
validation: Dataset({
features: ['title', 'content'],
num_rows: 1679
})
})
'''
Direct indexing, with slicing supported:
datasets["train"][0]
datasets["train"][:2]
datasets["train"]["title"][:5]
datasets["train"].column_names
# ['title', 'content']
datasets["train"].features
# {'title': Value(dtype='string', id=None),
# 'content': Value(dtype='string', id=None)}
You can also call dataset.train_test_split() directly to split by a given ratio; stratify_by_column keeps the label distribution balanced across the splits:
dataset = boolq_dataset["train"]
dataset.train_test_split(test_size=0.1, stratify_by_column="label")  # classification datasets can be split with stratified labels
'''
DatasetDict({
train: Dataset({
features: ['question', 'passage', 'idx', 'label'],
num_rows: 8484
})
test: Dataset({
features: ['question', 'passage', 'idx', 'label'],
num_rows: 943
})
})
'''
Picking out specific rows by a list of indices:
# Select
datasets["train"].select([0, 1])
'''
Dataset({
features: ['title', 'content'],
num_rows: 2
})
'''
Keeping the rows that satisfy a lambda condition:
# Filter
filter_dataset = datasets["train"].filter(lambda example: "中国" in example["title"])
filter_dataset["title"][:5]
'''
['聚焦两会,世界探寻中国成功秘诀',
'望海楼中国经济的信心来自哪里',
'“中国奇迹”助力世界减贫跑出加速度',
'和音瞩目历史交汇点上的中国',
'中国风采感染世界']
'''
map applies a given function to every example to transform the data:
def add_prefix(example):
    example["title"] = 'Prefix: ' + example["title"]
    return example
prefix_dataset = datasets.map(add_prefix)
prefix_dataset["train"][:10]["title"]
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def preprocess_function(example, tokenizer=tokenizer):
    model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
    labels = tokenizer(example["title"], max_length=32, truncation=True)
    # the labels are simply the encoded title
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
processed_datasets = datasets.map(preprocess_function)
processed_datasets
'''
DatasetDict({
train: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 5850
})
validation: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 1679
})
})
'''
processed_datasets = datasets.map(preprocess_function, num_proc=4)  # speed up with multiple worker processes
processed_datasets
'''
DatasetDict({
train: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 5850
})
validation: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 1679
})
})
'''
processed_datasets = datasets.map(preprocess_function, batched=True)  # speed up with batched processing
processed_datasets
'''
DatasetDict({
train: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 5850
})
validation: Dataset({
features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 1679
})
})
'''
processed_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)  # remove_columns drops the original text columns
processed_datasets
'''
DatasetDict({
train: Dataset({
features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 5850
})
validation: Dataset({
features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 1679
})
})
'''
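A quick sanity check on the result (each processed example now carries only the fields the model needs):
processed_datasets["train"][0].keys()
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])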