First, here is a snippet that calls the T5 model:
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
#input_ids = tokenizer('translate English to Chinese: I love you China.')
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
The output is:
Das Haus ist wunderbar.
The model consists of a shared Embedding, an encoder T5Stack made up of 6 T5Blocks, and a decoder T5Stack made up of 6 T5Blocks:
T5ForConditionalGeneration(
(shared): Embedding(32128, 512)
(encoder): T5Stack(
(embed_tokens): Embedding(32128, 512)
(block): ModuleList(
(0): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
(relative_attention_bias): Embedding(32, 8)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(1): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(2): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(3): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(4): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(5): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(final_layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(decoder): T5Stack(
(embed_tokens): Embedding(32128, 512)
(block): ModuleList(
(0): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
(relative_attention_bias): Embedding(32, 8)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(1): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(2): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(3): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(4): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(5): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=512, bias=False)
(k): Linear(in_features=512, out_features=512, bias=False)
(v): Linear(in_features=512, out_features=512, bias=False)
(o): Linear(in_features=512, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseReluDense(
(wi): Linear(in_features=512, out_features=2048, bias=False)
(wo): Linear(in_features=2048, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(final_layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(lm_head): Linear(in_features=512, out_features=32128, bias=False)
)
To summarize, the T5 model structure is:
T5ForConditionalGeneration(
  shared: Embedding(32128, 512)
  encoder: T5Stack of 6 T5Blocks, each T5Block containing
  (
    T5LayerSelfAttention
      (4 linear layers q/k/v/o; only the first block has a relative_attention_bias Embedding(32, 8))
    T5LayerNorm
    Dropout(0.1)
    T5LayerFF
      Linear(512, 2048, bias=False)
      Linear(2048, 512, bias=False)
      Dropout(0.1)
    T5LayerNorm
    Dropout(0.1)
  )
  decoder embedding: Embedding(32128, 512) (the same shared embedding)
  decoder: T5Stack of 6 T5Blocks, each T5Block containing
  (
    T5LayerSelfAttention
    T5LayerNorm
    Dropout(0.1)
    T5LayerCrossAttention
    T5LayerNorm
    Dropout(0.1)
    T5LayerFF
      Linear(512, 2048, bias=False)
      Linear(2048, 512, bias=False)
      Dropout(0.1)
    T5LayerNorm
    Dropout(0.1)
  )
  final_layer_norm: T5LayerNorm
  Dropout(0.1)
  lm_head: Linear(512, 32128, bias=False)
)
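As a quick check (a small sketch of my own, reusing the model object loaded above), the block counts and the output head can be confirmed directly:
print(len(model.encoder.block), len(model.decoder.block))  # 6 6
print(model.lm_head)  # Linear(in_features=512, out_features=32128, bias=False)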
Note that every linear layer inside the attention modules has bias=False.
The key parts to focus on are the network structures of T5LayerSelfAttention and T5LayerCrossAttention.
The usage example above can also be found in the docstrings of the transformers library.
During initialization:
self.model_dim = 512
config.vocab_size = 32128
config.d_model = 512
For reference, here is the full content of the config:
T5Config {
"_name_or_path": "t5-small",
"architectures": [
"T5WithLMHeadModel"
],
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"transformers_version": "4.10.3",
"use_cache": true,
"vocab_size": 32128
}
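Two side notes of my own: the dump above is simply what print(model.config) shows, and task_specific_params are per-task default generation settings whose prefix entries are plain strings prepended to the input text. A small sketch reusing the tokenizer loaded earlier:
# Reproduce the dump and build the translation input from the config's task prefix
print(model.config)
prefix = model.config.task_specific_params["translation_en_to_de"]["prefix"]
input_ids = tokenizer(prefix + "The house is wonderful.", return_tensors="pt").input_ids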
Next, look at the self.encoder and self.decoder parts.
For self.encoder, the config is copied and then adjusted (in the Hugging Face source, encoder_config = copy.deepcopy(config)):
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
The adjusted config is then passed into a T5Stack together with the shared embedding:
self.encoder = T5Stack(encoder_config, self.shared)
Then decoder_config goes through a similar set of operations:
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = T5Stack(decoder_config, self.shared)
Finally, there is a projection back to the vocabulary (the LM head):
self.lm_head = nn.Linear(config.d_model,config.vocab_size,bias=False)
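A side note (my own check, not part of the original walkthrough): the shared embedding passed into both T5Stacks is literally the same module, and with the library's default weight tying the lm_head weight points at it as well (assumption: tie_word_embeddings is left at its default True for t5-small):
print(model.encoder.embed_tokens is model.shared)   # True, the same module object
print(model.decoder.embed_tokens is model.shared)   # True
print(model.lm_head.weight is model.shared.weight)  # True under default weight tying (assumption)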
Before the forward pass, it is worth looking at the input_ids being fed in:
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
Next, generate is called, and it is generate that eventually calls the forward function. At that point the relevant arguments are:
use_cache = True
return_dict = True
head_mask = None
decoder_head_mask = None
First, the code checks head_mask and decoder_head_mask:
if head_mask is not None and decoder_head_mask is None:
    if self.config.num_layers == self.config.num_decoder_layers:
        warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
        decoder_head_mask = head_mask
That is, if head_mask is not None while decoder_head_mask is None, decoder_head_mask is set to head_mask (with a deprecation warning).
Now look at the forward function of T5ForConditionalGeneration:
def forward(
self,
input_ids=None,
attention_mask=None,
decoder_input_ids=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
encoder_outputs=None,
past_key_values=None,
inputs_embeds=None,
decoder_inputs_embeds=None,
labels=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
......
Since encoder_outputs is what gets used downstream, let's start by printing its content:
BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.0154, 0.1263, 0.0301, ..., -0.0117, 0.0373, 0.1015],
[-0.1926, -0.1285, 0.0228, ..., -0.0339, 0.0535, 0.1575],
[ 0.0109, -0.0210, 0.0022, ..., 0.0008, -0.0056, -0.0393],
...,
[-0.1581, -0.0719, 0.0208, ..., -0.1778, 0.1037, -0.1703],
[ 0.0142, -0.1430, 0.0148, ..., 0.0224, -0.1906, -0.0547],
[ 0.0756, -0.0119, -0.0273, ..., -0.0044, -0.0505, 0.0554]]]), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)
The encoder output turns out to be wrapped in a BaseModelOutputWithPastAndCrossAttentions object, so the content of encoder_outputs is determined by that class.
To understand it, look at where BaseModelOutputWithPastAndCrossAttentions comes from:
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
and at its class definition in modeling_outputs.py:
class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
This shows the attributes available on BaseModelOutputWithPastAndCrossAttentions.
Print the shape of encoder_outputs.last_hidden_state:
print(encoder_outputs.last_hidden_state.size())
The printed shape is (1, 11, 512), while the original input_ids has shape (1, 11).
Inside this forward call, however, input_ids is None and the (1, 11, 512) encoder output arrives pre-computed via encoder_outputs, so we need to trace how those values are produced.
The key is therefore to find where model.generate is defined and how it is called.
However, generate is nowhere to be found in modeling_t5.py itself, so a quick way to locate it is to print the bound method directly:
print(model.generate)
which prints a bound method:
model.generate =
Tracing upward through the class hierarchy, the method is found via the pretrained-model base class,
defined in transformers' modeling_utils.py:
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):
Following GenerationMixin leads to generation_utils.py:
class GenerationMixin:
......
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.LongTensor] = None,
......
**model_kwargs,
) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
Its docstring reads:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
multinomial sampling, beam-search decoding, and beam-search multinomial sampling.
In other words, generate takes input_ids plus the generation parameters and produces the output token ids.
Since the process of producing those output ids is fairly involved, we skip over generate for now and continue with the rest of the code; a simplified stand-in is sketched below.
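As a stand-in, here is a minimal greedy-decoding sketch of my own; it is not the library's generate() implementation (which also covers sampling and beam search), and it reuses model, tokenizer and input_ids from the snippet at the top of this post:
import torch

decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])  # T5 starts from id 0
with torch.no_grad():
    for _ in range(20):  # assumed cap, mirroring the small default max_length
        out = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)      # greedy pick
        decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=-1)
        if next_token.item() == model.config.eos_token_id:                  # stop at </s> (id 1)
            break
print(tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True))
# expected to match generate(): Das Haus ist wunderbar.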
The docstring example for T5Model helps in understanding this part:
from transformers import T5Tokenizer, T5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')
input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
last_hidden_states = outputs.last_hidden_state
Two inputs are needed: input_ids and decoder_input_ids.
That is, the encoder and the decoder each receive their own input. Let's try constructing input_ids by hand:
input_ids = torch.tensor([[13959, 1566, 12, 2968, 10, 37, 629, 19, 1627, 5, 1]])
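As a sanity check (my own, assuming the same t5-small tokenizer), these ids are exactly what the tokenizer produces for the original translation prompt:
print(tokenizer('translate English to German: The house is wonderful.').input_ids)
# [13959, 1566, 12, 2968, 10, 37, 629, 19, 1627, 5, 1]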
Passing it through the encoder then produces the same output as before:
if encoder_outputs is None:
# Convert encoder inputs in embeddings if needed
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
The resulting output is:
encoder_outputs =
BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.0154, 0.1263, 0.0301, ..., -0.0117, 0.0373, 0.1015],
[-0.1926, -0.1285, 0.0228, ..., -0.0339, 0.0535, 0.1575],
[ 0.0109, -0.0210, 0.0022, ..., 0.0008, -0.0056, -0.0393],
...,
[-0.1581, -0.0719, 0.0208, ..., -0.1778, 0.1037, -0.1703],
[ 0.0142, -0.1430, 0.0148, ..., 0.0224, -0.1906, -0.0547],
[ 0.0756, -0.0119, -0.0273, ..., -0.0044, -0.0505, 0.0554]]]), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)
Next, the encoder output tensor is taken out:
hidden_states = encoder_outputs[0]
The resulting hidden_states has shape (1, 11, 512).
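Why does encoder_outputs[0] work? BaseModelOutputWithPastAndCrossAttentions is a ModelOutput, so its fields can be read either by attribute or by index, where indexing skips fields that are None. A quick sketch:
print(encoder_outputs[0] is encoder_outputs.last_hidden_state)  # True
print(encoder_outputs.last_hidden_state.size())                 # torch.Size([1, 11, 512])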
During generation, tokens are then fed in one at a time. The original ids are
input_ids = tensor([[13959, 1566, 12, 2968, 10, 37, 629, 19, 1627, 5, 1]])
while the input_ids / decoder_input_ids observed at each decoding step are:
input_ids = tensor([[0]])
decoder_input_ids = tensor([[0]])
input_ids = tensor([[644]])
decoder_input_ids = tensor([[644]])
input_ids = tensor([[4598]])
input_ids = tensor([[229]])
input_ids = tensor([[19250]])
input_ids = tensor([[5]])
It looks like these input_ids / decoder_input_ids are the tokens predicted so far, fed back into the decoder one step at a time; the sketch below shows why only a single token is passed per step.
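During generation the encoder runs only once, and every later step feeds the decoder just the newest token together with the cached past_key_values. A simplified sketch of my own of what generate() does under the hood (reusing model and input_ids from above):
import torch

# Run the encoder once and reuse its output at every decoding step
encoder_outputs = model.get_encoder()(input_ids=input_ids, return_dict=True)

# First decoder step: only the decoder_start_token_id (0 for T5)
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
out = model(encoder_outputs=encoder_outputs,
            decoder_input_ids=decoder_input_ids, use_cache=True)

# Later steps: feed only the newly predicted token plus the cached key/value states
next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
out = model(encoder_outputs=encoder_outputs,
            decoder_input_ids=next_token,
            past_key_values=out.past_key_values, use_cache=True)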
Both self.encoder and self.decoder are instances of the same T5Stack class; only the configs passed in differ:
self.encoder = T5Stack(encoder_config,self.shared)
self.decoder = T5Stack(decoder_config,self.shared)
Next, step into T5Stack to look at its structure, starting with the forward method:
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
With the values in this (encoder) call:
use_cache = False
output_attentions = False
output_hidden_states = False
return_dict = True
Next, look at the input shape:
batch_size,seq_length = input_shape
which gives:
batch_size = 1
seq_length = 11
Then mask_seq_length is computed:
mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
which gives:
mask_seq_length = 11
Next, look at attention_mask:
if attention_mask is None:
attention_mask = torch.ones(batch_size,mask_seq_length).to(inputs_embeds.device)
When no mask is supplied, attention_mask is filled with ones of shape (batch_size, mask_seq_length); for the encoder call with mask_seq_length = 11 it is a (1, 11) tensor of ones (the single-element value below comes from a call where mask_seq_length is 1):
attention_mask = tensor([[1.]])
The next piece is not used in the encoder; it only applies in the decoder:
if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
encoder_seq_length = encoder_hidden_states.shape[1]
encoder_attention_mask = torch.ones(
batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
)
Next comes the initialization of past_key_values:
past_key_values = [None]*len(self.block)
Here len(self.block) is 6, so after this line past_key_values becomes:
past_key_values = [None,None,None,None,None,None]
Next, extended_attention_mask is computed:
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)
To see what this produces, I experimented with an input padded at the end; the resulting extended_attention_mask is:
extended_attention_mask =
tensor([[[[ -0., -0., -0., -0., -0., -0., -0.,
-0., -0., -0., -0., -10000., -10000., -10000.,
-10000.]]]])
If the tail of the sequence is padding (mask value 0), those positions become -10000 in the extended mask, while valid positions become 0; a short sketch of the equivalent computation follows.
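For an encoder (non-causal) mask, get_extended_attention_mask essentially broadcasts the mask and rescales it additively; here is a sketch of my own of the equivalent computation (assuming the -10000.0 additive value used by this transformers version):
import torch

mask = torch.tensor([[1, 1, 1, 1, 0, 0]])      # 1 = real token, 0 = padding
extended = mask[:, None, None, :].float()      # shape (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -10000.0         # 0 for real tokens, -10000 for padding
print(extended)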
Continuing, the loop iterates over the T5Blocks:
for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
Each layer_module taken from self.block is a T5Block, whose sub-layers are built like this:
self.layer = nn.ModuleList()
self.layer.append(T5LayerSelfAttention(config,has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(T5LayerCrossAttention(config))
self.layer.append(T5LayerFF(config))
From this, a T5Block consists of (confirmed by the quick check below):
T5LayerSelfAttention
T5LayerCrossAttention (decoder only)
T5LayerFF
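This also matches the printed model structure above; a quick check of my own:
print(len(model.encoder.block[0].layer))  # 2: self-attention + feed-forward
print(len(model.decoder.block[0].layer))  # 3: self-attention + cross-attention + feed-forward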
First, look at the structure of T5LayerSelfAttention:
self.SelfAttention = T5Attention(config,has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
Here the hidden states must first pass through a layer normalization layer, and only then through the T5Attention module:
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
    ......
)
(The cross-attention layer, T5LayerCrossAttention, calls self.EncDecAttention in exactly the same way.)
For comparison, take a look at bert4keras.
In the bert4keras implementation, the input before the layer normalization is kept so that it can be added back as a residual:
# Self Attention
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
x = self.apply(
inputs=[x, x, x, position_bias],
layer=MultiHeadAttention,
arguments={
'p_bias': 't5_relative'},
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
use_bias=False,
attention_scale=False,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
…
Here the residual connection keeps the original input (xi) and adds it back after attention and dropout.
Returning to T5LayerSelfAttention, it uses exactly the same pre-norm + residual structure:
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
Next, let's look at what happens inside the self.SelfAttention layer.