【PyTorch】state_dict详解(备注:一层encoder约三百万参数,一层decoder约四百万参数;参数主要是在位置参数的嵌入上,其中一层位置参数约一百万)
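在PyTorch中,torch.nn.Module的state_dict存放训练过程中需要学习的权重(weight)和偏置(bias)系数。state_dict是一个Python字典对象(OrderedDict),它将每一层的参数名映射到对应的tensor张量。需要注意的是,state_dict只包含带有可学习参数的层(例如卷积层、全连接层、LayerNorm等)的参数,以及通过register_buffer注册的持久缓冲区(buffer);没有参数的层(例如激活函数、池化层)不会出现在其中。当网络中存在batchnorm时,例如带BN的vgg网络结构(如vgg16_bn),state_dict也会存放batchnorm的running_mean、running_var等统计量(以及num_batches_tracked)。关于batchnorm的详解可参见PyTorch官方文档。下面列出的是一个包含6层encoder和6层decoder的Transformer模型的state_dict全部键名,其中的positional_encoding.pe很可能就是以buffer形式保存的位置编码: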
encoder.linear_in.weight
encoder.linear_in.bias
encoder.layer_norm_in.weight
encoder.layer_norm_in.bias
encoder.positional_encoding.pe
encoder.layer_stack.0.slf_attn.w_qs.weight
encoder.layer_stack.0.slf_attn.w_qs.bias
encoder.layer_stack.0.slf_attn.w_ks.weight
encoder.layer_stack.0.slf_attn.w_ks.bias
encoder.layer_stack.0.slf_attn.w_vs.weight
encoder.layer_stack.0.slf_attn.w_vs.bias
encoder.layer_stack.0.slf_attn.layer_norm.weight
encoder.layer_stack.0.slf_attn.layer_norm.bias
encoder.layer_stack.0.slf_attn.fc.weight
encoder.layer_stack.0.slf_attn.fc.bias
encoder.layer_stack.0.pos_ffn.w_1.weight
encoder.layer_stack.0.pos_ffn.w_1.bias
encoder.layer_stack.0.pos_ffn.w_2.weight
encoder.layer_stack.0.pos_ffn.w_2.bias
encoder.layer_stack.0.pos_ffn.layer_norm.weight
encoder.layer_stack.0.pos_ffn.layer_norm.bias
encoder.layer_stack.1.slf_attn.w_qs.weight
encoder.layer_stack.1.slf_attn.w_qs.bias
encoder.layer_stack.1.slf_attn.w_ks.weight
encoder.layer_stack.1.slf_attn.w_ks.bias
encoder.layer_stack.1.slf_attn.w_vs.weight
encoder.layer_stack.1.slf_attn.w_vs.bias
encoder.layer_stack.1.slf_attn.layer_norm.weight
encoder.layer_stack.1.slf_attn.layer_norm.bias
encoder.layer_stack.1.slf_attn.fc.weight
encoder.layer_stack.1.slf_attn.fc.bias
encoder.layer_stack.1.pos_ffn.w_1.weight
encoder.layer_stack.1.pos_ffn.w_1.bias
encoder.layer_stack.1.pos_ffn.w_2.weight
encoder.layer_stack.1.pos_ffn.w_2.bias
encoder.layer_stack.1.pos_ffn.layer_norm.weight
encoder.layer_stack.1.pos_ffn.layer_norm.bias
encoder.layer_stack.2.slf_attn.w_qs.weight
encoder.layer_stack.2.slf_attn.w_qs.bias
encoder.layer_stack.2.slf_attn.w_ks.weight
encoder.layer_stack.2.slf_attn.w_ks.bias
encoder.layer_stack.2.slf_attn.w_vs.weight
encoder.layer_stack.2.slf_attn.w_vs.bias
encoder.layer_stack.2.slf_attn.layer_norm.weight
encoder.layer_stack.2.slf_attn.layer_norm.bias
encoder.layer_stack.2.slf_attn.fc.weight
encoder.layer_stack.2.slf_attn.fc.bias
encoder.layer_stack.2.pos_ffn.w_1.weight
encoder.layer_stack.2.pos_ffn.w_1.bias
encoder.layer_stack.2.pos_ffn.w_2.weight
encoder.layer_stack.2.pos_ffn.w_2.bias
encoder.layer_stack.2.pos_ffn.layer_norm.weight
encoder.layer_stack.2.pos_ffn.layer_norm.bias
encoder.layer_stack.3.slf_attn.w_qs.weight
encoder.layer_stack.3.slf_attn.w_qs.bias
encoder.layer_stack.3.slf_attn.w_ks.weight
encoder.layer_stack.3.slf_attn.w_ks.bias
encoder.layer_stack.3.slf_attn.w_vs.weight
encoder.layer_stack.3.slf_attn.w_vs.bias
encoder.layer_stack.3.slf_attn.layer_norm.weight
encoder.layer_stack.3.slf_attn.layer_norm.bias
encoder.layer_stack.3.slf_attn.fc.weight
encoder.layer_stack.3.slf_attn.fc.bias
encoder.layer_stack.3.pos_ffn.w_1.weight
encoder.layer_stack.3.pos_ffn.w_1.bias
encoder.layer_stack.3.pos_ffn.w_2.weight
encoder.layer_stack.3.pos_ffn.w_2.bias
encoder.layer_stack.3.pos_ffn.layer_norm.weight
encoder.layer_stack.3.pos_ffn.layer_norm.bias
encoder.layer_stack.4.slf_attn.w_qs.weight
encoder.layer_stack.4.slf_attn.w_qs.bias
encoder.layer_stack.4.slf_attn.w_ks.weight
encoder.layer_stack.4.slf_attn.w_ks.bias
encoder.layer_stack.4.slf_attn.w_vs.weight
encoder.layer_stack.4.slf_attn.w_vs.bias
encoder.layer_stack.4.slf_attn.layer_norm.weight
encoder.layer_stack.4.slf_attn.layer_norm.bias
encoder.layer_stack.4.slf_attn.fc.weight
encoder.layer_stack.4.slf_attn.fc.bias
encoder.layer_stack.4.pos_ffn.w_1.weight
encoder.layer_stack.4.pos_ffn.w_1.bias
encoder.layer_stack.4.pos_ffn.w_2.weight
encoder.layer_stack.4.pos_ffn.w_2.bias
encoder.layer_stack.4.pos_ffn.layer_norm.weight
encoder.layer_stack.4.pos_ffn.layer_norm.bias
encoder.layer_stack.5.slf_attn.w_qs.weight
encoder.layer_stack.5.slf_attn.w_qs.bias
encoder.layer_stack.5.slf_attn.w_ks.weight
encoder.layer_stack.5.slf_attn.w_ks.bias
encoder.layer_stack.5.slf_attn.w_vs.weight
encoder.layer_stack.5.slf_attn.w_vs.bias
encoder.layer_stack.5.slf_attn.layer_norm.weight
encoder.layer_stack.5.slf_attn.layer_norm.bias
encoder.layer_stack.5.slf_attn.fc.weight
encoder.layer_stack.5.slf_attn.fc.bias
encoder.layer_stack.5.pos_ffn.w_1.weight
encoder.layer_stack.5.pos_ffn.w_1.bias
encoder.layer_stack.5.pos_ffn.w_2.weight
encoder.layer_stack.5.pos_ffn.w_2.bias
encoder.layer_stack.5.pos_ffn.layer_norm.weight
encoder.layer_stack.5.pos_ffn.layer_norm.bias
decoder.tgt_word_emb.weight
decoder.positional_encoding.pe
decoder.layer_stack.0.slf_attn.w_qs.weight
decoder.layer_stack.0.slf_attn.w_qs.bias
decoder.layer_stack.0.slf_attn.w_ks.weight
decoder.layer_stack.0.slf_attn.w_ks.bias
decoder.layer_stack.0.slf_attn.w_vs.weight
decoder.layer_stack.0.slf_attn.w_vs.bias
decoder.layer_stack.0.slf_attn.layer_norm.weight
decoder.layer_stack.0.slf_attn.layer_norm.bias
decoder.layer_stack.0.slf_attn.fc.weight
decoder.layer_stack.0.slf_attn.fc.bias
decoder.layer_stack.0.enc_attn.w_qs.weight
decoder.layer_stack.0.enc_attn.w_qs.bias
decoder.layer_stack.0.enc_attn.w_ks.weight
decoder.layer_stack.0.enc_attn.w_ks.bias
decoder.layer_stack.0.enc_attn.w_vs.weight
decoder.layer_stack.0.enc_attn.w_vs.bias
decoder.layer_stack.0.enc_attn.layer_norm.weight
decoder.layer_stack.0.enc_attn.layer_norm.bias
decoder.layer_stack.0.enc_attn.fc.weight
decoder.layer_stack.0.enc_attn.fc.bias
decoder.layer_stack.0.pos_ffn.w_1.weight
decoder.layer_stack.0.pos_ffn.w_1.bias
decoder.layer_stack.0.pos_ffn.w_2.weight
decoder.layer_stack.0.pos_ffn.w_2.bias
decoder.layer_stack.0.pos_ffn.layer_norm.weight
decoder.layer_stack.0.pos_ffn.layer_norm.bias
decoder.layer_stack.1.slf_attn.w_qs.weight
decoder.layer_stack.1.slf_attn.w_qs.bias
decoder.layer_stack.1.slf_attn.w_ks.weight
decoder.layer_stack.1.slf_attn.w_ks.bias
decoder.layer_stack.1.slf_attn.w_vs.weight
decoder.layer_stack.1.slf_attn.w_vs.bias
decoder.layer_stack.1.slf_attn.layer_norm.weight
decoder.layer_stack.1.slf_attn.layer_norm.bias
decoder.layer_stack.1.slf_attn.fc.weight
decoder.layer_stack.1.slf_attn.fc.bias
decoder.layer_stack.1.enc_attn.w_qs.weight
decoder.layer_stack.1.enc_attn.w_qs.bias
decoder.layer_stack.1.enc_attn.w_ks.weight
decoder.layer_stack.1.enc_attn.w_ks.bias
decoder.layer_stack.1.enc_attn.w_vs.weight
decoder.layer_stack.1.enc_attn.w_vs.bias
decoder.layer_stack.1.enc_attn.layer_norm.weight
decoder.layer_stack.1.enc_attn.layer_norm.bias
decoder.layer_stack.1.enc_attn.fc.weight
decoder.layer_stack.1.enc_attn.fc.bias
decoder.layer_stack.1.pos_ffn.w_1.weight
decoder.layer_stack.1.pos_ffn.w_1.bias
decoder.layer_stack.1.pos_ffn.w_2.weight
decoder.layer_stack.1.pos_ffn.w_2.bias
decoder.layer_stack.1.pos_ffn.layer_norm.weight
decoder.layer_stack.1.pos_ffn.layer_norm.bias
decoder.layer_stack.2.slf_attn.w_qs.weight
decoder.layer_stack.2.slf_attn.w_qs.bias
decoder.layer_stack.2.slf_attn.w_ks.weight
decoder.layer_stack.2.slf_attn.w_ks.bias
decoder.layer_stack.2.slf_attn.w_vs.weight
decoder.layer_stack.2.slf_attn.w_vs.bias
decoder.layer_stack.2.slf_attn.layer_norm.weight
decoder.layer_stack.2.slf_attn.layer_norm.bias
decoder.layer_stack.2.slf_attn.fc.weight
decoder.layer_stack.2.slf_attn.fc.bias
decoder.layer_stack.2.enc_attn.w_qs.weight
decoder.layer_stack.2.enc_attn.w_qs.bias
decoder.layer_stack.2.enc_attn.w_ks.weight
decoder.layer_stack.2.enc_attn.w_ks.bias
decoder.layer_stack.2.enc_attn.w_vs.weight
decoder.layer_stack.2.enc_attn.w_vs.bias
decoder.layer_stack.2.enc_attn.layer_norm.weight
decoder.layer_stack.2.enc_attn.layer_norm.bias
decoder.layer_stack.2.enc_attn.fc.weight
decoder.layer_stack.2.enc_attn.fc.bias
decoder.layer_stack.2.pos_ffn.w_1.weight
decoder.layer_stack.2.pos_ffn.w_1.bias
decoder.layer_stack.2.pos_ffn.w_2.weight
decoder.layer_stack.2.pos_ffn.w_2.bias
decoder.layer_stack.2.pos_ffn.layer_norm.weight
decoder.layer_stack.2.pos_ffn.layer_norm.bias
decoder.layer_stack.3.slf_attn.w_qs.weight
decoder.layer_stack.3.slf_attn.w_qs.bias
decoder.layer_stack.3.slf_attn.w_ks.weight
decoder.layer_stack.3.slf_attn.w_ks.bias
decoder.layer_stack.3.slf_attn.w_vs.weight
decoder.layer_stack.3.slf_attn.w_vs.bias
decoder.layer_stack.3.slf_attn.layer_norm.weight
decoder.layer_stack.3.slf_attn.layer_norm.bias
decoder.layer_stack.3.slf_attn.fc.weight
decoder.layer_stack.3.slf_attn.fc.bias
decoder.layer_stack.3.enc_attn.w_qs.weight
decoder.layer_stack.3.enc_attn.w_qs.bias
decoder.layer_stack.3.enc_attn.w_ks.weight
decoder.layer_stack.3.enc_attn.w_ks.bias
decoder.layer_stack.3.enc_attn.w_vs.weight
decoder.layer_stack.3.enc_attn.w_vs.bias
decoder.layer_stack.3.enc_attn.layer_norm.weight
decoder.layer_stack.3.enc_attn.layer_norm.bias
decoder.layer_stack.3.enc_attn.fc.weight
decoder.layer_stack.3.enc_attn.fc.bias
decoder.layer_stack.3.pos_ffn.w_1.weight
decoder.layer_stack.3.pos_ffn.w_1.bias
decoder.layer_stack.3.pos_ffn.w_2.weight
decoder.layer_stack.3.pos_ffn.w_2.bias
decoder.layer_stack.3.pos_ffn.layer_norm.weight
decoder.layer_stack.3.pos_ffn.layer_norm.bias
decoder.layer_stack.4.slf_attn.w_qs.weight
decoder.layer_stack.4.slf_attn.w_qs.bias
decoder.layer_stack.4.slf_attn.w_ks.weight
decoder.layer_stack.4.slf_attn.w_ks.bias
decoder.layer_stack.4.slf_attn.w_vs.weight
decoder.layer_stack.4.slf_attn.w_vs.bias
decoder.layer_stack.4.slf_attn.layer_norm.weight
decoder.layer_stack.4.slf_attn.layer_norm.bias
decoder.layer_stack.4.slf_attn.fc.weight
decoder.layer_stack.4.slf_attn.fc.bias
decoder.layer_stack.4.enc_attn.w_qs.weight
decoder.layer_stack.4.enc_attn.w_qs.bias
decoder.layer_stack.4.enc_attn.w_ks.weight
decoder.layer_stack.4.enc_attn.w_ks.bias
decoder.layer_stack.4.enc_attn.w_vs.weight
decoder.layer_stack.4.enc_attn.w_vs.bias
decoder.layer_stack.4.enc_attn.layer_norm.weight
decoder.layer_stack.4.enc_attn.layer_norm.bias
decoder.layer_stack.4.enc_attn.fc.weight
decoder.layer_stack.4.enc_attn.fc.bias
decoder.layer_stack.4.pos_ffn.w_1.weight
decoder.layer_stack.4.pos_ffn.w_1.bias
decoder.layer_stack.4.pos_ffn.w_2.weight
decoder.layer_stack.4.pos_ffn.w_2.bias
decoder.layer_stack.4.pos_ffn.layer_norm.weight
decoder.layer_stack.4.pos_ffn.layer_norm.bias
decoder.layer_stack.5.slf_attn.w_qs.weight
decoder.layer_stack.5.slf_attn.w_qs.bias
decoder.layer_stack.5.slf_attn.w_ks.weight
decoder.layer_stack.5.slf_attn.w_ks.bias
decoder.layer_stack.5.slf_attn.w_vs.weight
decoder.layer_stack.5.slf_attn.w_vs.bias
decoder.layer_stack.5.slf_attn.layer_norm.weight
decoder.layer_stack.5.slf_attn.layer_norm.bias
decoder.layer_stack.5.slf_attn.fc.weight
decoder.layer_stack.5.slf_attn.fc.bias
decoder.layer_stack.5.enc_attn.w_qs.weight
decoder.layer_stack.5.enc_attn.w_qs.bias
decoder.layer_stack.5.enc_attn.w_ks.weight
decoder.layer_stack.5.enc_attn.w_ks.bias
decoder.layer_stack.5.enc_attn.w_vs.weight
decoder.layer_stack.5.enc_attn.w_vs.bias
decoder.layer_stack.5.enc_attn.layer_norm.weight
decoder.layer_stack.5.enc_attn.layer_norm.bias
decoder.layer_stack.5.enc_attn.fc.weight
decoder.layer_stack.5.enc_attn.fc.bias
decoder.layer_stack.5.pos_ffn.w_1.weight
decoder.layer_stack.5.pos_ffn.w_1.bias
decoder.layer_stack.5.pos_ffn.w_2.weight
decoder.layer_stack.5.pos_ffn.w_2.bias
decoder.layer_stack.5.pos_ffn.layer_norm.weight
decoder.layer_stack.5.pos_ffn.layer_norm.bias
decoder.tgt_word_prj.weight