【模型参数结构】

【PyTorch】state_dict详解(一层encoder是三百万参数,一层decoder是四百万参数,主要是在位置参数的嵌入上(一层位置参数在一百万))

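在 PyTorch 中,torch.nn.Module 的 state_dict 存放训练过程中需要学习的权重(weight)和偏置(bias)。state_dict 是一个 Python 字典对象,将每一层的参数名映射到对应的 tensor 张量。需要注意的是,state_dict 并不只包含卷积层和全连接层的参数:所有带可学习参数的层(例如下面列表中的 layer_norm、词嵌入 tgt_word_emb)以及已注册的持久化缓冲区(buffer,例如下面的 positional_encoding.pe)都会被收录;没有参数的层(如激活函数、dropout)则不会出现。当网络中存在 batchnorm 时(例如 vgg 网络结构),state_dict 也会存放 batchnorm 的 running_mean 和 running_var 等统计量。(原文此处指向 batchnorm 详解的参考链接已缺失。)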

encoder.linear_in.weight

encoder.linear_in.bias

encoder.layer_norm_in.weight

encoder.layer_norm_in.bias

encoder.positional_encoding.pe

encoder.layer_stack.0.slf_attn.w_qs.weight

encoder.layer_stack.0.slf_attn.w_qs.bias

encoder.layer_stack.0.slf_attn.w_ks.weight

encoder.layer_stack.0.slf_attn.w_ks.bias

encoder.layer_stack.0.slf_attn.w_vs.weight

encoder.layer_stack.0.slf_attn.w_vs.bias

encoder.layer_stack.0.slf_attn.layer_norm.weight

encoder.layer_stack.0.slf_attn.layer_norm.bias

encoder.layer_stack.0.slf_attn.fc.weight

encoder.layer_stack.0.slf_attn.fc.bias

encoder.layer_stack.0.pos_ffn.w_1.weight

encoder.layer_stack.0.pos_ffn.w_1.bias

encoder.layer_stack.0.pos_ffn.w_2.weight

encoder.layer_stack.0.pos_ffn.w_2.bias

encoder.layer_stack.0.pos_ffn.layer_norm.weight

encoder.layer_stack.0.pos_ffn.layer_norm.bias

encoder.layer_stack.1.slf_attn.w_qs.weight

encoder.layer_stack.1.slf_attn.w_qs.bias

encoder.layer_stack.1.slf_attn.w_ks.weight

encoder.layer_stack.1.slf_attn.w_ks.bias

encoder.layer_stack.1.slf_attn.w_vs.weight

encoder.layer_stack.1.slf_attn.w_vs.bias

encoder.layer_stack.1.slf_attn.layer_norm.weight

encoder.layer_stack.1.slf_attn.layer_norm.bias

encoder.layer_stack.1.slf_attn.fc.weight

encoder.layer_stack.1.slf_attn.fc.bias

encoder.layer_stack.1.pos_ffn.w_1.weight

encoder.layer_stack.1.pos_ffn.w_1.bias

encoder.layer_stack.1.pos_ffn.w_2.weight

encoder.layer_stack.1.pos_ffn.w_2.bias

encoder.layer_stack.1.pos_ffn.layer_norm.weight

encoder.layer_stack.1.pos_ffn.layer_norm.bias

encoder.layer_stack.2.slf_attn.w_qs.weight

encoder.layer_stack.2.slf_attn.w_qs.bias

encoder.layer_stack.2.slf_attn.w_ks.weight

encoder.layer_stack.2.slf_attn.w_ks.bias

encoder.layer_stack.2.slf_attn.w_vs.weight

encoder.layer_stack.2.slf_attn.w_vs.bias

encoder.layer_stack.2.slf_attn.layer_norm.weight

encoder.layer_stack.2.slf_attn.layer_norm.bias

encoder.layer_stack.2.slf_attn.fc.weight

encoder.layer_stack.2.slf_attn.fc.bias

encoder.layer_stack.2.pos_ffn.w_1.weight

encoder.layer_stack.2.pos_ffn.w_1.bias

encoder.layer_stack.2.pos_ffn.w_2.weight

encoder.layer_stack.2.pos_ffn.w_2.bias

encoder.layer_stack.2.pos_ffn.layer_norm.weight

encoder.layer_stack.2.pos_ffn.layer_norm.bias

encoder.layer_stack.3.slf_attn.w_qs.weight

encoder.layer_stack.3.slf_attn.w_qs.bias

encoder.layer_stack.3.slf_attn.w_ks.weight

encoder.layer_stack.3.slf_attn.w_ks.bias

encoder.layer_stack.3.slf_attn.w_vs.weight

encoder.layer_stack.3.slf_attn.w_vs.bias

encoder.layer_stack.3.slf_attn.layer_norm.weight

encoder.layer_stack.3.slf_attn.layer_norm.bias

encoder.layer_stack.3.slf_attn.fc.weight

encoder.layer_stack.3.slf_attn.fc.bias

encoder.layer_stack.3.pos_ffn.w_1.weight

encoder.layer_stack.3.pos_ffn.w_1.bias

encoder.layer_stack.3.pos_ffn.w_2.weight

encoder.layer_stack.3.pos_ffn.w_2.bias

encoder.layer_stack.3.pos_ffn.layer_norm.weight

encoder.layer_stack.3.pos_ffn.layer_norm.bias

encoder.layer_stack.4.slf_attn.w_qs.weight

encoder.layer_stack.4.slf_attn.w_qs.bias

encoder.layer_stack.4.slf_attn.w_ks.weight

encoder.layer_stack.4.slf_attn.w_ks.bias

encoder.layer_stack.4.slf_attn.w_vs.weight

encoder.layer_stack.4.slf_attn.w_vs.bias

encoder.layer_stack.4.slf_attn.layer_norm.weight

encoder.layer_stack.4.slf_attn.layer_norm.bias

encoder.layer_stack.4.slf_attn.fc.weight

encoder.layer_stack.4.slf_attn.fc.bias

encoder.layer_stack.4.pos_ffn.w_1.weight

encoder.layer_stack.4.pos_ffn.w_1.bias

encoder.layer_stack.4.pos_ffn.w_2.weight

encoder.layer_stack.4.pos_ffn.w_2.bias

encoder.layer_stack.4.pos_ffn.layer_norm.weight

encoder.layer_stack.4.pos_ffn.layer_norm.bias

encoder.layer_stack.5.slf_attn.w_qs.weight

encoder.layer_stack.5.slf_attn.w_qs.bias

encoder.layer_stack.5.slf_attn.w_ks.weight

encoder.layer_stack.5.slf_attn.w_ks.bias

encoder.layer_stack.5.slf_attn.w_vs.weight

encoder.layer_stack.5.slf_attn.w_vs.bias

encoder.layer_stack.5.slf_attn.layer_norm.weight

encoder.layer_stack.5.slf_attn.layer_norm.bias

encoder.layer_stack.5.slf_attn.fc.weight

encoder.layer_stack.5.slf_attn.fc.bias

encoder.layer_stack.5.pos_ffn.w_1.weight

encoder.layer_stack.5.pos_ffn.w_1.bias

encoder.layer_stack.5.pos_ffn.w_2.weight

encoder.layer_stack.5.pos_ffn.w_2.bias

encoder.layer_stack.5.pos_ffn.layer_norm.weight

encoder.layer_stack.5.pos_ffn.layer_norm.bias

decoder.tgt_word_emb.weight

decoder.positional_encoding.pe

decoder.layer_stack.0.slf_attn.w_qs.weight

decoder.layer_stack.0.slf_attn.w_qs.bias

decoder.layer_stack.0.slf_attn.w_ks.weight

decoder.layer_stack.0.slf_attn.w_ks.bias

decoder.layer_stack.0.slf_attn.w_vs.weight

decoder.layer_stack.0.slf_attn.w_vs.bias

decoder.layer_stack.0.slf_attn.layer_norm.weight

decoder.layer_stack.0.slf_attn.layer_norm.bias

decoder.layer_stack.0.slf_attn.fc.weight

decoder.layer_stack.0.slf_attn.fc.bias

decoder.layer_stack.0.enc_attn.w_qs.weight

decoder.layer_stack.0.enc_attn.w_qs.bias

decoder.layer_stack.0.enc_attn.w_ks.weight

decoder.layer_stack.0.enc_attn.w_ks.bias

decoder.layer_stack.0.enc_attn.w_vs.weight

decoder.layer_stack.0.enc_attn.w_vs.bias

decoder.layer_stack.0.enc_attn.layer_norm.weight

decoder.layer_stack.0.enc_attn.layer_norm.bias

decoder.layer_stack.0.enc_attn.fc.weight

decoder.layer_stack.0.enc_attn.fc.bias

decoder.layer_stack.0.pos_ffn.w_1.weight

decoder.layer_stack.0.pos_ffn.w_1.bias

decoder.layer_stack.0.pos_ffn.w_2.weight

decoder.layer_stack.0.pos_ffn.w_2.bias

decoder.layer_stack.0.pos_ffn.layer_norm.weight

decoder.layer_stack.0.pos_ffn.layer_norm.bias

decoder.layer_stack.1.slf_attn.w_qs.weight

decoder.layer_stack.1.slf_attn.w_qs.bias

decoder.layer_stack.1.slf_attn.w_ks.weight

decoder.layer_stack.1.slf_attn.w_ks.bias

decoder.layer_stack.1.slf_attn.w_vs.weight

decoder.layer_stack.1.slf_attn.w_vs.bias

decoder.layer_stack.1.slf_attn.layer_norm.weight

decoder.layer_stack.1.slf_attn.layer_norm.bias

decoder.layer_stack.1.slf_attn.fc.weight

decoder.layer_stack.1.slf_attn.fc.bias

decoder.layer_stack.1.enc_attn.w_qs.weight

decoder.layer_stack.1.enc_attn.w_qs.bias

decoder.layer_stack.1.enc_attn.w_ks.weight

decoder.layer_stack.1.enc_attn.w_ks.bias

decoder.layer_stack.1.enc_attn.w_vs.weight

decoder.layer_stack.1.enc_attn.w_vs.bias

decoder.layer_stack.1.enc_attn.layer_norm.weight

decoder.layer_stack.1.enc_attn.layer_norm.bias

decoder.layer_stack.1.enc_attn.fc.weight

decoder.layer_stack.1.enc_attn.fc.bias

decoder.layer_stack.1.pos_ffn.w_1.weight

decoder.layer_stack.1.pos_ffn.w_1.bias

decoder.layer_stack.1.pos_ffn.w_2.weight

decoder.layer_stack.1.pos_ffn.w_2.bias

decoder.layer_stack.1.pos_ffn.layer_norm.weight

decoder.layer_stack.1.pos_ffn.layer_norm.bias

decoder.layer_stack.2.slf_attn.w_qs.weight

decoder.layer_stack.2.slf_attn.w_qs.bias

decoder.layer_stack.2.slf_attn.w_ks.weight

decoder.layer_stack.2.slf_attn.w_ks.bias

decoder.layer_stack.2.slf_attn.w_vs.weight

decoder.layer_stack.2.slf_attn.w_vs.bias

decoder.layer_stack.2.slf_attn.layer_norm.weight

decoder.layer_stack.2.slf_attn.layer_norm.bias

decoder.layer_stack.2.slf_attn.fc.weight

decoder.layer_stack.2.slf_attn.fc.bias

decoder.layer_stack.2.enc_attn.w_qs.weight

decoder.layer_stack.2.enc_attn.w_qs.bias

decoder.layer_stack.2.enc_attn.w_ks.weight

decoder.layer_stack.2.enc_attn.w_ks.bias

decoder.layer_stack.2.enc_attn.w_vs.weight

decoder.layer_stack.2.enc_attn.w_vs.bias

decoder.layer_stack.2.enc_attn.layer_norm.weight

decoder.layer_stack.2.enc_attn.layer_norm.bias

decoder.layer_stack.2.enc_attn.fc.weight

decoder.layer_stack.2.enc_attn.fc.bias

decoder.layer_stack.2.pos_ffn.w_1.weight

decoder.layer_stack.2.pos_ffn.w_1.bias

decoder.layer_stack.2.pos_ffn.w_2.weight

decoder.layer_stack.2.pos_ffn.w_2.bias

decoder.layer_stack.2.pos_ffn.layer_norm.weight

decoder.layer_stack.2.pos_ffn.layer_norm.bias

decoder.layer_stack.3.slf_attn.w_qs.weight

decoder.layer_stack.3.slf_attn.w_qs.bias

decoder.layer_stack.3.slf_attn.w_ks.weight

decoder.layer_stack.3.slf_attn.w_ks.bias

decoder.layer_stack.3.slf_attn.w_vs.weight

decoder.layer_stack.3.slf_attn.w_vs.bias

decoder.layer_stack.3.slf_attn.layer_norm.weight

decoder.layer_stack.3.slf_attn.layer_norm.bias

decoder.layer_stack.3.slf_attn.fc.weight

decoder.layer_stack.3.slf_attn.fc.bias

decoder.layer_stack.3.enc_attn.w_qs.weight

decoder.layer_stack.3.enc_attn.w_qs.bias

decoder.layer_stack.3.enc_attn.w_ks.weight

decoder.layer_stack.3.enc_attn.w_ks.bias

decoder.layer_stack.3.enc_attn.w_vs.weight

decoder.layer_stack.3.enc_attn.w_vs.bias

decoder.layer_stack.3.enc_attn.layer_norm.weight

decoder.layer_stack.3.enc_attn.layer_norm.bias

decoder.layer_stack.3.enc_attn.fc.weight

decoder.layer_stack.3.enc_attn.fc.bias

decoder.layer_stack.3.pos_ffn.w_1.weight

decoder.layer_stack.3.pos_ffn.w_1.bias

decoder.layer_stack.3.pos_ffn.w_2.weight

decoder.layer_stack.3.pos_ffn.w_2.bias

decoder.layer_stack.3.pos_ffn.layer_norm.weight

decoder.layer_stack.3.pos_ffn.layer_norm.bias

decoder.layer_stack.4.slf_attn.w_qs.weight

decoder.layer_stack.4.slf_attn.w_qs.bias

decoder.layer_stack.4.slf_attn.w_ks.weight

decoder.layer_stack.4.slf_attn.w_ks.bias

decoder.layer_stack.4.slf_attn.w_vs.weight

decoder.layer_stack.4.slf_attn.w_vs.bias

decoder.layer_stack.4.slf_attn.layer_norm.weight

decoder.layer_stack.4.slf_attn.layer_norm.bias

decoder.layer_stack.4.slf_attn.fc.weight

decoder.layer_stack.4.slf_attn.fc.bias

decoder.layer_stack.4.enc_attn.w_qs.weight

decoder.layer_stack.4.enc_attn.w_qs.bias

decoder.layer_stack.4.enc_attn.w_ks.weight

decoder.layer_stack.4.enc_attn.w_ks.bias

decoder.layer_stack.4.enc_attn.w_vs.weight

decoder.layer_stack.4.enc_attn.w_vs.bias

decoder.layer_stack.4.enc_attn.layer_norm.weight

decoder.layer_stack.4.enc_attn.layer_norm.bias

decoder.layer_stack.4.enc_attn.fc.weight

decoder.layer_stack.4.enc_attn.fc.bias

decoder.layer_stack.4.pos_ffn.w_1.weight

decoder.layer_stack.4.pos_ffn.w_1.bias

decoder.layer_stack.4.pos_ffn.w_2.weight

decoder.layer_stack.4.pos_ffn.w_2.bias

decoder.layer_stack.4.pos_ffn.layer_norm.weight

decoder.layer_stack.4.pos_ffn.layer_norm.bias

decoder.layer_stack.5.slf_attn.w_qs.weight

decoder.layer_stack.5.slf_attn.w_qs.bias

decoder.layer_stack.5.slf_attn.w_ks.weight

decoder.layer_stack.5.slf_attn.w_ks.bias

decoder.layer_stack.5.slf_attn.w_vs.weight

decoder.layer_stack.5.slf_attn.w_vs.bias

decoder.layer_stack.5.slf_attn.layer_norm.weight

decoder.layer_stack.5.slf_attn.layer_norm.bias

decoder.layer_stack.5.slf_attn.fc.weight

decoder.layer_stack.5.slf_attn.fc.bias

decoder.layer_stack.5.enc_attn.w_qs.weight

decoder.layer_stack.5.enc_attn.w_qs.bias

decoder.layer_stack.5.enc_attn.w_ks.weight

decoder.layer_stack.5.enc_attn.w_ks.bias

decoder.layer_stack.5.enc_attn.w_vs.weight

decoder.layer_stack.5.enc_attn.w_vs.bias

decoder.layer_stack.5.enc_attn.layer_norm.weight

decoder.layer_stack.5.enc_attn.layer_norm.bias

decoder.layer_stack.5.enc_attn.fc.weight

decoder.layer_stack.5.enc_attn.fc.bias

decoder.layer_stack.5.pos_ffn.w_1.weight

decoder.layer_stack.5.pos_ffn.w_1.bias

decoder.layer_stack.5.pos_ffn.w_2.weight

decoder.layer_stack.5.pos_ffn.w_2.bias

decoder.layer_stack.5.pos_ffn.layer_norm.weight

decoder.layer_stack.5.pos_ffn.layer_norm.bias

decoder.tgt_word_prj.weight

你可能感兴趣的:(【模型参数结构】)