def model_info(model):
    """Print a line-by-line description of a model's parameters.

    Writes a header row, then one row per entry of
    ``model.named_parameters()`` (row index, name with any 'module_list.'
    substring removed, ``requires_grad`` flag, element count, shape, mean and
    standard deviation), and finally a summary line with the number of rows,
    the total element count, and the element count of the parameters that
    require gradients.

    Args:
        model: Object exposing ``parameters()`` and ``named_parameters()``
            whose items provide ``numel()``, ``requires_grad``, ``shape``,
            ``mean()`` and ``std()`` -- presumably a ``torch.nn.Module``;
            confirm against callers.

    Returns:
        None. All output goes to stdout via ``print``.
    """
    # Materialise once so both totals are computed from the same snapshot
    # instead of walking the model twice.
    params = list(model.parameters())
    n_p = sum(x.numel() for x in params)  # total number of elements
    n_g = sum(x.numel() for x in params if x.requires_grad)  # trainable elements

    print('\n%5s %50s %9s %12s %20s %12s %12s' % (
        'layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma'))

    # Tracked separately from the loop variable: a model without parameters
    # never binds `i`, and the summary line must still be printable.
    n_layers = 0
    for i, (name, p) in enumerate(model.named_parameters()):
        name = name.replace('module_list.', '')
        print('%5g %50s %9s %12g %20s %12.3g %12.3g' % (
            i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std()))
        n_layers = i + 1

    # NOTE: '%g' keeps six significant digits, so large counts are rendered in
    # scientific notation; the format is kept as-is for output compatibility.
    print('Model Summary: %g layers, %g parameters, %g gradients\n' % (n_layers, n_p, n_g))
layer name gradient parameters shape mu sigma
0 masked_spec_embed True 768 [768] 0.157 0.181
1 feature_extractor.conv_layers.0.conv.weight True 5120 [512, 1, 10] -0.000329 0.131
2 feature_extractor.conv_layers.0.layer_norm.weight True 512 [512] 0.171 0.273
3 feature_extractor.conv_layers.0.layer_norm.bias True 512 [512] -7.12e-05 0.00285
4 feature_extractor.conv_layers.1.conv.weight True 786432 [512, 512, 3] -0.0115 0.0739
5 feature_extractor.conv_layers.2.conv.weight True 786432 [512, 512, 3] 0.00327 0.0743
6 feature_extractor.conv_layers.3.conv.weight True 786432 [512, 512, 3] 0.00151 0.0614
7 feature_extractor.conv_layers.4.conv.weight True 786432 [512, 512, 3] 0.0015 0.0546
8 feature_extractor.conv_layers.5.conv.weight True 524288 [512, 512, 2] 0.000704 0.0278
9 feature_extractor.conv_layers.6.conv.weight True 524288 [512, 512, 2] -3.92e-05 0.0121
10 feature_projection.layer_norm.weight True 512 [512] 0.0661 0.0889
11 feature_projection.layer_norm.bias True 512 [512] 0.0062 0.0364
12 feature_projection.projection.weight True 393216 [768, 512] -2.67e-05 0.064
13 feature_projection.projection.bias True 768 [768] -0.00662 0.0885
14 encoder.pos_conv_embed.conv.bias True 768 [768] 0.164 0.101
15 encoder.pos_conv_embed.conv.weight_g True 128 [1, 1, 128] 0.413 1.21
16 encoder.pos_conv_embed.conv.weight_v True 4.71859e+06 [768, 48, 128] 0.0108 0.0697
17 encoder.layer_norm.weight True 768 [768] 0.235 0.0622
18 encoder.layer_norm.bias True 768 [768] 0.00208 0.0479
19 encoder.layers.0.attention.k_proj.weight True 589824 [768, 768] -5.36e-06 0.0775
20 encoder.layers.0.attention.k_proj.bias True 768 [768] 6.14e-06 0.000761
21 encoder.layers.0.attention.v_proj.weight True 589824 [768, 768] -3.31e-05 0.0292
22 encoder.layers.0.attention.v_proj.bias True 768 [768] -8.7e-05 0.00806
23 encoder.layers.0.attention.q_proj.weight True 589824 [768, 768] 0.000193 0.0755
24 encoder.layers.0.attention.q_proj.bias True 768 [768] 0.00174 0.206
25 encoder.layers.0.attention.out_proj.weight True 589824 [768, 768] -2.14e-05 0.0274
26 encoder.layers.0.attention.out_proj.bias True 768 [768] 0.000558 0.012
27 encoder.layers.0.layer_norm.weight True 768 [768] 0.216 0.0821
28 encoder.layers.0.layer_norm.bias True 768 [768] 0.00546 0.14
29 encoder.layers.0.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] -0.00206 0.0873
30 encoder.layers.0.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0697 0.0333
31 encoder.layers.0.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 1.42e-05 0.0801
32 encoder.layers.0.feed_forward.output_dense.bias True 768 [768] 7.4e-05 0.0482
33 encoder.layers.0.final_layer_norm.weight True 768 [768] 0.223 0.0773
34 encoder.layers.0.final_layer_norm.bias True 768 [768] -0.00179 0.0367
35 encoder.layers.1.attention.k_proj.weight True 589824 [768, 768] 6.55e-05 0.0885
36 encoder.layers.1.attention.k_proj.bias True 768 [768] 2.53e-05 0.00099
37 encoder.layers.1.attention.v_proj.weight True 589824 [768, 768] -5.96e-08 0.0346
38 encoder.layers.1.attention.v_proj.bias True 768 [768] -9.4e-05 0.0115
39 encoder.layers.1.attention.q_proj.weight True 589824 [768, 768] 1.37e-06 0.0859
40 encoder.layers.1.attention.q_proj.bias True 768 [768] 0.000253 0.232
41 encoder.layers.1.attention.out_proj.weight True 589824 [768, 768] -1.32e-05 0.0357
42 encoder.layers.1.attention.out_proj.bias True 768 [768] -0.00039 0.0116
43 encoder.layers.1.layer_norm.weight True 768 [768] 0.258 0.0585
44 encoder.layers.1.layer_norm.bias True 768 [768] -0.00402 0.222
45 encoder.layers.1.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.000182 0.0748
46 encoder.layers.1.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0264 0.022
47 encoder.layers.1.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -1.21e-05 0.0751
48 encoder.layers.1.feed_forward.output_dense.bias True 768 [768] -0.000961 0.0447
49 encoder.layers.1.final_layer_norm.weight True 768 [768] 0.23 0.111
50 encoder.layers.1.final_layer_norm.bias True 768 [768] -0.000677 0.0528
51 encoder.layers.2.attention.k_proj.weight True 589824 [768, 768] -6.86e-05 0.0992
52 encoder.layers.2.attention.k_proj.bias True 768 [768] -0.000327 0.00937
53 encoder.layers.2.attention.v_proj.weight True 589824 [768, 768] -1.21e-05 0.0449
54 encoder.layers.2.attention.v_proj.bias True 768 [768] -0.00103 0.0218
55 encoder.layers.2.attention.q_proj.weight True 589824 [768, 768] -8.58e-06 0.0974
56 encoder.layers.2.attention.q_proj.bias True 768 [768] 0.0175 0.223
57 encoder.layers.2.attention.out_proj.weight True 589824 [768, 768] -4.12e-05 0.0527
58 encoder.layers.2.attention.out_proj.bias True 768 [768] 0.00016 0.0389
59 encoder.layers.2.layer_norm.weight True 768 [768] 0.258 0.09
60 encoder.layers.2.layer_norm.bias True 768 [768] -0.0118 0.235
61 encoder.layers.2.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.000607 0.0685
62 encoder.layers.2.feed_forward.intermediate_dense.bias True 3072 [3072] -0.011 0.0197
63 encoder.layers.2.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -1.16e-05 0.0753
64 encoder.layers.2.feed_forward.output_dense.bias True 768 [768] 0.00127 0.0426
65 encoder.layers.2.final_layer_norm.weight True 768 [768] 0.225 0.122
66 encoder.layers.2.final_layer_norm.bias True 768 [768] 0.00127 0.0593
67 encoder.layers.3.attention.k_proj.weight True 589824 [768, 768] 8.05e-05 0.1
68 encoder.layers.3.attention.k_proj.bias True 768 [768] -0.000279 0.0136
69 encoder.layers.3.attention.v_proj.weight True 589824 [768, 768] -2.31e-05 0.0499
70 encoder.layers.3.attention.v_proj.bias True 768 [768] 8.39e-05 0.0223
71 encoder.layers.3.attention.q_proj.weight True 589824 [768, 768] 1.8e-05 0.0986
72 encoder.layers.3.attention.q_proj.bias True 768 [768] -0.00636 0.218
73 encoder.layers.3.attention.out_proj.weight True 589824 [768, 768] -2.86e-05 0.0576
74 encoder.layers.3.attention.out_proj.bias True 768 [768] 0.00205 0.0389
75 encoder.layers.3.layer_norm.weight True 768 [768] 0.255 0.11
76 encoder.layers.3.layer_norm.bias True 768 [768] -0.0176 0.192
77 encoder.layers.3.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.00107 0.0739
78 encoder.layers.3.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0166 0.0188
79 encoder.layers.3.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 8.76e-06 0.0784
80 encoder.layers.3.feed_forward.output_dense.bias True 768 [768] -0.00051 0.0531
81 encoder.layers.3.final_layer_norm.weight True 768 [768] 0.229 0.124
82 encoder.layers.3.final_layer_norm.bias True 768 [768] 0.0021 0.0558
83 encoder.layers.4.attention.k_proj.weight True 589824 [768, 768] -4.17e-07 0.0995
84 encoder.layers.4.attention.k_proj.bias True 768 [768] -4.83e-05 0.0132
85 encoder.layers.4.attention.v_proj.weight True 589824 [768, 768] -0.000119 0.054
86 encoder.layers.4.attention.v_proj.bias True 768 [768] -0.000524 0.0226
87 encoder.layers.4.attention.q_proj.weight True 589824 [768, 768] 0.000148 0.0986
88 encoder.layers.4.attention.q_proj.bias True 768 [768] 0.00137 0.21
89 encoder.layers.4.attention.out_proj.weight True 589824 [768, 768] 0.000232 0.0593
90 encoder.layers.4.attention.out_proj.bias True 768 [768] 0.00272 0.0331
91 encoder.layers.4.layer_norm.weight True 768 [768] 0.261 0.119
92 encoder.layers.4.layer_norm.bias True 768 [768] -0.0064 0.161
93 encoder.layers.4.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.000638 0.0836
94 encoder.layers.4.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0254 0.021
95 encoder.layers.4.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 9.26e-05 0.0796
96 encoder.layers.4.feed_forward.output_dense.bias True 768 [768] -0.00357 0.0703
97 encoder.layers.4.final_layer_norm.weight True 768 [768] 0.234 0.129
98 encoder.layers.4.final_layer_norm.bias True 768 [768] -3.08e-05 0.0477
99 encoder.layers.5.attention.k_proj.weight True 589824 [768, 768] -7.06e-05 0.103
100 encoder.layers.5.attention.k_proj.bias True 768 [768] 0.000275 0.0147
101 encoder.layers.5.attention.v_proj.weight True 589824 [768, 768] -6.85e-06 0.0573
102 encoder.layers.5.attention.v_proj.bias True 768 [768] 0.00131 0.0245
103 encoder.layers.5.attention.q_proj.weight True 589824 [768, 768] -2.87e-05 0.102
104 encoder.layers.5.attention.q_proj.bias True 768 [768] -0.00252 0.224
105 encoder.layers.5.attention.out_proj.weight True 589824 [768, 768] 8.57e-05 0.0584
106 encoder.layers.5.attention.out_proj.bias True 768 [768] 0.00103 0.0413
107 encoder.layers.5.layer_norm.weight True 768 [768] 0.265 0.117
108 encoder.layers.5.layer_norm.bias True 768 [768] -0.00187 0.153
109 encoder.layers.5.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.000599 0.0862
110 encoder.layers.5.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0287 0.0255
111 encoder.layers.5.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 8.17e-05 0.0793
112 encoder.layers.5.feed_forward.output_dense.bias True 768 [768] -0.00293 0.084
113 encoder.layers.5.final_layer_norm.weight True 768 [768] 0.239 0.139
114 encoder.layers.5.final_layer_norm.bias True 768 [768] -0.00332 0.0482
115 encoder.layers.6.attention.k_proj.weight True 589824 [768, 768] 6.85e-06 0.109
116 encoder.layers.6.attention.k_proj.bias True 768 [768] -1.12e-05 0.0208
117 encoder.layers.6.attention.v_proj.weight True 589824 [768, 768] -4.18e-05 0.0613
118 encoder.layers.6.attention.v_proj.bias True 768 [768] -5.96e-08 0.0321
119 encoder.layers.6.attention.q_proj.weight True 589824 [768, 768] 0.000114 0.11
120 encoder.layers.6.attention.q_proj.bias True 768 [768] 0.00426 0.246
121 encoder.layers.6.attention.out_proj.weight True 589824 [768, 768] -6.79e-05 0.0636
122 encoder.layers.6.attention.out_proj.bias True 768 [768] 0.00273 0.0557
123 encoder.layers.6.layer_norm.weight True 768 [768] 0.272 0.144
124 encoder.layers.6.layer_norm.bias True 768 [768] 0.002 0.154
125 encoder.layers.6.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 0.000262 0.0887
126 encoder.layers.6.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0326 0.027
127 encoder.layers.6.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 7.65e-05 0.0806
128 encoder.layers.6.feed_forward.output_dense.bias True 768 [768] -0.00427 0.0779
129 encoder.layers.6.final_layer_norm.weight True 768 [768] 0.233 0.135
130 encoder.layers.6.final_layer_norm.bias True 768 [768] 0.00141 0.0457
131 encoder.layers.7.attention.k_proj.weight True 589824 [768, 768] -4.63e-05 0.107
132 encoder.layers.7.attention.k_proj.bias True 768 [768] -0.00101 0.029
133 encoder.layers.7.attention.v_proj.weight True 589824 [768, 768] 4.66e-05 0.0656
134 encoder.layers.7.attention.v_proj.bias True 768 [768] 0.000915 0.0385
135 encoder.layers.7.attention.q_proj.weight True 589824 [768, 768] 2.57e-05 0.11
136 encoder.layers.7.attention.q_proj.bias True 768 [768] -0.0104 0.288
137 encoder.layers.7.attention.out_proj.weight True 589824 [768, 768] -2.15e-06 0.0695
138 encoder.layers.7.attention.out_proj.bias True 768 [768] 0.00116 0.065
139 encoder.layers.7.layer_norm.weight True 768 [768] 0.331 0.173
140 encoder.layers.7.layer_norm.bias True 768 [768] 0.00152 0.187
141 encoder.layers.7.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] 1.28e-05 0.0919
142 encoder.layers.7.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0281 0.028
143 encoder.layers.7.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -0.000171 0.0869
144 encoder.layers.7.feed_forward.output_dense.bias True 768 [768] 0.000748 0.06
145 encoder.layers.7.final_layer_norm.weight True 768 [768] 0.239 0.0971
146 encoder.layers.7.final_layer_norm.bias True 768 [768] 0.0107 0.0452
147 encoder.layers.8.attention.k_proj.weight True 589824 [768, 768] 0.000248 0.097
148 encoder.layers.8.attention.k_proj.bias True 768 [768] -0.00115 0.0342
149 encoder.layers.8.attention.v_proj.weight True 589824 [768, 768] -0.00012 0.067
150 encoder.layers.8.attention.v_proj.bias True 768 [768] -0.000697 0.0251
151 encoder.layers.8.attention.q_proj.weight True 589824 [768, 768] -0.000623 0.0947
152 encoder.layers.8.attention.q_proj.bias True 768 [768] -0.0194 0.37
153 encoder.layers.8.attention.out_proj.weight True 589824 [768, 768] -5.51e-05 0.0668
154 encoder.layers.8.attention.out_proj.bias True 768 [768] 0.000147 0.0273
155 encoder.layers.8.layer_norm.weight True 768 [768] 0.382 0.135
156 encoder.layers.8.layer_norm.bias True 768 [768] 0.0116 0.183
157 encoder.layers.8.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] -0.000806 0.0948
158 encoder.layers.8.feed_forward.intermediate_dense.bias True 3072 [3072] -0.034 0.0372
159 encoder.layers.8.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -0.000215 0.0906
160 encoder.layers.8.feed_forward.output_dense.bias True 768 [768] 0.000484 0.0594
161 encoder.layers.8.final_layer_norm.weight True 768 [768] 0.249 0.0863
162 encoder.layers.8.final_layer_norm.bias True 768 [768] 0.0137 0.0431
163 encoder.layers.9.attention.k_proj.weight True 589824 [768, 768] 0.000131 0.0997
164 encoder.layers.9.attention.k_proj.bias True 768 [768] 0.00228 0.0609
165 encoder.layers.9.attention.v_proj.weight True 589824 [768, 768] 3.81e-06 0.0687
166 encoder.layers.9.attention.v_proj.bias True 768 [768] 0.000367 0.0243
167 encoder.layers.9.attention.q_proj.weight True 589824 [768, 768] -0.00083 0.0971
168 encoder.layers.9.attention.q_proj.bias True 768 [768] -0.0193 0.387
169 encoder.layers.9.attention.out_proj.weight True 589824 [768, 768] 9.18e-06 0.0662
170 encoder.layers.9.attention.out_proj.bias True 768 [768] -0.000442 0.0258
171 encoder.layers.9.layer_norm.weight True 768 [768] 0.384 0.121
172 encoder.layers.9.layer_norm.bias True 768 [768] 0.0104 0.154
173 encoder.layers.9.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] -0.000376 0.0916
174 encoder.layers.9.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0458 0.0424
175 encoder.layers.9.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -0.000207 0.0867
176 encoder.layers.9.feed_forward.output_dense.bias True 768 [768] -4.89e-05 0.0767
177 encoder.layers.9.final_layer_norm.weight True 768 [768] 0.267 0.0851
178 encoder.layers.9.final_layer_norm.bias True 768 [768] 0.0141 0.0424
179 encoder.layers.10.attention.k_proj.weight True 589824 [768, 768] -0.000166 0.102
180 encoder.layers.10.attention.k_proj.bias True 768 [768] 0.00567 0.197
181 encoder.layers.10.attention.v_proj.weight True 589824 [768, 768] -9.74e-05 0.0701
182 encoder.layers.10.attention.v_proj.bias True 768 [768] -3.96e-05 0.0279
183 encoder.layers.10.attention.q_proj.weight True 589824 [768, 768] 0.000499 0.0995
184 encoder.layers.10.attention.q_proj.bias True 768 [768] 0.0118 0.397
185 encoder.layers.10.attention.out_proj.weight True 589824 [768, 768] -3.64e-05 0.0627
186 encoder.layers.10.attention.out_proj.bias True 768 [768] 0.000674 0.034
187 encoder.layers.10.layer_norm.weight True 768 [768] 0.273 0.108
188 encoder.layers.10.layer_norm.bias True 768 [768] 0.0186 0.115
189 encoder.layers.10.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] -0.00206 0.0864
190 encoder.layers.10.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0515 0.0597
191 encoder.layers.10.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] -0.000139 0.0803
192 encoder.layers.10.feed_forward.output_dense.bias True 768 [768] -0.00299 0.0764
193 encoder.layers.10.final_layer_norm.weight True 768 [768] 0.287 0.0898
194 encoder.layers.10.final_layer_norm.bias True 768 [768] 0.00797 0.0608
195 encoder.layers.11.attention.k_proj.weight True 589824 [768, 768] 0.000238 0.0995
196 encoder.layers.11.attention.k_proj.bias True 768 [768] -0.00132 0.126
197 encoder.layers.11.attention.v_proj.weight True 589824 [768, 768] 0.000263 0.055
198 encoder.layers.11.attention.v_proj.bias True 768 [768] 0.000306 0.0411
199 encoder.layers.11.attention.q_proj.weight True 589824 [768, 768] -0.000443 0.0971
200 encoder.layers.11.attention.q_proj.bias True 768 [768] -0.032 0.399
201 encoder.layers.11.attention.out_proj.weight True 589824 [768, 768] -1.29e-05 0.0497
202 encoder.layers.11.attention.out_proj.bias True 768 [768] 0.000419 0.0539
203 encoder.layers.11.layer_norm.weight True 768 [768] 0.292 0.092
204 encoder.layers.11.layer_norm.bias True 768 [768] 0.0268 0.159
205 encoder.layers.11.feed_forward.intermediate_dense.weight True 2.3593e+06 [3072, 768] -0.00429 0.0738
206 encoder.layers.11.feed_forward.intermediate_dense.bias True 3072 [3072] -0.0948 0.0643
207 encoder.layers.11.feed_forward.output_dense.weight True 2.3593e+06 [768, 3072] 5.78e-06 0.0471
208 encoder.layers.11.feed_forward.output_dense.bias True 768 [768] 0.000294 0.0265
209 encoder.layers.11.final_layer_norm.weight True 768 [768] 0.184 0.0941
210 encoder.layers.11.final_layer_norm.bias True 768 [768] -0.000411 0.0287
Model Summary: 211 layers, 9.43717e+07 parameters, 9.43717e+07 gradients