Layernorm反向梯度

\mu =\frac{1}{H}\sum_{i=1}^{H}x_{i}

\sigma ^{2}=\frac{1}{H}\sum_{i=1}^{H}\left ( x_{i}-\mu \right )^{2}

\hat{x}=\frac{x-\mu }{\sqrt{\sigma ^{2}+\varepsilon }}

y=g\odot \hat{x}+b \qquad \text{(}\odot\text{ 表示 element-wise 乘法)}

则输入的梯度为:

\begin{align*} \frac{\partial L}{\partial x_{i}} &=\sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot \frac{\partial y_{j}}{\partial \hat{x}_{j}}\cdot \frac{\partial \hat{x}_{j}}{\partial x_{i}} \\ &= \sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot \frac{\partial \hat{x}_{j}}{\partial x_{i}} \end{align*}

\begin{align*}\frac{\partial \hat{x}_{j}}{\partial x_{i}} &= \frac{\partial }{\partial x_{i}}\left ( \frac{x_{j}-\mu_{j} }{\sqrt{\sigma_{j} ^{2}+\varepsilon }} \right )\\ &=\frac{1}{\sqrt{\sigma_{j} ^{2}+\varepsilon }}\frac{\partial x_{j}}{\partial x_{i}}-\frac{1}{\sqrt{\sigma_{j} ^{2}+\varepsilon }}\frac{\partial \mu_{j} }{\partial x_{i}}\\ &+\left ( x_{j}-\mu_{j} \right )\cdot \left ( -\frac{1}{2} \right )\cdot \left ( \sigma_{j} ^{2}+\varepsilon \right )^{-\frac{3}{2}}\frac{\partial \sigma_{j} ^{2}}{\partial x_{i}} \end{align*}

当j=i时,

\frac{\partial x_{i}}{\partial x_{i}} =1

j≠i时,

 \frac{\partial x_{j}}{\partial x_{i}} =0

\frac{\partial \mu_{j} }{\partial x_{i}} = \frac{\partial}{\partial x_{i}}\left ( \frac{1}{H}\sum_{k=1}^{H}x_{k} \right )_{j}

虽然是求和,但是只有k=i时有值,所以:

\frac{\partial \mu_{j} }{\partial x_{i}} = \frac{1}{H}

同样地,对求和逐项求导:x_{k} 对 x_{i} 的直接依赖只在 k=i 时非零,但每一项还通过均值 \mu 间接依赖 x_{i},所以:

\begin{align*}\frac{\partial \sigma_{j} ^{2}}{\partial x_{i}} &= \frac{\partial}{\partial x_{i}}\left [ \frac{1}{H}\sum_{k=1}^{H}\left ( x_{k}-\mu_{j} \right )^{2} \right ]\\ &=\frac{2}{H}\left ( x_{i}-\mu_{j} \right )+\frac{1}{H}\sum_{k=1}^{H}2\cdot \left ( x_{k}-\mu_{j} \right )\left (-\frac{\partial \mu_{j} }{\partial x_{i}} \right ) \\ &= \frac{2}{H}\left ( x_{i}-\mu_{j} \right )-\frac{1}{H}\sum_{k=1}^{H}2\cdot \left ( x_{k}-\mu_{j} \right ) \frac{\partial \mu_{j} }{\partial x_{i}} \end{align*}

所以:

\begin{align*} \frac{\partial L}{\partial x_{i}} &= \sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot \frac{1}{\sqrt{\sigma ^{2}+\varepsilon }}\cdot \frac{\partial x_{j}}{\partial x_{i}}\\ &-\sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot \frac{1}{\sqrt{\sigma ^{2}+\varepsilon }}\cdot \frac{1}{H} \\ &+\sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot\left ( x_{j}-\mu \right )\cdot \left ( -\frac{1}{2} \right ) \left (\sigma ^{2}+\varepsilon \right ) ^{-\frac{3}{2}}\cdot\frac{2}{H}\left ( x_{i}-\mu \right )\\&-\frac{1}{H}\sum_{k=1}^{H}\left [\sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot\left ( x_{j}-\mu \right )\cdot \left ( -\frac{1}{2} \right ) \left (\sigma ^{2}+\varepsilon \right ) ^{-\frac{3}{2}} \right ]\frac{2}{H}\left ( x_{k}-\mu \right ) \end{align*}

\begin{align*} \frac{\partial L}{\partial x_{i}} &= \left ( \sigma ^{2}+\varepsilon \right )^{-\frac{1}{2}}\frac{\partial L}{\partial y_{i}}\cdot g_{i}\\ &-\left ( \sigma ^{2}+\varepsilon \right )^{-\frac{1}{2}}\cdot \frac{1}{H} \cdot \sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j} \\ &-\left ( \sigma ^{2}+\varepsilon \right )^{-\frac{1}{2}}\cdot \frac{1}{H}\cdot \left ( x_{i}-\mu \right )\cdot \sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot\frac{ x_{j}-\mu}{\sigma ^{2}+\varepsilon} \\&+\left ( \sigma ^{2}+\varepsilon \right )^{-\frac{1}{2}}\cdot\frac{1}{H}\cdot \sum_{k=1}^{H}\frac{1}{H}\cdot \left ( x_{k}-\mu \right )\cdot \left [\sum_{j=1}^{H}\frac{\partial L}{\partial y_{j}}\cdot g_{j}\cdot \frac{x_{j}-\mu}{\sigma ^{2}+\varepsilon} \right ] \end{align*}

你可能感兴趣的:(深度学习)