import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
%matplotlib inline
plt.rcParams['figure.figsize'] = (7, 3.5)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['axes.unicode_minus'] = False  # make sure minus signs on the axes render correctly
Rectified Linear Unit (ReLU)
$$\text{relu}(x) = \max(0, x) = \begin{cases} x, & x > 0 \\ 0, & x \leq 0 \end{cases}$$
$f(x)$ is continuous. To check differentiability at $x = 0$:

$$f'(0) = \lim_{h\to 0}\frac{f(0 + h)-f(0)}{h} = \lim_{h\to 0}\frac{\max(0, h) - 0}{h}$$

$$\lim_{h\to0^-}\frac{\max(0, h)}{h} = \frac{0}{h} = 0, \qquad \lim_{h\to0^+}\frac{\max(0, h)}{h} = \frac{h}{h} = 1$$

The one-sided limits disagree, so $f$ is not differentiable at $x = 0$. Elsewhere,

$$f'(x) = \begin{cases} 1, & x > 0 \\ 0, & x < 0 \end{cases}$$
The ReLU activation is a simple computation: if the input is greater than 0, it returns the input unchanged; if the input is 0 or less, it returns 0.
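As a minimal sketch (added here, not part of the original), the same element-wise rule can be written in several equivalent ways, and the built-in torch.relu treats the gradient at exactly 0 as 0:

x = torch.tensor([-2.0, 0.0, 3.0], requires_grad=True)

# three equivalent ways of computing relu element-wise
y1 = torch.relu(x)
y2 = torch.clamp(x, min=0)
y3 = torch.where(x > 0, x, torch.zeros_like(x))
assert torch.equal(y1, y2) and torch.equal(y1, y3)

# the built-in backward uses 0 as the (sub)gradient at x = 0
y1.sum().backward()
print(x.grad)  # tensor([0., 0., 1.])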
class SelfDefinedRelu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp):
        # save the input so backward can recompute the 0/1 mask
        ctx.save_for_backward(inp)
        return torch.where(inp < 0., torch.zeros_like(inp), inp)

    @staticmethod
    def backward(ctx, grad_output):
        inp, = ctx.saved_tensors
        # chain rule: gradient is 1 where the input was non-negative, 0 elsewhere
        return grad_output * torch.where(inp < 0., torch.zeros_like(inp),
                                         torch.ones_like(inp))
class Relu(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        out = SelfDefinedRelu.apply(x)
        return out
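As an optional sanity check (not in the original notebook), torch.autograd.gradcheck compares the analytical backward of a custom Function against finite differences; it needs double-precision inputs, and the test points below are deliberately kept away from 0, where ReLU is not differentiable:

check_inp = torch.tensor([-2.0, -0.5, 0.7, 3.0],
                         dtype=torch.double, requires_grad=True)
print(torch.autograd.gradcheck(SelfDefinedRelu.apply, (check_inp,)))  # should print True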
# self defined
torch.manual_seed(0)
relu = Relu() # SelfDefinedRelu
inp = torch.randn(5, requires_grad=True)
out = relu((inp).pow(3))
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([3.6594, 0.0000, 0.0000, 0.1837, 0.0000],
grad_fn=)
First call
tensor([7.1240, 0.0000, 0.0000, 0.9693, 0.0000])
Second call
tensor([14.2480, 0.0000, 0.0000, 1.9387, 0.0000])
Call after zeroing gradients
tensor([7.1240, 0.0000, 0.0000, 0.9693, 0.0000])
# torch defined
torch.manual_seed(0)
inp = torch.randn(5, requires_grad=True)
out = torch.relu((inp).pow(3))
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([3.6594, 0.0000, 0.0000, 0.1837, 0.0000], grad_fn=)
First call
tensor([7.1240, 0.0000, 0.0000, 0.9693, 0.0000])
Second call
tensor([14.2480, 0.0000, 0.0000, 1.9387, 0.0000])
Call after zeroing gradients
tensor([7.1240, 0.0000, 0.0000, 0.9693, 0.0000])
# visualization
inp = torch.arange(-8, 8, 0.05, requires_grad=True)
out = relu(inp)
out.sum().backward()
inp_grad = inp.grad
plt.plot(inp.detach().numpy(),
         out.detach().numpy(),
         label=r"$relu(x)$",
         alpha=0.7)
plt.plot(inp.detach().numpy(),
         inp_grad.numpy(),
         label=r"$relu'(x)$",
         alpha=0.5)
plt.scatter(0, 0, color='None', marker='o', edgecolors='r', s=50)
plt.grid()
plt.legend()
plt.show()
$$\text{leaky\_relu}(x) = \max(\alpha x, x) = \begin{cases} x, & x \ge 0 \\ \alpha x, & x < 0 \end{cases}, \quad \alpha \in [0, 1]$$

When $\alpha = 0$, $\text{leaky\_relu}$ reduces to $\text{relu}$.

The derivative is

$$f'(x) = \begin{cases} 1, & x \ge 0 \\ \alpha, & x < 0 \end{cases}$$
The value of the hyperparameter $\alpha$ has also been studied in many experiments. One approach is to sample $\alpha$ at random from a uniform distribution during training and fix it at test time; this method is called Randomized LeakyReLU (RReLU). The original paper reports that Randomized LeakyReLU gets better results than LeakyReLU, and gives an empirical value of 1/5.5 for $\alpha$ (better than 0.01). As for why the randomized version works better, one explanation is that the random slope on the negative side injects randomness into the optimization; this noise can help the parameters escape local optima and saddle points, a topic that would take a whole article of its own. Precisely because the value of $\alpha$ matters so much, people were not content with sampling it at random: another line of work makes $\alpha$ a learnable parameter, and the resulting activation is PReLU (Parametric ReLU).
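For reference (a sketch, not part of the original code), PyTorch provides both variants out of the box: F.rrelu / nn.RReLU sample the negative slope from a uniform range during training and use its midpoint at evaluation, while nn.PReLU learns the slope as a parameter:

torch.manual_seed(0)
x = torch.randn(5)

# RReLU: slope drawn from U(lower, upper) while training=True
print(F.rrelu(x, lower=1. / 8, upper=1. / 3, training=True))

# PReLU: the slope is a learnable parameter (stored in .weight, init 0.25)
prelu = nn.PReLU(num_parameters=1, init=0.25)
print(prelu(x), prelu.weight)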
class SelfDefinedLeakyRelu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp, alpha):
        ctx.constant = alpha
        ctx.save_for_backward(inp)
        return torch.where(inp < 0., alpha * inp, inp)

    @staticmethod
    def backward(ctx, grad_output):
        inp, = ctx.saved_tensors
        ones_like_inp = torch.ones_like(inp)
        # chain rule: scale the incoming gradient by the local derivative
        # (alpha on the negative side, 1 elsewhere); None is the grad for alpha
        return grad_output * torch.where(inp < 0., ones_like_inp * ctx.constant,
                                         ones_like_inp), None
class LeakyRelu(nn.Module):
    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        out = SelfDefinedLeakyRelu.apply(x, self.alpha)
        return out
# self defined
torch.manual_seed(0)
alpha = 0.1  # larger than the default 0.01 so the effect is easier to see in the plots
leaky_relu = LeakyRelu(alpha=alpha) # SelfDefinedLeakyRelu
inp = torch.randn(5, requires_grad=True)
out = leaky_relu((inp).pow(3))
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([ 3.6594e+00, -2.5264e-03, -1.0343e+00, 1.8367e-01, -1.2756e-01],
grad_fn=)
First call
tensor([7.1240, 0.0258, 1.4241, 0.9693, 0.3529])
Second call
tensor([14.2480, 0.0517, 2.8483, 1.9387, 0.7057])
Call after zeroing gradients
tensor([7.1240, 0.0258, 1.4241, 0.9693, 0.3529])
# torch defined
torch.manual_seed(0)
inp = torch.randn(5, requires_grad=True)
out = F.leaky_relu((inp).pow(3), negative_slope=alpha)
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([ 3.6594e+00, -2.5264e-03, -1.0343e+00, 1.8367e-01, -1.2756e-01],
grad_fn=)
First call
tensor([7.1240, 0.0258, 1.4241, 0.9693, 0.3529])
Second call
tensor([14.2480, 0.0517, 2.8483, 1.9387, 0.7057])
Call after zeroing gradients
tensor([7.1240, 0.0258, 1.4241, 0.9693, 0.3529])
# visualization
inp = torch.arange(-8, 8, 0.05, requires_grad=True)
out = leaky_relu(inp)
out.sum().backward()
inp_grad = inp.grad
plt.plot(inp.detach().numpy(),
         out.detach().numpy(),
         label=r"$leakyrelu(x)$",
         alpha=0.7)
plt.plot(inp.detach().numpy(),
         inp_grad.numpy(),
         label=r"$leakyrelu'(x)$",
         alpha=0.5)
plt.scatter(0, 0, color='None', marker='o', edgecolors='r', s=50)
plt.grid()
plt.legend()
plt.show()
class SelfDefinedPRelu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp, alpha):
        ctx.constant = alpha
        ctx.save_for_backward(inp)
        return torch.where(inp < 0., alpha * inp, inp)

    @staticmethod
    def backward(ctx, grad_output):
        inp, = ctx.saved_tensors
        ones_like_inp = torch.ones_like(inp)
        # gradient w.r.t. the input: alpha on the negative side, 1 elsewhere
        grad_inp = grad_output * torch.where(inp < 0., ones_like_inp * ctx.constant,
                                             ones_like_inp)
        # gradient w.r.t. alpha: d(alpha * x)/d(alpha) = x on the negative side
        grad_alpha = None
        if ctx.needs_input_grad[1]:
            grad_alpha = (grad_output * torch.where(inp < 0., inp,
                                                    torch.zeros_like(inp))).sum().view(1)
        return grad_inp, grad_alpha


class PRelu(nn.Module):
    def __init__(self):
        super().__init__()
        # alpha is learned, so register it as a Parameter
        self.alpha = nn.Parameter(torch.randn(1, dtype=torch.float32))

    def forward(self, x):
        out = SelfDefinedPRelu.apply(x, self.alpha)
        return out
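A quick smoke test (a sketch, assuming the PRelu module defined above) to confirm that the learnable slope actually receives a gradient:

torch.manual_seed(0)
prelu = PRelu()
inp = torch.randn(5, requires_grad=True)
prelu(inp).sum().backward()
print(inp.grad)          # gradient w.r.t. the input
print(prelu.alpha.grad)  # gradient w.r.t. the learnable alpha, shape (1,)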
Exponential Linear Unit (ELU)
$$\text{elu}(x) = \begin{cases} x, & x \ge 0 \\ \alpha(e^x - 1), & x < 0 \end{cases}$$
$$f'(0) = \lim_{h\to 0}\frac{f(0+h)-f(0)}{h}$$

$$\lim_{h\to0^-}\frac{\alpha (e^h - 1) - 0}{h} = \alpha, \qquad \lim_{h\to0^+}\frac{h}{h} = 1$$

So unless $\alpha = 1$, the one-sided limits disagree and $f$ is not differentiable at $x = 0$. Elsewhere,

$$f'(x) = \begin{cases} 1, & x \ge 0 \\ \alpha e^x, & x < 0 \end{cases}$$
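A quick numerical check of this derivative (a sketch, not in the original): autograd's gradient of F.elu should match $\alpha e^x$ for $x < 0$ and $1$ for $x \ge 0$:

alpha_check = 0.5
x = torch.tensor([-2.0, -0.5, 0.3, 2.0], requires_grad=True)
F.elu(x, alpha=alpha_check).sum().backward()

# closed-form derivative: alpha * exp(x) on the negative side, 1 elsewhere
expected = torch.where(x < 0, alpha_check * x.exp(), torch.ones_like(x)).detach()
print(torch.allclose(x.grad, expected))  # should print True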
An ideal activation function should satisfy two conditions:
1. its output distribution is zero-centered;
2. it saturates on one side (for negative inputs).
LeakyReLU and PReLU satisfy condition 1 but not condition 2, while ReLU satisfies condition 2 but not condition 1. An activation that satisfies both is the ELU (Exponential Linear Unit). Strictly speaking, ELU is not zero-mean either, but over a small range centered at 0 its mean tends toward 0; this also depends on the value of $\alpha$.
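As a rough illustration (a sketch, not part of the original notebook), the mean activation on standard-normal inputs is closer to 0 for ELU than for ReLU, and it shifts with $\alpha$:

torch.manual_seed(0)
x = torch.randn(10000)

print(f"relu mean:       {torch.relu(x).mean():.4f}")
print(f"leaky_relu mean: {F.leaky_relu(x, 0.1).mean():.4f}")
for a in (0.5, 1.0, 2.0):
    print(f"elu(alpha={a}) mean: {F.elu(x, alpha=a).mean():.4f}")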
class SelfDefinedElu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp, alpha):
        # alpha * exp(x) is reused by the output and equals the derivative
        # on the negative side, so keep it around for backward
        ctx.constant = alpha * inp.exp()
        ctx.save_for_backward(inp)
        return torch.where(inp < 0., ctx.constant - alpha, inp)

    @staticmethod
    def backward(ctx, grad_output):
        inp, = ctx.saved_tensors
        ones_like_inp = torch.ones_like(inp)
        # chain rule: alpha * exp(x) on the negative side, 1 elsewhere;
        # None is the grad for alpha
        return grad_output * torch.where(inp < 0., ctx.constant,
                                         ones_like_inp), None
class Elu(nn.Module):
    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        out = SelfDefinedElu.apply(x, self.alpha)
        return out
# self defined
torch.manual_seed(0)
alpha = 0.5  # chosen so the curves are easier to tell apart in the plots
elu = Elu(alpha=alpha)  # SelfDefinedElu
inp = torch.randn(5, requires_grad=True)
out = elu((inp + 1).pow(3))
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([ 1.6406e+01, 3.5275e-01, -4.0281e-01, 3.8583e+00, -3.0184e-04],
grad_fn=)
First call
tensor([1.9370e+01, 1.4977e+00, 4.0513e-01, 7.3799e+00, 1.0710e-02])
Second call
tensor([3.8740e+01, 2.9955e+00, 8.1027e-01, 1.4760e+01, 2.1419e-02])
Call after zeroing gradients
tensor([1.9370e+01, 1.4977e+00, 4.0513e-01, 7.3799e+00, 1.0710e-02])
# torch defined
torch.manual_seed(0)
inp = torch.randn(5, requires_grad=True)
out = F.elu((inp + 1).pow(3), alpha=alpha)
print(f'Out is\n{out}')
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
Out is
tensor([ 1.6406e+01, 3.5275e-01, -4.0281e-01, 3.8583e+00, -3.0184e-04],
grad_fn=)
First call
tensor([1.9370e+01, 1.4977e+00, 4.0513e-01, 7.3799e+00, 1.0710e-02])
Second call
tensor([3.8740e+01, 2.9955e+00, 8.1027e-01, 1.4760e+01, 2.1419e-02])
Call after zeroing gradients
tensor([1.9370e+01, 1.4977e+00, 4.0513e-01, 7.3799e+00, 1.0710e-02])
inp = torch.arange(-1, 1, 0.05, requires_grad=True)
out = F.elu(inp, alpha=1.2)
# out = F.relu(inp)
out.mean(), out.std()
(tensor(0.0074, grad_fn=),
tensor(0.5384, grad_fn=))
inp = torch.arange(-1, 1, 0.05, requires_grad=True)
# out = F.elu(inp, alpha=1)
out = F.relu(inp)
out.mean(), out.std()
(tensor(0.2375, grad_fn=),
tensor(0.3170, grad_fn=))
# visualization
inp = torch.arange(-8, 8, 0.05, requires_grad=True)
out = elu(inp)
out.sum().backward()
inp_grad = inp.grad
plt.plot(inp.detach().numpy(),
         out.detach().numpy(),
         label=r"$elu(x)$",
         alpha=0.7)
plt.plot(inp.detach().numpy(),
         inp_grad.numpy(),
         label=r"$elu'(x)$",
         alpha=0.5)
plt.scatter(0, 0, color='None', marker='o', edgecolors='r', s=50)
plt.grid()
plt.legend()
plt.show()