The gradient is a vector: it points in the direction in which the function increases at a given point, and its norm gives the rate of increase at that point.
Factors that affect the search process:
the initial state, the learning rate, and momentum
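A minimal sketch (my own illustration, not from the original notes) of reading a gradient with autograd and of where the learning rate and momentum enter the optimizer:
import torch

# gradient of f(p) = p[0]**2 + p[1]**2 at the point (1, 2)
p = torch.tensor([1., 2.], requires_grad=True)
f = (p ** 2).sum()
f.backward()
print(p.grad)   # tensor([2., 4.]): direction of increase; its norm is the rate of increase

# learning rate and momentum are hyper-parameters of the optimizer
optimizer = torch.optim.SGD([p], lr=0.01, momentum=0.9)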
1. torch.sigmoid, output range (0, 1)
import torch
a=torch.linspace(-100,100,10)
print(a)
print(torch.sigmoid(a))
tensor([-100.0000, -77.7778, -55.5556, -33.3333, -11.1111, 11.1111,
33.3333, 55.5556, 77.7778, 100.0000])
tensor([0.0000e+00, 1.6655e-34, 7.4564e-25, 3.3382e-15, 1.4945e-05, 9.9999e-01,
1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00])
2. tanh, output range (-1, 1); commonly used in RNNs
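The outputs below look like tanh applied to ten points in [-1, 1]; a reconstruction of the presumed snippet (my assumption, since the code itself is missing from the notes):
b=torch.linspace(-1,1,10)
print(b)
print(torch.tanh(b))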
tensor([-1.0000, -0.7778, -0.5556, -0.3333, -0.1111, 0.1111, 0.3333, 0.5556,
0.7778, 1.0000])
tensor([-0.7616, -0.6514, -0.5047, -0.3215, -0.1107, 0.1107, 0.3215, 0.5047,
0.6514, 0.7616])
3. ReLU (Rectified Linear Unit)
Commonly used in deep learning; it reduces vanishing and exploding gradients.
from torch.nn import functional as F
c=torch.linspace(-1,1,10)
print(torch.relu(c))
print(F.relu(c))
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1111, 0.3333, 0.5556, 0.7778,
1.0000])
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1111, 0.3333, 0.5556, 0.7778,
1.0000])
1. MSE (mean squared error): loss = sum((y - y_pred)**2)
Automatic differentiation: torch.autograd.grad
x=torch.ones(1)
w=torch.full([1],2,dtype=torch.float)
mse=F.mse_loss(torch.ones(1),x*w)
# make w require gradients
w=w.requires_grad_()
## rebuild the graph so it records the new requires_grad setting
mse=F.mse_loss(torch.ones(1),x*w)
f=torch.autograd.grad(mse,[w])
print(f)
(tensor([2.]),)
loss.backward
Backpropagates and computes the gradients of the loss with respect to every tensor that requires them.
By default the computation graph is freed after backward(), so a second call fails; pass retain_graph=True to keep it, but it only lasts for one extra call, so every further call needs it again.
## rebuild the graph, then use backward() instead of autograd.grad
mse=F.mse_loss(torch.ones(1),x*w)
mse.backward()
print(w.grad)
tensor([2.])
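A minimal, self-contained sketch of the retain_graph behaviour described above (my own example, not from the original notes):
import torch
from torch.nn import functional as F
x = torch.ones(1)
w = torch.full([1], 2., requires_grad=True)
mse = F.mse_loss(torch.ones(1), x * w)
mse.backward(retain_graph=True)   # keep the graph so backward() can run once more
mse.backward()                    # ok; without retain_graph above this would raise a RuntimeError
print(w.grad)                     # gradients accumulate across calls: tensor([4.])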
Gradient API
Option 1: torch.autograd.grad(loss, [w1, w2, ...]) returns the gradients directly, as (w1_grad, w2_grad, ...)
Option 2: loss.backward(), then read the accumulated gradients from w1.grad, w2.grad, ...
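A small sketch comparing the two APIs on the same scalar loss (my own example, not from the original notes):
import torch
w1 = torch.tensor(1., requires_grad=True)
w2 = torch.tensor(2., requires_grad=True)
loss = (w1 * w2) ** 2                      # d/dw1 = 2*w1*w2**2, d/dw2 = 2*w1**2*w2
g1, g2 = torch.autograd.grad(loss, [w1, w2], retain_graph=True)
print(g1, g2)                              # tensor(8.) tensor(4.)
loss.backward()
print(w1.grad, w2.grad)                    # tensor(8.) tensor(4.)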
2. Activation function: softmax (a "soft" version of max)
a = torch.rand(3)
a.requires_grad_()
print(a)
p = F.softmax(a, dim=0)
p[0].backward(retain_graph=True)  # retain_graph=True keeps the graph so backward() can be called again
print(a.grad)
p[1].backward(retain_graph=True)
print(a.grad)
p[2].backward()
print(a.grad)
# another way to compute the gradients
p = F.softmax(a, dim=0)
print(torch.autograd.grad(p[0], [a], retain_graph=True))
print(torch.autograd.grad(p[1], [a], retain_graph=True))
print(torch.autograd.grad(p[2], [a]))
tensor([0.6381, 0.8929, 0.3238], requires_grad=True)
tensor([ 0.2215, -0.1414, -0.0800])
tensor([ 0.0800, 0.1033, -0.1833])
tensor([ 0.0000e+00, -7.4506e-09, 0.0000e+00])
(tensor([ 0.2215, -0.1414, -0.0800]),)
(tensor([-0.1414, 0.2447, -0.1033]),)
(tensor([-0.0800, -0.1033, 0.1833]),)
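For reference, these rows match the softmax Jacobian, dp_i/da_j = p_i * (delta_ij - p_j): the diagonal entry (i = j) is positive, the off-diagonal entries are negative, and each row sums to zero.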
1. Single-layer perceptron with a 10-node input
import torch
from torch.nn import functional as F
x=torch.randn(1,10)
w=torch.randn(1,10,requires_grad=True)
o=torch.sigmoid(x@w.t())
print(o.shape)
torch.Size([1,1])
loss=F.mse_loss(torch.ones(1,1),o)
print(loss.shape)
loss.backward()
print(w.grad)
torch.Size([1, 1])
torch.Size([])
tensor([[-2.6664e-09, 8.2928e-09, 5.7307e-09, -3.8164e-09, -8.8355e-10,
-5.4500e-09, -2.9995e-09, -7.5397e-09, -2.8947e-09, -1.9036e-09]])
2. MLP and its gradients
a=torch.randn(1,10)
b=torch.randn(2,10,requires_grad=True)
c=torch.sigmoid(a@b.t())
print(c.shape)
torch.Size([1,2])
loss=F.mse_loss(torch.ones(1,2),c)
print(loss)
loss.backward()
print(b.grad)
torch.Size([1, 2])
tensor(0.7512, grad_fn=<MseLossBackward>)
tensor([[-0.0309, -0.2722, 0.0846, 0.0901, 0.1625, 0.0634, 0.0376, 0.0433,
-0.1486, 0.1573],
[-0.0234, -0.2058, 0.0639, 0.0681, 0.1228, 0.0479, 0.0284, 0.0327,
-0.1123, 0.1189]])
Verifying the chain rule
import torch
x=torch.tensor(1.)
w1=torch.tensor(2.,requires_grad=True)
b1=torch.tensor(1.)
w2=torch.tensor(2.,requires_grad=True)
b2=torch.tensor(1.)
y1=x*w1+b1
y2=y1*w2+b2
dy2_dy1=torch.autograd.grad(y2,[y1],retain_graph=True)[0]
dy1_dw1=torch.autograd.grad(y1,[w1],retain_graph=True)[0]
dy2_dw1=torch.autograd.grad(y2,[w1],retain_graph=True)[0]
# chain rule: dy2/dw1 = dy2/dy1 * dy1/dw1
print(dy2_dy1*dy1_dw1)
print(dy2_dw1)
tensor(2.)
tensor(2.)
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torch

def himmelblau(x):
    return (x[0]**2+x[1]-11)**2+(x[0]+x[1]**2-7)**2

x=np.arange(-6,6,0.1)
y=np.arange(-6,6,0.1)
print('x,y range:',x.shape,y.shape)
## build the coordinate grid
X,Y=np.meshgrid(x,y)
print('X,Y maps:',X.shape,Y.shape)
Z=himmelblau([X,Y])
fig=plt.figure('himmelblau')
ax=fig.gca(projection='3d')
ax.plot_surface(X,Y,Z)
ax.view_init(60,-30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
## minimize with gradient descent (the Adam optimizer), starting from (0, 0)
x=torch.tensor([0.,0.],requires_grad=True)
optimizer=torch.optim.Adam([x],lr=1e-3)
for step in range(20000):
    pred=himmelblau(x)
    optimizer.zero_grad()
    pred.backward()
    optimizer.step()
    if step%2000==0:
        print('step {}: x = {}, f(x) = {}'.format(step,x.tolist(),pred.item()))
1. Entropy: uncertainty and surprise
H(p) = -sum(p_i * log p_i). The larger the entropy, the more uniform and stable the distribution; the smaller the entropy, the more concentrated it is, so an unlikely outcome is more "surprising".
Example: cross-entropy of lottery-winning probabilities
import torch
a=torch.full([4],1/4.)
b=a*torch.log2(a)
print(b)
b=-(a*torch.log2(a)).sum()    # entropy of the uniform distribution
print(b)
c=torch.tensor([0.1,0.1,0.1,0.7])
d=-(a*torch.log2(c)).sum()    # cross-entropy between a (uniform) and c
print(d)
e=torch.tensor([0.001,0.001,0.001,0.999])
f=-(a*torch.log2(e)).sum()    # cross-entropy between a (uniform) and e
print(f)
tensor([-0.5000, -0.5000, -0.5000, -0.5000])
tensor(2.)
tensor(2.6201)
tensor(7.4747)
2. Cross-entropy
F.cross_entropy(logits,torch.tensor([3]))
F.cross_entropy must be given raw logits (it applies log-softmax internally), not probabilities.
x=torch.randn(1,784)
w=torch.randn(10,784)
logits=x@w.t()
pred=F.softmax(logits,dim=1)    # pred.shape: torch.Size([1, 10])
pred_log=torch.log(pred)
F.cross_entropy(logits,torch.tensor([3]))
# F.cross_entropy(logits, target) is equivalent to F.nll_loss(log_softmax(logits), target)
z=F.nll_loss(pred_log,torch.tensor([3]))
print(z)
tensor(34.1527)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

batch_size = 200
learning_rate = 0.01
epochs = 10

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)

w1, b1 = torch.randn(200, 784, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w2, b2 = torch.randn(200, 200, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w3, b3 = torch.randn(10, 200, requires_grad=True),\
         torch.zeros(10, requires_grad=True)

# torch.nn.init.kaiming_normal_(w1)
# torch.nn.init.kaiming_normal_(w2)
# torch.nn.init.kaiming_normal_(w3)

def forward(x):
    x = x@w1.t() + b1
    x = F.relu(x)
    x = x@w2.t() + b2
    x = F.relu(x)
    x = x@w3.t() + b3
    x = F.relu(x)
    return x

optimizer = optim.SGD([w1, b1, w2, b2, w3, b3], lr=learning_rate)
criteon = nn.CrossEntropyLoss()

for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        logits = forward(data)
        loss = criteon(logits, target)

        optimizer.zero_grad()
        loss.backward()
        # print(w1.grad.norm(), w2.grad.norm())
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = forward(data)
        test_loss += criteon(logits, target).item()

        pred = logits.data.max(1)[1]
        correct += pred.eq(target.data).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
Step 1
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
Step 2
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        ## nn.Sequential acts as a container; any module inheriting from nn.Module can be added to it
        self.model=nn.Sequential(
            nn.Linear(784,200),
            nn.ReLU(inplace=True),
            nn.Linear(200,200),
            nn.ReLU(inplace=True),
            nn.Linear(200,10),
            nn.ReLU(inplace=True),
        )
Step 3
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        ## nn.Sequential acts as a container; any module inheriting from nn.Module can be added to it
        self.model=nn.Sequential(
            nn.Linear(784,200),
            nn.ReLU(inplace=True),
            nn.Linear(200,200),
            nn.ReLU(inplace=True),
            nn.Linear(200,10),
            nn.ReLU(inplace=True),
        )

    def forward(self,x):
        x=self.model(x)
        return x
Train
net=MLP()
optimizer=optim.SGD(net.parameters(),lr=learning_rate)
criteon=nn.CrossEntropyLoss()

for epoch in range(epochs):
    for batch_idx,(data,target) in enumerate(train_loader):
        data=data.view(-1,28*28)
        logits=net(data)
        loss=criteon(logits,target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Activation functions (a short usage sketch follows this list):
Tanh
Sigmoid
ReLU
Leaky ReLU
SELU (rarely used)
Softplus (rarely used)
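A minimal sketch (my own example, not from the original notes) of how these activations are called in PyTorch; negative_slope is shown with its default value:
import torch
from torch.nn import functional as F
x=torch.linspace(-2,2,5)
print(torch.tanh(x))
print(torch.sigmoid(x))
print(F.relu(x))
print(F.leaky_relu(x,negative_slope=0.01))   # small slope for negative inputs
print(F.selu(x))
print(F.softplus(x))                         # smooth approximation of ReLU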
GPU acceleration
Use .to(device) to move tensors and modules from the CPU to the GPU.
device=torch.device('cuda:0')
## device specifies the target device
net=MLP().to(device)
optimizer=optim.SGD(net.parameters(),lr=learning_rate)
criteon=nn.CrossEntropyLoss().to(device)
for epoch in range(epochs):
    for batch_idx,(data,target) in enumerate(train_loader):
        data=data.view(-1,28*28)
        data,target=data.to(device),target.to(device)
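The snippet above stops after moving the batch onto the GPU; the full loop with a completed body (my own sketch, reusing net, criteon, optimizer, and train_loader defined above):
for epoch in range(epochs):
    for batch_idx,(data,target) in enumerate(train_loader):
        data=data.view(-1,28*28)
        data,target=data.to(device),target.to(device)
        logits=net(data)
        loss=criteon(logits,target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()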
MNIST test in practice
argmax
argmin
argmax / argmin return the index of the maximum / minimum value, which serves as the predicted label:
logits.argmax(dim=1)
max, by contrast, returns the value itself (the predicted probability), not the label.
logits=torch.rand(4,10)
pred=F.softmax(logits,dim=1)
print(pred.shape)
pred_label=pred.argmax(dim=1)
print(pred_label)
print(logits.argmax(dim=1))
label=torch.tensor([9,3,2,4])
correct=torch.eq(pred_label,label)
print(correct)
print(correct.sum().float().item()/4)
torch.Size([4, 10])
tensor([4, 7, 0, 4])
tensor([4, 7, 0, 4])
tensor([False, False, False, True])
0.25
When to test (a sketch follows this list):
test once every several batches
test once per epoch
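A minimal sketch of testing every few batches inside the training loop; test_every is a hypothetical parameter name, and net, criteon, optimizer and the loaders are reused from above (my own illustration, not from the original notes):
test_every = 100   # hypothetical interval: evaluate every 100 training batches
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        loss = criteon(net(data), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % test_every == 0:
            correct = 0
            for test_data, test_target in test_loader:
                pred = net(test_data.view(-1, 28*28)).argmax(dim=1)
                correct += pred.eq(test_target).sum().item()
            print('accuracy:', correct / len(test_loader.dataset))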
Visdom visualization
from visdom import Visdom
## create a Visdom instance
viz=Visdom()
## create a line plot; win is the window ID (here 'train_loss'), env is another ID that selects the environment
viz.line([0.],[0.],win='train_loss',opts=dict(title='train loss'))
## update='append' adds a new point to the curve
viz.line([loss.item()],[global_step],win='train_loss',update='append')
## plot several curves in one window
viz=Visdom()
viz.line([[1.1,0.0]],[0],win='test',opts=dict(title='test loss&acc.',legend=['loss','acc.']))
viz.line([[test_loss,correct/len(test_loader.dataset)]],[global_step],win='test',update='append')
## visualize images and text in Visdom (the input X and the predictions)
viz=Visdom()
viz.images(data.view(-1,1,28,28),win='x')
viz.text(str(pred.detach().cpu().numpy()),win='pred',opts=dict(title='pred'))