pytorch学习

1.pytorch环境配置(docker)

  • docker环境--配置过程(配置成功--能使用nvidia-docker命令)
  • 拉取pytorch镜像--参考网站(命令--docker pull nvcr.io/nvidia/pytorch:19.01-py3)
  • 启动docker容器--参考网站(命令--sudo nvidia-docker run -it --rm -v local_dir:container_dir nvcr.io/nvidia/pytorch:19.01-py3)

2. pytorch中tensor的基本操作

参考网站:https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#sphx-glr-beginner-blitz-tensor-tutorial-py

2.1 pytorch测试

# coding=utf-8
# 这是一个用于练习的文档
from __future__ import print_function
import torch
# Tensor creation basics: random, uninitialized, zero-filled and literal tensors.
x = torch.rand(5,3)
print("x={}".format(x)) # 5 rows x 3 columns of random numbers drawn uniformly from [0, 1)
print(torch.empty(5,3)) # 5x3 tensor whose memory is left uninitialized
print(torch.zeros(5,3,dtype=torch.long)) # 5x3 tensor of zeros with dtype torch.long
print(torch.tensor([5.5,3])) # tensor built directly from a Python list

# new_ones builds a tensor of ones from an existing tensor; the dtype is overridden to double here.
x=x.new_ones(5,3,dtype=torch.double)
print("x={}".format(x))
# rand_like returns a tensor with the same size as its input, filled with
# random numbers from a UNIFORM distribution on [0, 1). The normal-distribution
# (mean 0, variance 1) counterpart is torch.randn_like. The dtype is overridden
# to float here.
x=torch.rand_like(x,dtype=torch.float)
print("x={}".format(x))
print(x.dtype)
print(x.size())

2.2 tensor加法

# tensor加法(pytorch)
y=(torch.rand(5,3))
print("y={}".format(y))
print("x+y={}".format(x+y)) #相加就是对应位置值相加,也可以用torch.add(x,y)
print("torch.add(x,y)={}".format(torch.add(x,y)))

result=torch.empty(5,3)
torch.add(x,y,out=result)
print("result={}".format(result)) #带输出的加法,result必须是与x,y相同的类型

2.3  in-place方法

# in-place方法,即不添加多余变量,直接内部放置
y.add_(x)
print("y={}".format(y)) # y=y+x
# pytorch都可以通过添加_实现in-place操作,如x.copy_(y),x.t_()
print("x={}".format(x))
x.copy_(y)
print("x={}".format(x))
x.t_()
print("x={}".format(x)) # x转置赋给x

# tensor resize/reshape(torch.view)
x = torch.rand(4,4)
print("x={}".format(x))
y = x.view(16)
print("y=x.view(16)={}".format(y))
print("x.view(-1,8)={}".format(x.view(-1,8))) #负数索引一般表示从右(最后一个元素为-1)往左数,这里的-1解释为:the size -1 is inferred from other dimensions
print("x.view(-1,4)={}".format(x.view(-1,4))) #进一步验证:-1表示从另一个维度推算,another_dim=8,-1表示2; another_dim=4,-1表示4
#print("x.view(-1,5)={}".format(x.view(-1,5)))#RuntimeError: shape '[-1, 5]' is invalid for input of size 16
print("x.view(2,8)={}".format(x.view(2,8)))
print("x.view(8,2)={}".format(x.view(8,2)))

2.4 tensor resize/reshape 

# tensor resize/reshape(torch.view)
x = torch.rand(4,4)
print("x={}".format(x))
y = x.view(16)
print("y=x.view(16)={}".format(y))
print("x.view(-1,8)={}".format(x.view(-1,8))) #负数索引一般表示从右(最后一个元素为-1)往左数,这里的-1解释为:the size -1 is inferred from other dimensions
print("x.view(-1,4)={}".format(x.view(-1,4))) #进一步验证:-1表示从另一个维度推算,another_dim=8,-1表示2; another_dim=4,-1表示4
#print("x.view(-1,5)={}".format(x.view(-1,5)))#RuntimeError: shape '[-1, 5]' is invalid for input of size 16
print("x.view(2,8)={}".format(x.view(2,8)))
print("x.view(8,2)={}".format(x.view(8,2)))

2.5 获取tensor某一element的值

# 获取某一个元素的值
print("x[1][1]={}".format(x[1][1]))
print("x[1][1].item()={}".format(x[1][1].item()))

x=torch.randn(1)
print("x={}".format(x))
print("x.item()={}".format(x.item()))
print("x[0].item()={}".format(x[0].item()))

2.6 转换Torch tensor到numpy

# 转换Torch tensor到numpy
a = torch.ones(6)
print("a={},type is {}".format(a,type(a)))
b = a.numpy()
print("b=a.numpy()={},type is {}".format(b,type(b)))

# numpy中的加法:改变numpy的值,只需要改变Torch tensor对应的值即可
#b.add_(2)#AttributeError: 'numpy.ndarray' object has no attribute 'add_'
print("b+2={}".format(b+2))
a.add_(3)
print("a={},type is {}".format(a,type(a)))
print("b=a.numpy()={},type is {}".format(b,type(b)))

2.7 转换numpy到Torch tensor

# 转换numpy到Torch tensor
import numpy as np
a = np.ones(2)
print("a={},type is {}".format(a,type(a)))
b = torch.from_numpy(a)
print("b=torch.from_numpy(a)={},type is {}".format(b,type(b)))

# 改变numpy,会改变Torch tensor的值吗?经过验证,答案是“会”
np.add(a,1,out=a) # out --- A location into which the result is stored
print("a={},type is {}".format(a,type(a)))
print("b={},type is {}".format(b,type(b)))

2.8 tensor传入GPU

# pytorch cuda tensors
if torch.cuda.is_available():
    device = torch.device("cuda")
    y = torch.ones_like(x,device=device) #直接创建一个与x大小相同的tensor,放于GPU上
    print("\nx={},dtype is {}".format(x,x.dtype))
    print("y=torch.ones_like(x,device=device)={},dtype is {}".format(y,y.dtype))
    #z = x+y #RuntimeError: expected type torch.FloatTensor but got torch.cuda.FloatTensor一个在GPU上,一个在CPU上,无法运算
    x = x.to(device) # 将x传到GPU上
    print("x=x.to(device)={},dtype is {}".format(x,x.dtype))
    z = x + y
    print("z=x+y={},dtype is {}".format(z,z.dtype))

3.pytorch反向传播

3.1 requires_grad和grad_fn

# coding=utf-8
import torch
# Build a small computation graph starting from a tensor that tracks gradients.
x = torch.ones(2,2,requires_grad=True)
print("x={}".format(x))
y = x + 3
print("y={},y.grad_fn={}".format(y,y.grad_fn)) # y was produced by an addition, so grad_fn prints as an add node, e.g. <AddBackward0 ...> (exact name depends on the PyTorch version)
z = x * y * 4
print("z={},z.grad_fn={}".format(z,z.grad_fn)) # z was produced by a multiplication, so grad_fn prints as a mul node, e.g. <MulBackward0 ...> (exact name depends on the PyTorch version)

3.2 反向传播backward(标量对向量求导)

要实现反向传播,求偏导的自变量(tensor) 必须满足条件:a.requires_grad=True(以下述snippet为例)

# coding=utf-8
import torch

a = torch.randn(2,2)
a = (a*3/(a-1))
print("a.requires_grad is {}".format(a.requires_grad))  #The input flag defaults to False if not given默认Torch tensor是不需要梯度的,即a.requires_grad=False

#a.requires_grad_(True) #验证求偏导的自变量(tensor)的requires_grad=True,否则无法反向传播

b = a.sum()
print("b=a.sum()={},b.grad_fn is {}".format(b,b.grad_fn))
# 反向传播backward
#b.backward()  # out.backward() is equivalent to out.backward(torch.tensor(1.)) a.requires_grad=False 报错--RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

正确例子如下:

c = \frac{1}{4}\sum a_{i},因此,\frac{\partial c}{\partial a}=\begin{bmatrix} \frac{\partial c}{\partial a_{0}}& \frac{\partial c}{\partial a_{1}}\\ \frac{\partial c}{\partial a_{2}}& \frac{\partial c}{\partial a_{3}} \end{bmatrix}=\begin{bmatrix} 0.25& 0.25\\ 0.25& 0.25 \end{bmatrix}

# coding=utf-8
import torch

# A freshly created tensor does not track gradients: requires_grad defaults
# to False when the flag is not given.
a = torch.randn(2, 2)
a = a * 3 / (a - 1)
print("a.requires_grad is {}".format(a.requires_grad))

# Autograd can only differentiate with respect to tensors whose requires_grad
# flag is True, so switch it on in place before building the graph.
a.requires_grad_(True)
print("a.requires_grad is {}".format(a.requires_grad))
print("\na={}".format(a))

# Derivative of a scalar with respect to a tensor: c = mean(a) over 4 entries,
# so every entry of dc/da is 1/4.
c = a.mean()
print("c=a.mean()={}".format(c))
c.backward()
print("a.grad={}".format(a.grad))  # a.grad=tensor([[0.2500, 0.2500],[0.2500, 0.2500]])

# A second backward() ACCUMULATES into a.grad: db/da is 1 everywhere and is
# added on top of the 0.25 already stored there.
b = a.sum()
b.backward()
print("a.grad={}".format(a.grad))  # a.grad=tensor([[1.2500, 1.2500],[1.2500, 1.2500]]) -- accumulated!

如果多次使用backward(),则a.grad会被累加运算!

3.3 范数求解

# 范数求解
x = torch.randn(3,requires_grad=True)
y = x * 2
while y.data.norm()<10: #这里默认求解2-范数,也可以通过torch.norm(y,2)求解2-范数
    y = y * 2
print("\ny={},y.data={},y.data.norm()={},torch.norm(y,2)={}".format(y,y.data,y.data.norm(),torch.norm(y,2)))

3.4 反向传播backward(向量对向量求导)

理论公式推导可参考矩阵求导

x=(x_{1},x_{2},x_{3}),y=(y_{1},y_{2},y_{3}),\frac{\partial y^{T}}{\partial x}=J^{T}=\bigl(\begin{smallmatrix} \frac{\partial y_{1}}{\partial x_{1}}& \frac{\partial y_{2}}{\partial x_{1}}& \frac{\partial y_{3}}{\partial x_{1}}\\ \frac{\partial y_{1}}{\partial x_{2}}& \frac{\partial y_{2}}{\partial x_{2}}&\frac{\partial y_{3}}{\partial x_{2}} \\ \frac{\partial y_{1}}{\partial x_{3}}& \frac{\partial y_{2}}{\partial x_{3}}& \frac{\partial y_{3}}{\partial x_{3}} \end{smallmatrix}\bigr)

'''
# 向量(vector)对向量(vector)求偏导:数学上1*3的vector对1*3的vector求导会得到一个3*3的Jacobian矩阵;但backward()并不直接返回这个矩阵,而是需要传入一个向量v,计算的是vector-Jacobian product(即J^T·v)
# 可参考https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
'''
# Weight vector for the vector-Jacobian product computed by y.backward(v).
v = torch.tensor([1.2,0.4,0.003],dtype=torch.float)
# y.backward()  # the argument-free form is only for scalar outputs (derivative of a scalar);
#               # here it raises RuntimeError: grad can be implicitly created only for scalar outputs
y.backward(v) # y is not a scalar, so a tensor with y's shape is passed in; x.grad then receives J^T * v
print("x.grad={}".format(x.grad))

上述程序得到y=x*8,求导所得\frac{\partial y^{T}}{\partial x}=J^{T}=\bigl(\begin{smallmatrix} 8& 0& 0\\ 0& 8&0\\ 0& 0& 8 \end{smallmatrix}\bigr)

x=tensor([ 1.3471, -0.0893, -0.7166], requires_grad=True)

y=tensor([10.7771, -0.7143, -5.7330], grad_fn=<MulBackward0>)

x.grad=tensor([9.6000, 3.2000, 0.0240])

3.5 停止自动求导

# 停止自动求导运算
print("(x**2).requires_grad={}".format((x ** 2).requires_grad)) # True
with torch.no_grad():
    print("(x**2).requires_grad={}".format((x ** 2).requires_grad)) # False
print("(x**2).requires_grad={}".format((x ** 2).requires_grad)) # True

4.神经网络

公式推导可以参考神经网络

4.1 基本要求

A typical training procedure for a neural network is as follows:

  • Define the neural network that has some learnable parameters (or weights)  定义神经网络(拥有一些可学习的参数)
  • Iterate over a dataset of inputs  在输入数据集上进行迭代
  • Process input through the network  通过网络处理输入
  • Compute the loss (how far is the output from being correct)  计算损失
  • Propagate gradients back into the network’s parameters  反向传播梯度给网络参数
  • Update the weights of the network, typically using a simple update rule: weight = weight -learning_rate * gradient 更新网络权重

4.2 实现步骤

  • 定义神经网络:
# coding=utf-8
'''
卷积、全连接在torch.nn,池化在torch.nn.functional
'''
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small LeNet-style CNN for single-channel 32x32 inputs.

    Layout: two (2x2 convolution -> ReLU -> 2x2 max-pool) stages followed by
    three fully connected layers that end in 3 output values.

    With a 1x32x32 input the spatial size evolves as
    32 -> 31 (conv1) -> 15 (pool) -> 14 (conv2) -> 7 (pool), which is why the
    first fully connected layer expects 3 * 7 * 7 input features.
    """

    def __init__(self):
        super(Net, self).__init__()
        # in_channels=1, out_channels=3, kernel_size=2x2; the remaining
        # arguments keep their defaults (stride=1, padding=0, dilation=1,
        # groups=1, bias=True).
        self.conv1 = nn.Conv2d(1, 3, 2)
        self.conv2 = nn.Conv2d(3, 3, 2)
        # Fully connected layers. conv2 + pooling leave 3 channels of 7x7
        # feature maps, i.e. 3*7*7 values feed the first linear layer.
        self.fc1 = nn.Linear(3 * 7 * 7, 5)
        self.fc2 = nn.Linear(5, 4)
        self.fc3 = nn.Linear(4, 3)
        # Larger configuration adapted from the official tutorial, kept for
        # reference:
        # self.conv1 = nn.Conv2d(1, 6, 5)      # in_channels=1, out_channels=6, kernel_size=5x5
        # self.conv2 = nn.Conv2d(6, 16, 5)
        # self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 16 channels of 5x5 feature maps
        # self.fc2 = nn.Linear(120, 84)
        # self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        """Run the network on a mini-batch and return the fc3 output.

        Args:
            x: input tensor laid out as (nSamples, nChannels, Height, Width).

        Returns:
            Tensor with one row of 3 values per sample.
        """
        # 1. convolution, 2. ReLU, 3. 2x2 max pooling.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A single number is enough when the pooling window is square.
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten every non-batch dimension so the linear layers can consume it.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # The result must be returned, otherwise net(input) evaluates to None.
        return x

    def num_flat_features(self, x):
        """Return the number of values per sample (product of all dims except the batch dim)."""
        size = x.size()[1:]  # every dimension except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
  • 查看网络:
net = Net() # 网络实例化
print("net:{}".format(net)) # 查看网络构成
param = list(net.parameters())
print("length:{}\nparam[0].size()={}\nparam[1].size()={}\nparam[2].size()={}\nparam[3].size()={}\nparam[4].size()={}\nparam[5].size()={}\n".format(len(param),param[0].size(),param[1].size(),param[2].size(),param[3].size(),param[4].size(),param[5].size())) # 这里的length:10,因为每一个tensor都有weight和bias
  • 数据输入网络:
'''
# input--The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample
# 就是需要实现对齐,举个例子:a single sample 可能就是3*32*32(nChannels*Height*Width),a mini-batch of samples就是1*3*32*32(nSamples*nChannels*Height*Width))
# 如果输入是a single sample,则需要转换为a mini-batch of samples(虚构一个nSamples的维度)
'''
input = torch.randn(1,1,32,32)
print("input=torch.randn(1,1,32,32)={}".format(input))
out = net(input) # 自动执行forward函数
print("out=net(input)={},size={}".format(out,out.size()))
  • 计算损失:
net.zero_grad() # Zero the gradient buffers of all parameters将所有梯度设置为0,Sets gradients of all model parameters to zero.

#out.backward(torch.randn(1,3),retain_graph=True)

target = torch.tensor([0.8,0.1,0.1],dtype=torch.float)
print("target变换前:{},size={}".format(target,target.size()))
target = target.view(1,-1) # 需要将target转换成与out相同维度
print("target变换后={},size={}".format(target,target.size()))

# loss function(mean-squared error)使用类nn.MSELoss
mse_loss = nn.MSELoss()    #1/3*((y0-t0)^2+(y1-t1)^2+(y2-t2)^2),其中y为fc3层计算输出,t为目标标签
#loss = mse_loss(target,out) #loss.grad_fn:这里必须是out,target的顺序
loss = mse_loss(out,target) #loss.grad_fn:这里必须是out,target的顺序
print("loss={}\nloss.grad_fn:{}\nloss.grad_fn.next_functions[0][0]={}\nloss.grad_fn.next_functions[0][0].next_functions[0][0]={}".format(loss,loss.grad_fn,loss.grad_fn.next_functions[0][0],loss.grad_fn.next_functions[0][0].next_functions[0][0]))
  • 反向传播求梯度:
# backprop
print("\nbefore zero_grad---net.conv1.bias.grad={}".format(net.conv1.bias.grad))

net.zero_grad()
print("\nbefore backprop---net.conv1.bias.grad={}".format(net.conv1.bias.grad))
'''
# pytorch构建的一个graph中,只能进行一次backward,如果上述过程已经使用过一次,则会报错:RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.
'''
loss.backward() # 如果前面使用过backward,则前面的backward添加retain_graph=True,即可
print("\nafter backprop---net.conv1.bias.grad={}".format(net.conv1.bias.grad))
  • 两种迭代方式(更新weight和bias):
# a simple implementing method 试一下多次迭代!!!!Method 1
learning_rate = 0.01
iter_count = 0
while loss>0.00001:
    net.zero_grad()
    output = net(input)
    loss = mse_loss(output,target)
    loss.backward()
    for f in net.parameters():
        f.data.sub_(f.grad.data * learning_rate)
    iter_count = iter_count + 1
    if iter_count%30 == 0:
        print("第{}次迭代,loss:{}".format(iter_count,loss))

'''
# torch.optim优化,试一下多次迭代!!!!             Method 2
iter_count = 0
import torch.optim as optim
while loss>0.000000001:
    opt = optim.SGD(net.parameters(),lr=0.01)
    opt.zero_grad() # 每一次迭代都需要将梯度缓存改为0,否则会导致梯度叠加问题
    output = net(input)
    loss = mse_loss(output,target)
    loss.backward()
    opt.step()
    iter_count = iter_count + 1
    if iter_count%30 == 0:
        print("第{}次迭代,loss:{}".format(iter_count,loss))
'''

5.分类网络(CIFAR10)

5.1 基本步骤

  • Load and normalizing the CIFAR10 training and test datasets using torchvision 下载并载入cifar10的数据
  • Define a Convolutional Neural Network  定义分类网络
  • Define a loss function   定义损失函数
  • Train the network on the training data  训练数据
  • Test the network on the test data  测试

5.2 实现方法

  • 下载cifar10数据:
# coding=utf-8
import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))]) 
'''
#Compose组合tensor到一起,transforms.ToTensor()--转换一个PIL图像到tensor,Convert a PIL Image or numpy.ndarray to tensor;
#transforms.Normalize(mean,std)--规范化一个tensor图像,input[channel] = (input[channel] - mean[channel]) / std[channel]
'''
traindatasets = torchvision.datasets.CIFAR10(root="./data",train=True,download=False,transform=transform) 
#从root目录读取,download=True则先下载再读取;train=True则表示读取train数据集,否则读取test数据集;按照transform设定的方式读取返回至traindatasets
trainloader = torch.utils.data.DataLoader(traindatasets,batch_size=4,shuffle=True,num_workers=2) #num_workers--用于数据载入的subprocesses数量

testdatasets = torchvision.datasets.CIFAR10(root="./data",train=False,download=False,transform=transform)
testLoader = torch.utils.data.DataLoader(testdatasets,batch_size=4,shuffle=True,num_workers=2)

classes = ("plane","car","bird","cat","deer","dog","frog","horse","ship","truck")

import matplotlib.pyplot as plt
import numpy as np
def imshow(img):
    """Un-normalise an image tensor, draw it with matplotlib and save it as "1.jpg".

    Args:
        img: image tensor; assumed to be channels-first (C, H, W) and
            normalised with mean 0.5 / std 0.5 per channel, as done by the
            transform used when loading the dataset.
    """
    # Loading applied (value - mean) / std, so value * std + mean restores it;
    # with mean = std = 0.5 that is value / 2 + 0.5.
    restored = img / 2 + 0.5
    # matplotlib wants (H, W, C), hence the axis permutation on the numpy copy.
    pixels = np.transpose(restored.numpy(), (1, 2, 0))
    plt.imshow(pixels)
    plt.savefig("1.jpg")
#dataiter = iter(trainloader)
#images,labels = dataiter.next()
#
##imshow(torchvision.utils.make_grid(images)) # make a grid of images图像网格,images是一个tensor,所以imshow函数里面需要转换为numpy格式的
#print(' '.join("%5s"% classes[labels[i]] for i in range(4)))
  • 定义分类网络:
traindatasets = torchvision.datasets.CIFAR10(root="./data",train=True,download=False,transform=transform) 
#从root目录读取,download=True则先下载再读取;train=True则表示读取train数据集,否则读取test数据集;按照transform设定的方式读取返回至traindatasets
trainloader = torch.utils.data.DataLoader(traindatasets,batch_size=1,shuffle=True,num_workers=2) #num_workers--用于数据载入的subprocesses数量

testdatasets = torchvision.datasets.CIFAR10(root="./data",train=False,download=False,transform=transform)
testloader = torch.utils.data.DataLoader(testdatasets,batch_size=4,shuffle=True,num_workers=2)

classes = ("plane","car","bird","cat","deer","dog","frog","horse","ship","truck")
import torch.nn as nn
import torch.nn.functional as F

class ClassifyNet(nn.Module):
    """CIFAR10 classifier: two 3x3 convolutions, one 2x2 max-pool, three linear layers.

    For a 3x32x32 image the feature map goes 32 -> 30 (conv1) -> 28 (conv2)
    -> 14 (pool) with 10 channels, so each sample is flattened to
    10 * 14 * 14 = 1960 values (the batch dimension is not part of this count).
    """

    def __init__(self):
        super(ClassifyNet, self).__init__()
        # Conv2d(in_channels, out_channels, kernel_size): 3 -> 6 channels, 3x3 kernel.
        self.conv1 = nn.Conv2d(3, 6, 3)
        # 6 -> 10 channels, 3x3 kernel.
        self.conv2 = nn.Conv2d(6, 10, 3)
        # 2x2 window with stride 2 halves height and width.
        self.pool = nn.MaxPool2d(2, 2)
        # Linear(in_features, out_features, bias=True); 1960 = channels * height * width.
        self.fc1 = nn.Linear(1960, 120)
        self.fc2 = nn.Linear(120, 60)
        self.fc3 = nn.Linear(60, 10)

    def forward(self, x):
        """Return one row of 10 class scores per input sample."""
        features = F.relu(self.conv1(x))
        features = self.pool(F.relu(self.conv2(features)))
        # Flatten the convolutional feature maps before the linear layers.
        flat = features.view(-1, 1960)
        hidden = self.fc2(self.fc1(flat))
        return self.fc3(hidden)

①这里重新载入数据集,采用batch_size=1,即训练过程保持单张图训练,速度慢。但学习阶段,需要慢慢搞懂每一步!

②fc1的定义需要计算图像计算到这一层的[batchsize,channels,height,width],然后设置当前层的in_features,即输入神经元个数。

③前向传播forward计算到fc1时,需要平铺卷积高维torch tensors。

  • 定义损失函数:

net = ClassifyNet() #实例化网络
print(net)

# define loss
loss_cross = nn.CrossEntropyLoss()
import torch.optim as optim
opt = optim.SGD(net.parameters(),lr=0.001)
running_loss = 0
  • 训练分类网络(利用cifar10):
# train step
# One pass over every mini-batch in trainloader. enumerate(..., 0) yields the
# batch index starting at 0 together with the (inputs, labels) pair.
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
    inputs, labels = data
    opt.zero_grad()  # clear gradients left over from the previous step
    outputs = net(inputs)
    loss = loss_cross(outputs, labels)
    loss.backward()
    opt.step()
    running_loss += loss.item()
    if i % 2000 == 1999:
        print("第{}次迭代,loss:{}".format(i+1,running_loss/2000))
        # Reset only after reporting, so the printed value really is the
        # average over the last 2000 iterations (resetting on every iteration
        # would print just the final loss divided by 2000).
        running_loss = 0.0

print("Finish Training")

训练图片50000张,迭代50000次(注意:下面这组数值是在running_loss每次迭代都被清零的情况下打印的,所以它实际上是"第N次迭代那一次的loss/2000",并不是最近2000次迭代的平均loss):

……
第46000次迭代,loss:0.000776898443698883
第48000次迭代,loss:0.001786381721496582
第50000次迭代,loss:0.0005364646911621094

  • 测试分类网络:
# test step
# Pull one mini-batch from the test loader. The builtin next() works on any
# iterator, unlike the Python 2 style .next() method.
testdataiter = iter(testloader)
images, labels = next(testdataiter)
imshow(torchvision.utils.make_grid(images))
# Iterate over the labels themselves so the printout follows the loader's batch size.
print("GT:",' '.join("%5s"% classes[label] for label in labels))

# One row of class scores per image in the batch.
outputs = net(images)
# torch.max over dimension 1 returns (max values, indices); the index of the
# largest score in each row is the predicted class.
_, predicts = torch.max(outputs,1)
print(predicts)
print("Predicts:",' '.join("%5s"% classes[predict] for predict in predicts))

这里仅仅测试了4张图,预测都是正确的。

GT:   car  bird  frog   dog

Predicts:   car  bird  frog   dog

在测试集上测试:(正确率0.4979)

# test on testdatasets
correct = 0
total = 0
with torch.no_grad(): # 不需要求解梯度
    for i,data in enumerate(testloader,0):
        images,labels = data
        outputs = net(images)
        _, predicts = torch.max(outputs,1)
        total += labels.size(0)
        correct = correct + (predicts == labels).sum().item() #predicts与labels相同则为1,求和即得正确预测的个数
print("The accuracy of classifyNet on {} test images:{}".format(total,correct/total))
  • 在GPU上训练:
# train step
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:{}".format(device))
# (1) Move the network to the device once, before the loop; doing it on every
# iteration is redundant.
net.to(device)
running_loss = 0.0
# One pass over every mini-batch in trainloader; the batch index starts at 0.
for i, data in enumerate(trainloader, 0):
    inputs, labels = data
    # (2) Move this batch to the same device as the network.
    inputs, labels = inputs.to(device), labels.to(device)
    opt.zero_grad()  # clear gradients left over from the previous step
    outputs = net(inputs)
    loss = loss_cross(outputs, labels)
    loss.backward()
    opt.step()
    running_loss += loss.item()
    if i % 2000 == 1999:
        print("第{}次迭代,loss:{}".format(i+1,running_loss/2000))
        # Reset only after reporting, so the printed value is the average
        # over the last 2000 iterations.
        running_loss = 0.0

print("Finish Training")

①将网络net放到GPU上;

②将需要训练的数据放到GPU上。

在GPU上训练所需时间:----real    4m8.954s----user    5m22.688s----sys    0m49.932s----

在CPU上训练所需时间:----real    2m15.048s----user    9m12.332s----sys    15m32.658s----

CPU更快!!!奇怪不奇怪!!!官网解释:Why dont I notice MASSIVE speedup compared to CPU? Because your network is realllly small.

 

6.分类网络(自定义数据)

6.1 数据构成

数据来源:ImageNet

数据类别:dog 和 cat

数据放置:train文件夹下放2个文件夹(cat 和 dog),每个文件夹分别放各自的图片。val文件夹做同样的操作。但是train和val中放置的图片一般不能有重复的图片。

6.2 数据读取

参考pytorch官网(github)给出的一个例子,数据会被很规范的读入,类似CIFAR10一样,train和val下面的文件夹名字自然会被分为0和1两类:

# Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
两种数据集的对比(前者:CIFAR10,后者:myDatasets)

function:traindatasets = torchvision.datasets.CIFAR10() | mytraindatasets = torchvision.datasets.ImageFolder()

pytorch structure(分别打印两个数据集对象的结果):

Dataset CIFAR10
    Number of datapoints: 50000
    Split: train
    Root Location: ./data
    Transforms (if any): Compose(
                             ToTensor()
                             Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
                         )
    Target Transforms (if any): None

Dataset ImageFolder
    Number of datapoints: 1876
    Root Location: ./data/mydatasets/datasets/train
    Transforms (if any): Compose(
                             RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=PIL.Image.BILINEAR)
                             RandomHorizontalFlip(p=0.5)
                             ToTensor()
                             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                         )
    Target Transforms (if any): None
DataLoader

torch.utils.data.DataLoader(mytraindatasets, batch_size=1, shuffle=True, num_workers=0)

官网说法:Combines a dataset and a sampler, and provides single- or multi-process iterators over the dataset.

获取其中一个数据traindatasets.__getitem__(0)

矩阵是图像,6是类别标签

(tensor([[[-0.5373, -0.6627, -0.6078,  ...,  0.2392,  0.1922,  0.1608],
         [-0.8745, -1.0000, -0.8588,  ..., -0.0353, -0.0667, -0.0431],
         [-0.8039, -0.8745, -0.6157,  ..., -0.0745, -0.0588, -0.1451],
         ...,
         [-0.2471, -0.7333, -0.7961,  ..., -0.4510, -0.9451, -0.8431],
         [-0.2471, -0.6706, -0.7647,  ..., -0.2627, -0.7333, -0.7333],
         [-0.0902, -0.2627, -0.3176,  ...,  0.0980, -0.3412, -0.4353]]]), 6)

7. pytorch网络可视化(docker下的tensorboard)

  • 安装自然很简单:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorflow tensorboard tensorboardX
  • TensorBoard运行在docker容器下:

因为TensorBoard默认运行端口在6006,如果在docker下直接运行,则使用浏览器访问TensorBoard时,无法访问docker容器下的TensorBoard服务器,只能访问主机的TensorBoard。因此需要把docker容器的6006端口映射到主机,进而访问主机的TensorBoard服务器时,间接访问docker容器的6006端口服务资源。(-p 6006:6006)

 sudo nvidia-docker run --rm -it -v /media/lab/873821cf-d234-44cf-bd63-4372eac823a1/pytorch/:/home/pytorch -p 6006:6006 pytorch:v0 bash
  • TensorBoard网络可视化构建(代码):
# coding=utf-8
import torch
import torchvision
import torchvision.transforms as transforms
from visualization import visualize

import torch.nn as nn
import torch.nn.functional as F

class ClassifyNet(nn.Module):
    """Two-class network used for the TensorBoard graph demo.

    For a 3x224x224 image the feature map goes 224 -> 222 (3x3 conv) -> 111
    (2x2 pool) with 4 channels, so each sample is flattened to
    4 * 111 * 111 = 49284 values.
    """

    def __init__(self):
        super(ClassifyNet, self).__init__()
        # Conv2d(in_channels, out_channels, kernel_size): 3 -> 4 channels, 3x3 kernel.
        self.conv1 = nn.Conv2d(3, 4, 3)
        # 2x2 window with stride 2 halves height and width.
        self.pool = nn.MaxPool2d(2, 2)
        # 49284 = channels * height * width = 4 * 111 * 111.
        self.fc1 = nn.Linear(49284, 60)
        self.fc2 = nn.Linear(60, 2)

    def forward(self, x):
        """Return one row of 2 class scores per input sample."""
        pooled = self.pool(F.relu(self.conv1(x)))
        # Flatten the feature maps before the linear layers.
        flat = pooled.view(-1, 49284)
        return self.fc2(self.fc1(flat))

net = ClassifyNet()
print(net)

'''
visualization
method:tensorboardX
'''

from tensorboardX import SummaryWriter
with SummaryWriter(comment="Net") as w:
    w.add_graph(net,(torch.rand(1,3,224,224),))

程序运行之后,当前程序所在目录下会生成一个runs目录 

  • 运行TensorBoard服务器资源:
tensorboard --logdir=runs/
  • 局域网下的浏览器访问TensorBoard(主机地址+端口):
host_addr:6006  

 

你可能感兴趣的:(python,deep,learning,Algorithm,docker)