《动手学深度学习 v2》之预备、线性NN、多层感知机、深度学习计算

1.配置环境(AutoDl)

https://mp.csdn.net/mp_blog/creation/editor/new/128688185

2.预备知识

2.1. 数据操作

import torch
print(torch.__version__) #1.2.0
#2.1.1入门
x=torch.arange(12)
print(x)
print(x.shape)
print(x.numel())#元素总数
X=x.reshape(3,4)
print(X)
#
print(torch.zeros(2,3,4))
print(torch.ones(2,3,4))
print(torch.randn(3,4))#均值为0,标准差为1
print(torch.tensor([[2,1,4,3],[1,2,3,4],[4,3,2,1]]))
#2.1.2运算符
x=torch.tensor([1,2,4,8],dtype=torch.float)
y=torch.tensor([2,2,2,2],dtype=torch.float)
print(x*y)#x-y,x*y,x/y(对应元素)
print(torch.exp(x))#注意: torch.exp()不支持Long类型的张量作为输入,解决方法是先把张量转为浮点型(如上面的dtype=torch.float)

X=torch.arange(12,dtype=torch.float32).reshape(3,4)
Y=torch.tensor([[2.0,1,4,3],[1,2,3,4],[4,3,2,1]])
print(torch.cat((X,Y),dim=1))#dim=1
print(X==Y)
#2.1.3广播机制
a=torch.arange(3).reshape(3,1)
b=torch.arange(2).reshape(1,2)
print(a+b)
#2.1.4索引和切片
X=torch.arange(12,dtype=torch.float32).reshape(3,4)
print(X[-1],X[1:3])
X[1,2]=9#(1,2)处
print(X)
X[0:2,:]=12 #前两行
print(X)

import numpy as np
Z = np.zeros_like(X)
print(Z)
#2.1.5节省内存(原地更新)
X=torch.arange(12,dtype=torch.float32).reshape(3,4)
Y=torch.tensor([[2.0,1,4,3],[1,2,3,4],[4,3,2,1]])
before=id(Y) #运行一些操作可能会导致为新结果分配内存
Y=Y+X
print(id(Y)==before)

#首先,我们不想总是不必要地分配内存
#其次,如果我们不原地更新,其他引用仍然会指向旧的内存位置
#方式一:
print('id(Y):', id(Y))
Y[:] = X + Y
print('id(Y):', id(Y))

#方式二
print('id(X):', id(X))
X[:]=X+Y
print('id(X):', id(X))
X+=Y
print('id(X):', id(X))
#2.1.6转换为其他Python对象
X=torch.arange(12,dtype=torch.float32).reshape(3,4)
A=X.numpy()
B=torch.tensor(A)
print(type(A),type(B))

a=torch.tensor([3.5])
print(a,a.item(),float(a),int(a))#张量->标量

2.2. 数据预处理

#2.2.1 读取数据集
import os
os.makedirs(os.path.join('..','data'),exist_ok=True)
data_file=os.path.join('..','data','house_tiny.csv')
with open(data_file,'w') as f:
    f.write('NumRooms,Alley,Price\n')  # 列名
    f.write('NA,Pave,127500\n')  # 每行表示一个数据样本
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')

import pandas as pd
data=pd.read_csv(data_file)
print(data)

#2.2.2. 处理缺失值
inputs,outputs=data.iloc[:,0:2],data.iloc[:,2]
inputs=inputs.fillna(inputs.mean(numeric_only=True))#用同一列数值型数据的均值替换“NaN”项(numeric_only=True兼容较新版本的pandas)
print(inputs)
#对于inputs中的类别值或离散值,我们将“NaN”视为一个类别。 由于“巷子类型”(“Alley”)列只接受两种类型的类别值“Pave”和“NaN”,
# pandas可以自动将此列转换为两列“Alley_Pave”和“Alley_nan”。
inputs=pd.get_dummies(inputs,dummy_na=True)
print(inputs)

#2.2.3. 转换为张量格式
X,y=torch.tensor(inputs.to_numpy(dtype=float)),torch.tensor(outputs.to_numpy(dtype=float))#较新版本pandas的独热编码列是bool类型,先统一转成float再转张量
print(X,y)

2.3. 线性代数

#2.3.1. 标量
x = torch.tensor(3.0)
y = torch.tensor(2.0)
#x + y, x * y, x / y, x**y

#2.3.2. 向量以及长度、维度和形状
x = torch.arange(4)
print(x[3])
print(len(x))
print(x.shape)

#2.3.3. 矩阵
A = torch.arange(20).reshape(5, 4)
print(A.T)#矩阵的转置
B = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])
print(B == B.T)

#2.3.4. 张量:向量是一阶张量,矩阵是二阶张量。
X = torch.arange(24).reshape(2, 3, 4)

#2.3.5. 张量算法的基本性质
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
B = A.clone()  # 通过分配新内存,将A的一个副本分配给B
#A, A + B
print(A * B)#按元素的乘法

a = 2
print(a + X)#(a * X).shape


#2.3.6. 降维
x = torch.arange(4, dtype=torch.float32)
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
print(x.sum(),A.sum())#任意形状张量的元素和

A_sum_axis0 = A.sum(axis=0)#通过求和所有行的元素来降维(轴0)
A_sum_axis1 = A.sum(axis=1)
#沿着行和列对矩阵求和,等价于对矩阵的所有元素进行求和。
A.sum(axis=[0, 1])  # 结果和A.sum()相同
# 2.3.6.1. 非降维求和
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
print(A)
sum_A = A.sum(axis=1, keepdims=True)
print(sum_A)
print(A / sum_A)#Broadcast
print(A.cumsum(axis=0)) #沿某个轴计算A元素的累积总和, 比如axis=0(按一行一行累计计算)

#2.3.7. 点积(Dot Product):相同位置的按元素乘积的和
x = torch.arange(4, dtype=torch.float32)
y = torch.ones(4, dtype = torch.float32)
print(torch.dot(x, y))
print(torch.dot(x, y)==torch.sum(x * y))



#2.3.8. 矩阵向量积:=>向量
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
x = torch.arange(4, dtype=torch.float32)
print(A,x)
z=torch.mv(A, x)# 注意,A的列维数(沿轴1的长度)必须与x的维数(其长度)相同。
print(z)#[m,n]*[n]->[m]
print(A.shape)
print(x.shape)
print(z.shape)


#2.3.9. 矩阵矩阵乘法
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
B = torch.ones(4, 3)
z=torch.mm(A, B)
print(z)
print(z.shape)

#2.3.10. 范数:将向量映射到标量
#L2范数:向量元素平方和的平方根
#L1范数:向量元素的绝对值之和
#F范数:是矩阵元素平方和的平方根(L2)
u = torch.tensor([3.0, -4.0])
print(torch.norm(u))
print(torch.abs(u).sum())
print(torch.norm(torch.ones((4, 9))))
#2.3.10.1. 范数和目标
#在深度学习中,我们经常试图解决优化问题: 最大化分配给观测数据的概率; 最小化预测和真实观测之间的距离。

2.4. 微积分

#2.4.1. 导数和微分
import numpy as np
from d2l import torch as d2l

def f(x):return 3*x**2-4*x #导数:6x-4
def numerical_lim(f,x,h):return (f(x+h)-f(x))/h

h=0.1
for i in range(5):
    print(f'h={h:.5f},numerical limit={numerical_lim(f,1,h):.5f}')
    h *=0.1

x=np.arange(0,3,0.1)
d2l.plot(x,[f(x),2*x-3],'x','f(x)',legend=['f(x)','Tangent line (x=1)'])#切线y=2x-3在x=1处与f(x)相切
d2l.plt.show()

#2.4.2. 偏导数
#2.4.3. 梯度
#2.4.4. 链式法则

2.5. 自动微分

import torch
print(torch.__version__) #1.2.0
#2.5.1. 一个简单的例子(标量变量)
#1)定义函数
x=torch.arange(4.0,requires_grad=True)
print(x.grad)  # 默认值是None
y = 2 * torch.dot(x, x)
#2)反向传播
y.backward()
#3)计算梯度
print(x.grad)
print(x.grad == 4 * x)

#2.5.2. 非标量变量的反向传播
# 0)清除累积的梯度(注意:在默认情况下,PyTorch会累积梯度;再次计算梯度前,需要清除之前的值)
x.grad.zero_()
#1)定义函数
y=x*x
#2)反向传播
#对非标量调用backward需要传入一个gradient参数,该参数指定微分函数关于self的梯度
#等价于y.backward(torch.ones(len(x)))
#等价于y.backward(torch.ones_like(x), retain_graph=True)
y.sum().backward()
#3)计算梯度
print(x.grad)

#2.5.3. 分离计算:将某些计算移动到记录的计算图之外
x.grad.zero_()
y = x * x
y.sum().backward()
print(x.grad == 2 * x)

###
x.grad.zero_()
u = y.detach()#这里分离y来返回一个新变量u,该变量与y具有相同的值, 但丢弃计算图中如何计算y的任何信息。换句话说,梯度不会向后流经u到x。
#注意:计算图是从下面开始,将u作为常数处理
z = u * x
z.sum().backward()#因此,下面的反向传播函数计算z=u*x关于x的偏导数,同时将u作为常数处理, 而不是z=x*x*x关于x的偏导数
print(x.grad == u)

#2.5.4. Python控制流的梯度计算
def f(a): #相当于f(a)=Ka
    b = a * 2
    while b.norm() < 1000:
        b = b * 2
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b
    return c
a=torch.randn(size=(),requires_grad=True)
y=f(a)
y.backward()
print(a.grad==y/a)

2.6. 概率

import torch
print(torch.__version__) #1.2.0
#2.6.1. 基本概率论
from torch.distributions import multinomial
from d2l import torch as d2l

#大数定律:相对频率如何随着试验次数的增加收敛到真实概率。
# 让我们进行500组实验,每组抽取10个样本。
fair_probs = torch.ones([6]) / 6  #1)为了抽取一个样本,即掷骰子,传入一个概率向量
counts = multinomial.Multinomial(10, fair_probs).sample((500,)) #2)输出形状为(500, 6):每一行是一组实验,第i列是该组采样结果中点数i+1出现的次数
cum_counts = counts.cumsum(dim=0) #按组累加,例如:tensor([[0., 2., 1., 4., 2., 1.], [2., 3., 3., 6., 3., 3.], ...])
#cum_counts.sum(dim=1, keepdims=True) [[10],[20]]
estimates = cum_counts / cum_counts.sum(dim=1, keepdims=True)# 相对频率作为估计值

#区别
# cumsum(dim=0/1)做一行一行或一列一列累加,维度相比于之前不变
# sum(dim=0/1) 沿某一轴累加,维度相比于之前变化(keepdims=True会不变)
#[[1,2,3],[2,3,4]]
#cumsum(dim=0):[[1,2,3],[3,5,7]]
#sum(dim=0):[[3,5,7]]
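#下面是一个补充的小例子(非原文代码),验证上面cumsum与sum的区别:
T = torch.tensor([[1, 2, 3], [2, 3, 4]])
print(T.cumsum(dim=0))                # tensor([[1, 2, 3], [3, 5, 7]]),维度不变
print(T.sum(dim=0))                   # tensor([3, 5, 7]),少了一个维度
print(T.sum(dim=0, keepdims=True))    # tensor([[3, 5, 7]]),keepdims=True保持维度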


d2l.set_figsize((6, 4.5))
for i in range(6):
    d2l.plt.plot(estimates[:, i].numpy(),
                 label=("P(i=" + str(i + 1) + ")"))
d2l.plt.axhline(y=0.167, color='black', linestyle='dashed')
d2l.plt.gca().set_xlabel('Groups of experiments')
d2l.plt.gca().set_ylabel('Estimated probability')
d2l.plt.legend()
d2l.plt.show()


2.6.2.6. 应用:
https://zh-v2.d2l.ai/chapter_preliminaries/probability.html

2.7. 查阅文档

#2.7.1. 查找模块中的所有函数和类
import torch
print(dir(torch.distributions))

#2.7.2. 查找特定函数和类的用法
help(torch.ones)

3.线性神经网络

回归问题(值回归)

3.1.线性回归

#3.1.2. 矢量化加速
import torch
from d2l import torch as d2l

n = 10000
a = torch.ones([n])
b = torch.ones([n])
#方式一:for-loop
c = torch.zeros(n)
timer = d2l.Timer()
for i in range(n):
    c[i] = a[i] + b[i]
print(f'{timer.stop():.5f} sec')

#方式二:使用重载的+运算符(矢量化加速)
timer.start()
d = a + b
print(f'{timer.stop():.5f} sec')


#3.1.3. 正态分布与平方损失
import math
import numpy as np
def normal(x, mu, sigma):
    p = 1 / math.sqrt(2 * math.pi * sigma**2)
    return p * np.exp(-0.5 / sigma**2 * (x - mu)**2)

# 再次使用numpy进行可视化
x = np.arange(-7, 7, 0.01)

# 均值和标准差对
params = [(0, 1), (0, 2), (3, 1)]
d2l.plot(x, [normal(x, mu, sigma) for mu, sigma in params], xlabel='x',
         ylabel='p(x)', figsize=(4.5, 2.5),
         legend=[f'mean {mu}, std {sigma}' for mu, sigma in params])
d2l.plt.show()

3.2. 线性回归的从零开始实现

import random
import torch
from d2l import torch as d2l
#3.2.1. 生成数据集
def synthetic_data(w, b, num_examples):  #@save
    """生成y=Xw+b+噪声"""
    #feature
    X = torch.normal(0, 1, (num_examples, len(w)))#行:num_examples 列:len(w)
    #label(含 noise)
    y = torch.matmul(X, w) + b #行:num_examples 列:1
    y += torch.normal(0, 0.01, y.shape) #+噪声

    return X, y.reshape((-1, 1))#把长度为num_examples的一维张量变成列向量(num_examples, 1)

true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)
#print('features:', features[0],'\nlabel:', labels[0])

d2l.set_figsize()
d2l.plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(), 1) #从计算图中detach出来,再转numpy
#d2l.plt.show()

#3.2.2. 读取数据集
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))#索引[0, 1, ..., 999]
    # 这些样本是随机读取的,没有特定的顺序(通过随机打乱索引实现)
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):#范围:[0, num_examples),步长:batch_size
        batch_indices = torch.tensor(indices[i: min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices] #每次loop,生成batch_size个features和labels

#batch_size = 10
# for X, y in data_iter(batch_size, features, labels):
#     print(X, '\n', y) #这里打印一次:即batch_size个features和labels
#     break


#3.2.4. 定义模型
def linreg(X, w, b):  #@save
    """线性回归模型"""
    return torch.matmul(X, w) + b #Broadcast
    
#3.2.3. 初始化模型参数
w = torch.normal(0, 0.01, size=(2,1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

#3.2.5. 定义损失函数
def squared_loss(y_hat, y):  #@save
    """均方损失"""
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2 #shape:(batch_size,1)

#3.2.6. 定义优化算法
#这里params=[w,b]
def sgd(params, lr, batch_size):  #@save
    """小批量随机梯度下降"""
    with torch.no_grad(): #更新时,不参与梯度计算
        for param in params:
            param -= lr * param.grad / batch_size #除以batch_size:因为损失是对batch_size个样本求和得到的,这里取平均
            param.grad.zero_()

#3.2.7. 训练
num_epochs = 3
batch_size = 10
lr = 0.03

net = linreg #model
loss = squared_loss #target

for epoch in range(num_epochs):
    #1)按batch更新:batch_size个feature为一组,一共num_examples/batch_size组
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y)  # 2)X和y的小批量损失
        # 因为l形状是(batch_size,1),而不是一个标量。l中的所有元素被加到一起,
        # 并以此计算关于[w,b]的梯度
        l.sum().backward()
        sgd([w, b], lr, batch_size)  #3)使用参数的梯度更新参数
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)#4)这里的w, b为该epoch的最后一次更新值
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')
        
print(f'w的估计误差: {true_w - w.reshape(true_w.shape)}')
print(f'b的估计误差: {true_b - b}')

3.3. 线性回归的简洁实现

import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l

#3.2.1. 生成数据集
true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = d2l.synthetic_data(true_w, true_b, 1000)

#3.2.2. 读取数据集(key)
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """构造一个PyTorch数据迭代器"""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

batch_size = 10
data_iter = load_array((features, labels), batch_size)
#print(next(iter(data_iter))) #使用iter构造Python迭代器,并使用next从迭代器中获取第一项

#3.2.4. 定义模型
# nn是神经网络的缩写
from torch import nn
##Sequential类可以将多个层串联在一起
net = nn.Sequential(nn.Linear(2, 1)) #第一个指定输入特征形状,即2,第二个指定输出特征形状,输出特征形状为单个标量,因此为1。

#3.2.3. 初始化模型参数
#使用替换方法normal_和fill_来重写参数值
net[0].weight.data.normal_(0, 0.01)
net[0].bias.data.fill_(0)

#3.2.5. 定义损失函数
loss = nn.MSELoss()#默认情况下,它返回所有样本损失的平均值

#3.2.6. 定义优化算法
#指定优化的参数 (可通过net.parameters()从我们的模型中获得)以及优化算法所需的超参数字典
trainer = torch.optim.SGD(net.parameters(), lr=0.03)

#3.2.7. 训练
num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:#1)每次一个batch_size
        l = loss(net(X) ,y)
        trainer.zero_grad()  # 必须在l.backward()前面,否则梯度累计
        l.backward()
        trainer.step()
    #2)验收该epoch更新的效果
    l = loss(net(features), labels)
    print(f'epoch {epoch + 1}, loss {l:f}')

#验证最后的更新好的model
w = net[0].weight.data
print('w的估计误差:', true_w - w.reshape(true_w.shape))
b = net[0].bias.data
print('b的估计误差:', true_b - b)

分类问题(概率回归)

3.4. softmax回归:

softmax:获取一个向量并将其映射为概率(输出类别的概率分布)
交叉熵损失(cross-entropy loss):衡量两个概率分布之间差异
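
下面用一个极简的示例(补充代码,非书中原文)演示softmax如何把一个向量映射成概率分布,以及交叉熵如何衡量预测分布与真实标签之间的差异:

import torch
o = torch.tensor([1.0, 2.0, 0.5])            # 某个样本的未归一化输出(logits)
p = torch.exp(o) / torch.exp(o).sum()        # softmax:各元素非负且和为1
print(p, p.sum())
y = 1                                        # 真实类别的索引
print(-torch.log(p[y]))                      # 交叉熵损失:真实类别的估计概率越小,损失越大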


3.5. 图像分类数据集

import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l

d2l.use_svg_display()

# 3.5.1. 读取数据集
# 通过ToTensor实例将图像数据从PIL类型变换成32位浮点数格式,
# 并除以255使得所有像素的数值均在0~1之间
trans = [transforms.ToTensor()]
trans = transforms.Compose(trans)
mnist_train = torchvision.datasets.FashionMNIST(
    root="./data", train=True, transform=trans, download=False)#True
mnist_test = torchvision.datasets.FashionMNIST(
    root="./data", train=False, transform=trans, download=False)

#数据集情况
print(len(mnist_train), len(mnist_test))
print(mnist_train[0][0].shape)  # 每个输入图像的高度和宽度均为28像素,灰度图

def get_fashion_mnist_labels(labels):  #@save
    """返回Fashion-MNIST数据集的文本标签"""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):  #@save
    """绘制图像列表"""
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()

    for i, (ax, img) in enumerate(zip(axes, imgs)):
        if torch.is_tensor(img):
            # 图片张量
            ax.imshow(img.numpy())
        else:
            # PIL图片
            ax.imshow(img)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes
#batch个数据可视化
X, y = next(iter(data.DataLoader(mnist_train,batch_size=18)))  # 拿第一个batch数据,例如y=tensor([9, 0, 0, 3, 0, 2, 7, 2, 5, 5, 0, 9, 5, 5, 7, 9, 1, 0])
show_images(X.reshape(18, 28, 28), 2, 9, titles=get_fashion_mnist_labels(y))
d2l.plt.show()

#3.5.3. 整合所有组件(key)
def get_dataloader_workers():  #@save
    """使用4个进程来读取数据"""
    return 4


def load_data_fashion_mnist(batch_size, resize=None):  #@save #注意:这里resize的作用可以迎合各种模型的输入要求
    """下载Fashion-MNIST数据集,然后将其加载到内存中"""
    # 3.5.1. 读取数据集
    # 通过ToTensor实例将图像数据从PIL类型变换成32位浮点数格式,
    # 并除以255使得所有像素的数值均在0~1之间
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="./data", train=True, transform=trans, download=False)#True
    mnist_test = torchvision.datasets.FashionMNIST(
        root="./data", train=False, transform=trans, download=False)

    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))

train_iter, test_iter = load_data_fashion_mnist(32, resize=64)#3.5.2. 按小批量方式读取全部数据
timer = d2l.Timer()
for X, y in train_iter:
    print(X.shape, X.dtype, y.shape, y.dtype)#torch.Size([32, 1, 64, 64]) torch.float32 torch.Size([32]) torch.int64
    break #这里只显示一个batch_size的数据
print(f'使用{get_dataloader_workers()}个进程读取数据,读取一个batch用时:{timer.stop():.2f} sec')

3.6. softmax回归的从零开始实现

import torch
from IPython import display
from d2l import torch as d2l

import torchvision
from torch.utils import data
from torchvision import transforms


class Accumulator:  # @save
    """在n个变量上累加"""

    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


class Animator:  # @save
    """在动画中绘制数据"""

    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # 增量地绘制多条线
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # 使用lambda函数捕获参数
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # 向图表中添加多个数据点
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)


#bug:
#1,修改进程数:将DataLoader中的num_workers=2,改成num_workers=0,仅执行主进程。运行成功!!!
#2,使用多进程的习惯用法:把数据读取和训练的主流程放在 if __name__ == '__main__': 之下,成功运行!!!
if __name__ == '__main__':
    #1. 读取数据集
    def get_dataloader_workers():  # @save
        """使用4个进程来读取数据"""
        return 4
    def load_data_fashion_mnist(batch_size, resize=None):  # @save #注意:这里resize的作用可以迎合各种模型的输入要求
        """下载Fashion-MNIST数据集,然后将其加载到内存中"""
        # 3.5.1. 读取数据集
        # 通过ToTensor实例将图像数据从PIL类型变换成32位浮点数格式,
        # 并除以255使得所有像素的数值均在0~1之间
        trans = [transforms.ToTensor()]
        if resize:
            trans.insert(0, transforms.Resize(resize))
        trans = transforms.Compose(trans)
        mnist_train = torchvision.datasets.FashionMNIST(
            root="./data", train=True, transform=trans, download=False)  # True
        mnist_test = torchvision.datasets.FashionMNIST(
            root="./data", train=False, transform=trans, download=False)

        return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                                num_workers=get_dataloader_workers()),
                data.DataLoader(mnist_test, batch_size, shuffle=False,
                                num_workers=get_dataloader_workers()))


    batch_size = 256
    train_iter, test_iter = load_data_fashion_mnist(batch_size) #将全部样本,按batch_size分组


    #3.6.1. 初始化模型参数
    num_inputs = 784 #输入时,将图像展平:1x28x28=784个输入特征
    num_outputs = 10 #输出类别数:10

    W = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True) #利用高斯随机初始化
    b = torch.zeros(num_outputs, requires_grad=True)

    # 3.6.2. 定义softmax操作
    def softmax(X):
        X_exp = torch.exp(X)
        partition = X_exp.sum(1, keepdim=True)#按行求和(dim=1),得到每个样本的归一化分母
        return X_exp / partition  # 这里应用了广播机制

    #
    #X = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    # print(X.sum(0, keepdim=True), X.sum(1, keepdim=True))
    #
    # X = torch.normal(0, 1, (2, 5))
    # X_prob = softmax(X)
    # print(X_prob, X_prob.sum(1))

    #3.6.3. 定义模型
    def net(X):
        return softmax(torch.matmul(X.reshape((-1, W.shape[0])), W) + b)#(-1, W.shape[0]):对输入的一个flatten:batch_size x 784(这是区别于线性模型的标志)


    #3.6.4. 定义损失函数
    def cross_entropy(y_hat, y):
        #key:如果真实类别为1,而索引为1对应的估计概率很小就会导致交叉熵损失很大
        #size:[batch_size]
        return - torch.log(y_hat[range(len(y_hat)), y]) #y_hat[range(len(y_hat)), y]:使用y作为y_hat中概率的索引(即拿到y对应类别的估计概率y_hat)


    # cross_entropy解释:使用y作为y_hat中概率的索引
    # y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])#表示两个样本各属于三种类的概率值
    # y = torch.tensor([0, 2]) #比如样本0真实属于0类,样本1真实属于2类
  
    # 比如,y_hat的[0]号样本[0.1, 0.3, 0.6]中取索引为y[0]的概率值,即样本0的属于0类的估计概率0.1
    # y_hat的[1]号样本[0.3, 0.2, 0.5]中取索引为y[1]的概率值,即样本1的属于2类的估计概率0.5
    # print(y_hat[[0, 1], y]) #tensor([0.1000, 0.5000])
    #print(cross_entropy(y_hat, y))#tensor([2.3026, 0.6931]),说明样本0的属于0类的估计误差大


    # 3.6.5. 分类精度
    def accuracy(y_hat, y):  # @save
        """计算预测正确的数量"""
        #y_hat.shape:[batch_size,num_outputs]
        if len(y_hat.shape) > 1 and y_hat.shape[1] > 1: #说明维度大于1,且列数(即预测类别)也大于1
            y_hat = y_hat.argmax(axis=1) #选择预测概率最高的类(返回索引)
        cmp = y_hat.type(y.dtype) == y#步骤:1)统一数据类型:y_hat变成y的数据类型 2)y_hat与y的对应类别(索引)做比较(bool)
        return float(cmp.type(y.dtype).sum())#3)统计估计类别与实际类别(索引)一致的个数

    #print(accuracy(y_hat, y) / len(y))


    # 可以评估在任意模型net的精度
    def evaluate_accuracy(net, data_iter):  #@save
        """计算在指定数据集上模型的精度"""
        if isinstance(net, torch.nn.Module):#说明net是torch.nn.Module实现的
            net.eval()  # 将模型设置为评估模式(不需要计算梯度,更新模型了)
        metric = Accumulator(2)  # metric[0]:正确预测数、metric[1]:预测总数
        with torch.no_grad():
            for X, y in data_iter:
                metric.add(accuracy(net(X), y), y.numel())#每次对应batch_size个样本
        return metric[0] / metric[1]

    print('模型的初始精度:',evaluate_accuracy(net, test_iter))

    #3.6.6. 训练
    def train_epoch_ch3(net, train_iter, loss, updater):  #@save
        """训练模型一个迭代周期(定义见第3章)"""
        if isinstance(net, torch.nn.Module):#说明net是torch.nn.Module实现的
            net.train() # 将模型设置为训练模式(需要计算梯度,更新模型了)

        # metric[0]:训练损失总和、metric[1]:训练准确度总和、metric[2]:样本数
        metric = Accumulator(3)
        for X, y in train_iter:
            # 计算梯度并更新参数
            y_hat = net(X)
            l = loss(y_hat, y)
            if isinstance(updater, torch.optim.Optimizer):#说明updater是torch.optim.Optimizer实现的
                # 使用PyTorch内置的优化器和损失函数
                updater.zero_grad()
                l.mean().backward()
                updater.step()
            else:
                # 使用定制的优化器和损失函数
                l.sum().backward()
                updater(X.shape[0])#X.shape[0]:batch_size
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())#每次对应batch_size个样本
        # 返回训练损失和训练精度(epoch)
        return metric[0] / metric[2], metric[1] / metric[2]

    ###key-key-key
    def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
        """训练模型(定义见第3章)"""
        animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                            legend=['train loss', 'train acc', 'test acc'])
        for epoch in range(num_epochs):
            train_metrics = train_epoch_ch3(net, train_iter, loss, updater) #for 训练集表现
            test_acc = evaluate_accuracy(net, test_iter) #for 测试集表现
            animator.add(epoch + 1, train_metrics + (test_acc,)) #每epoch:实时显示训练精度,训练损失和测试精度
            

        train_loss, train_acc = train_metrics
        assert train_loss < 0.5, train_loss
        assert train_acc <= 1 and train_acc > 0.7, train_acc
        assert test_acc <= 1 and test_acc > 0.7, test_acc


    #自定义的优化算法
    lr = 0.1
    def updater(batch_size):
        return d2l.sgd([W, b], lr, batch_size)

    num_epochs = 10
    train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
    d2l.plt.show()

    #3.6.7. 预测
    def predict_ch3(net, test_iter, n=6):  #@save
        """预测标签(定义见第3章)"""
        for X, y in test_iter:
            break #这里只取一个batch样本
        trues = d2l.get_fashion_mnist_labels(y)
        preds = d2l.get_fashion_mnist_labels(net(X).argmax(axis=1))
        titles = [true +'\n' + pred for true, pred in zip(trues, preds)]
        d2l.show_images(X[0:n].reshape((n, 28, 28)), 1, n, titles=titles[0:n])


    predict_ch3(net, test_iter)
    d2l.plt.show()

3.7. softmax回归的简洁实现

import torch
from torch import nn
from d2l import torch as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

#3.7.1. 初始化模型参数(key)
# PyTorch不会隐式地调整输入的形状。因此,
# 区别于简单线性回归模型:我们在线性层前定义了展平层(flatten),来调整网络输入的形状
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 10)) #nn.Flatten()【变成 2D tensor】:第零维度(batch)保持,其余维度展平,例如1x28x28->784

def init_weights(m):
    if type(m) == nn.Linear: #Sequential->m->如果是线形层
        nn.init.normal_(m.weight, std=0.01) #->初始化权重

net.apply(init_weights)

#3.7.2. 重新审视Softmax的实现
#在交叉熵损失函数中传递未归一化的预测,并同时计算softmax及其对数
loss = nn.CrossEntropyLoss(reduction='none')
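
#补充的小例子(非原文代码):直接对很大的logit取exp会数值溢出,
#而CrossEntropyLoss内部用log-sum-exp技巧先做归一化,数值上是稳定的
logits = torch.tensor([[1000.0, 0.0]])
target = torch.tensor([0])
print(torch.exp(logits))                      # tensor([[inf, 0.]]),直接计算softmax会溢出
print(nn.CrossEntropyLoss()(logits, target))  # tensor(0.),损失计算仍然稳定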

#3.7.3. 优化算法
trainer = torch.optim.SGD(net.parameters(), lr=0.1)

#3.7.4. 训练
num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
d2l.plt.show()

4.多层感知机

4.1.多层感知机

#隐藏层:任何像素的重要性取决于该像素的上下文(周围像素的值)
#激活函数:以防止多层感知机退化成线性模型
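
# 补充的小例子(非原文代码):如果没有激活函数,两个线性层可以合并成一个线性层,
# 即多层感知机退化成线性模型
import torch
from torch import nn
l1, l2 = nn.Linear(4, 8, bias=False), nn.Linear(8, 2, bias=False)
X = torch.randn(3, 4)
W = l2.weight @ l1.weight                                # 合并后的权重,形状(2, 4)
print(torch.allclose(l2(l1(X)), X @ W.T, atol=1e-6))     # True:两层线性层等价于一层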


#4.1.2. 激活函数
import torch
from d2l import torch as d2l

#4.1.2.1. ReLU函数
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))


y.backward(torch.ones_like(x), retain_graph=True)#等价于y.backward(torch.ones(len(x))),也等价于y.sum().backward()
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize=(5, 2.5))#绘制ReLU函数的导数
d2l.plt.show()

#4.1.2.2. sigmoid函数
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize=(5, 2.5))


x.grad.data.zero_()# 清除以前的梯度
y.backward(torch.ones_like(x),retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))


#4.1.2.3. tanh函数
y = torch.tanh(x)
d2l.plot(x.detach(), y.detach(), 'x', 'tanh(x)', figsize=(5, 2.5))


x.grad.data.zero_()# 清除以前的梯度
y.backward(torch.ones_like(x),retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of tanh', figsize=(5, 2.5))

4.2. 多层感知机的从零开始实现

import torch
from torch import nn
from d2l import torch as d2l


import torchvision
from torch.utils import data
from torchvision import transforms
#bug:
#1,修改进程数:将DataLoader中的num_workers=2,改成num_workers=0,仅执行主进程。运行成功!!!
#2,使用多进程的习惯用法:把数据读取和训练的主流程放在 if __name__ == '__main__': 之下,成功运行!!!
def get_dataloader_workers():  # @save
    """使用0个进程来读取数据"""
    return 0

def load_data_fashion_mnist(batch_size, resize=None):  # @save #注意:这里resize的作用可以迎合各种模型的输入要求
    """下载Fashion-MNIST数据集,然后将其加载到内存中"""
    # 3.5.1. 读取数据集
    # 通过ToTensor实例将图像数据从PIL类型变换成32位浮点数格式,
    # 并除以255使得所有像素的数值均在0~1之间
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="./data", train=True, transform=trans, download=False)  # True
    mnist_test = torchvision.datasets.FashionMNIST(
        root="./data", train=False, transform=trans, download=False)

    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))

batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)  # 将全部样本,按batch_size分组


#4.2.1. 初始化模型参数
num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)#nn.Parameter可加可不加
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))

params = [W1, b1, W2, b2]


#4.2.2. 激活函数
def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a) #ReLU(x)=max(0,x)


#4.2.3. 模型
def net(X):
    X = X.reshape((-1, num_inputs)) #(-1, num_inputs):batch_size x num_inputs,即256 x 784
    H = relu(X@W1 + b1)  # 这里“@”代表矩阵乘法
    return (H@W2 + b2)

#4.2.4. 损失函数
loss = nn.CrossEntropyLoss(reduction='none')

#4.2.5. 训练
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)

#预测
d2l.predict_ch3(net, test_iter)
d2l.plt.show()

4.3. 多层感知机的简洁实现

import torch
from torch import nn
from d2l import torch as d2l

#4.3.1. 模型(key)
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(784, 256),
                    nn.ReLU(),
                    nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)
net.apply(init_weights)


#训练
batch_size, lr, num_epochs = 256, 0.1, 10
loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=lr)

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)


4.4. 模型选择、欠拟合和过拟合

#4.4.4. 多项式回归
import math
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l


#4.4.4.1. 生成数据集

#使用以下三阶多项式来生成训练和测试数据的标签
#https://zh-v2.d2l.ai/chapter_multilayer-perceptrons/underfit-overfit.html

n_train, n_test = 100, 100  # 训练和测试数据集大小

#1)只给前4项(0~3阶)设定非零系数,其余高阶项系数为0
max_degree = 20  # 多项式的最大阶数
true_w = np.zeros(max_degree)  # 分配大量的空间
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])

features = np.random.normal(size=(n_train + n_test, 1))
np.random.shuffle(features) #x.size:(n_train+n_test,1)
poly_features = np.power(features, np.arange(max_degree).reshape(1, -1)) #x^n
for i in range(max_degree):
    poly_features[:, i] /= math.gamma(i + 1)  # gamma(n)=(n-1)!    x^n/gamma(n+1)
# 2)labels的维度:(n_train+n_test,1)
labels = np.dot(poly_features, true_w)
labels += np.random.normal(scale=0.1, size=labels.shape)# +噪音

# 3) NumPy ndarray转换为tensor
true_w, features, poly_features, labels = [torch.tensor(x, dtype=
    torch.float32) for x in [true_w, features, poly_features, labels]]

print(features[:2], poly_features[:2, :], labels[:2])


#4.4.4.2. 对模型进行训练和测试
def evaluate_loss(net, data_iter, loss):  #@save
    """评估给定数据集上模型的损失"""
    metric = d2l.Accumulator(2)  # 损失的总和,样本数量
    for X, y in data_iter:
        out = net(X)
        y = y.reshape(out.shape)
        l = loss(out, y)
        metric.add(l.sum(), l.numel())
    return metric[0] / metric[1]

def train(train_features, test_features, train_labels, test_labels,
          num_epochs=400):
    loss = nn.MSELoss(reduction='none')
    input_shape = train_features.shape[-1]
    # 不设置偏置,因为我们已经在多项式中实现了它
    net = nn.Sequential(nn.Linear(input_shape, 1, bias=False))

    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels.reshape(-1,1)),
                                batch_size)
    test_iter = d2l.load_array((test_features, test_labels.reshape(-1,1)),
                               batch_size, is_train=False)

    trainer = torch.optim.SGD(net.parameters(), lr=0.01)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',xlim=[1, num_epochs], ylim=[1e-3, 1e2],legend=['train', 'test'])

    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net[0].weight.data.numpy())


#4.4.4.3. 三阶多项式函数拟合(正常)
# 从多项式特征中选择前4个维度,即1,x,x^2/2!,x^3/3!——————————相当于模型拟合四个维度的数据
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
      labels[:n_train], labels[n_train:])

#4.4.4.4. 线性函数拟合(欠拟合)
# 从多项式特征中选择前2个维度,即1和x————————相当于模型拟合两个维度的数据
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
      labels[:n_train], labels[n_train:])

#4.4.4.5. 高阶多项式函数拟合(过拟合)
# 从多项式特征中选取所有维度————————相当于模型拟合所有维度的数据
train(poly_features[:n_train, :], poly_features[n_train:, :],
      labels[:n_train], labels[n_train:], num_epochs=1500)
d2l.plt.show()

4.5. 权重衰减:正则化技术之一


#4.5.1. 高维线性回归
import torch
from torch import nn
from d2l import torch as d2l

#https://zh-v2.d2l.ai/chapter_multilayer-perceptrons/weight-decay.html
n_train, n_test, num_inputs, batch_size = 20, 100, 200, 5
true_w, true_b = torch.ones((num_inputs, 1)) * 0.01, 0.05
train_data = d2l.synthetic_data(true_w, true_b, n_train)
test_data = d2l.synthetic_data(true_w, true_b, n_test)

train_iter = d2l.load_array(train_data, batch_size)
test_iter = d2l.load_array(test_data, batch_size, is_train=False)


#注意:上面的导入和数据生成要放在注释块之外,下面4.5.3的简洁实现会用到这些变量
'''
#4.5.2. 从零开始实现
#4.5.2.1. 初始化模型参数
def init_params():
    w = torch.normal(0, 1, size=(num_inputs, 1), requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    return [w, b]

#4.5.2.2. 定义L2范数惩罚
def l2_penalty(w):
    return torch.sum(w.pow(2)) / 2

#4.5.2.3. 定义训练代码实现
def train(lambd):
    w, b = init_params()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss #lambda X:后面是net(X)
    num_epochs, lr = 100, 0.003
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # 增加了L2范数惩罚项,(key)
            # 广播机制使l2_penalty(w)成为一个长度为batch_size的向量
            l = loss(net(X), y) + lambd * l2_penalty(w) #lambd:train(lambd), 传入的超参数
            l.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('w的L2范数是:', torch.norm(w).item())

#4.5.2.4. 忽略正则化直接训练
train(lambd=0)
#4.5.2.5. 使用权重衰减
train(lambd=3)
d2l.plt.show()
'''

#4.5.3. 简洁实现
def train_concise(wd):
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()#参数初始化

    loss = nn.MSELoss(reduction='none')

    num_epochs, lr = 100, 0.003
    #key区别:从零实现时权重衰减项lambd*l2_penalty(w)是加在损失函数里的,这里通过优化器的weight_decay参数实现
    # 权重参数有衰减,偏置参数没有衰减
    trainer = torch.optim.SGD([{"params":net[0].weight,'weight_decay': wd},{"params":net[0].bias}], lr=lr)
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',xlim=[5, num_epochs], legend=['train', 'test'])

    for epoch in range(num_epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.mean().backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1,
                         (d2l.evaluate_loss(net, train_iter, loss),
                          d2l.evaluate_loss(net, test_iter, loss)))
    print('w的L2范数:', net[0].weight.norm().item())

train_concise(0)
train_concise(3)
d2l.plt.show()

4.6. 暂退法(Dropout):正则化技术之二

#4.6.4. 从零开始实现
import torch
from torch import nn
from d2l import torch as d2l

#随机失活:该函数以dropout的概率丢弃张量输入X中的元素
def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    # 在本情况中,所有元素都被丢弃
    if dropout == 1:
        return torch.zeros_like(X)
    # 在本情况中,所有元素都被保留
    if dropout == 0:
        return X
    #key: torch.rand(X.shape)产生[0,1)均匀分布的随机数;大于dropout的位置mask=1(保留),其余为0(丢弃)
    mask = (torch.rand(X.shape) > dropout).float()
    return mask * X / (1.0 - dropout) #用乘法按mask置零(丢弃),比按索引选取元素更适合GPU/CPU并行;除以(1-dropout)保持期望不变

# #测试dropout_layer函数
# X= torch.arange(16, dtype = torch.float32).reshape((2, 8))
# print(X)
# print(dropout_layer(X, 0.))
# print(dropout_layer(X, 0.5))
# print(dropout_layer(X, 1.))

#4.6.4.1. 定义模型参数
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

#4.6.4.2. 定义模型
dropout1, dropout2 = 0.2, 0.5
class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,is_training = True):
        super(Net, self).__init__()
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()

    def forward(self, X):
        H1 = self.relu(self.lin1(X.reshape((-1, self.num_inputs))))
        # 只有在训练模型时才使用dropout
        if self.training == True:
            # 在第一个全连接层之后添加一个dropout层
            H1 = dropout_layer(H1, dropout1)

        H2 = self.relu(self.lin2(H1))
        if self.training == True:
            # 在第二个全连接层之后添加一个dropout层
            H2 = dropout_layer(H2, dropout2)
        out = self.lin3(H2)
        return out

net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2)

#4.6.4.3. 训练和测试
num_epochs, lr, batch_size = 10, 0.5, 256
loss = nn.CrossEntropyLoss(reduction='none') #含softmax和 负对数似然计算(对应真实索引的估计概率)

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
d2l.plt.show()



#4.6.5. 简洁实现
net = nn.Sequential(nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        # 在第一个全连接层之后添加一个dropout层
        nn.Dropout(dropout1),
        #
        nn.Linear(256, 256),
        nn.ReLU(),
        # 在第二个全连接层之后添加一个dropout层
        nn.Dropout(dropout2),
        #
        nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)


trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

4.7. 前向传播、反向传播和计算图

4.8. 数值稳定性和模型初始化

#4.8.1.1. 梯度消失
import torch
from d2l import torch as d2l

x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.sigmoid(x)
y.backward(torch.ones_like(x))

d2l.plot(x.detach().numpy(), [y.detach().numpy(), x.grad.numpy()],
         legend=['sigmoid', 'gradient'], figsize=(4.5, 2.5))
d2l.plt.show()

#4.8.1.2. 梯度爆炸
M = torch.normal(0, 1, size=(4,4))
print('一个矩阵 \n',M)
for i in range(100):
    M = torch.mm(M,torch.normal(0, 1, size=(4, 4)))

print('乘以100个矩阵后\n', M)

4.9. 环境和分布偏移

4.10. 实战Kaggle比赛:预测房价


#4.10.3. 访问和读取数据集
# 如果没有安装pandas,请取消下一行的注释
# !pip install pandas
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv','585e9cc93e70b39160e7921475f9bcd7d31219ce')
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv','fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

#4.10.1. 下载和缓存数据集(key)
import hashlib
import os
import requests
def download(name, cache_dir=os.path.join('.', 'data')):  #@save
    """下载一个DATA_HUB中的文件,返回本地文件名"""
    ###1)建立文件
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"

    os.makedirs(cache_dir, exist_ok=True)
    url, sha1_hash = DATA_HUB[name]
    fname = os.path.join(cache_dir, url.split('/')[-1])# .\data\kaggle_house_pred_train.csv
    print(fname)

    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # 命中缓存
    print(f'正在从{url}下载{fname}...')

    ###2)写入数据
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname

train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
# print(train_data.shape)#(1460, 81)
# print(test_data.shape)#(1459, 80)
# print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))# 去掉Id列;训练集还去掉了最后一列标签SalePrice
print(all_features.shape)#(2919, 79)


#4.10.4. 数据预处理(key)
#1)处理数值数据:将所有缺失的值替换为相应特征的平均值。将特征重新缩放到零均值和单位方差来标准化数据(这里针对列数据).
#若无法获得测试数据,则可根据训练数据计算均值和标准差
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index #!= 'object',说明是数值
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# 在标准化数据之后,所有均值消失,因此我们可以将缺失值设置为0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# 2)处理离散/类别特征(取值为字符串或缺失值,例如前面Alley列的“Pave”、NaN)。这包括诸如“MSZoning”之类的特征。我们用独热编码替换它们
# “Dummy_na=True”将“na”(缺失值)视为有效的特征值,并为其创建指示符特征
all_features = pd.get_dummies(all_features, dummy_na=True)
print(all_features.shape) #(2919, 331) 79->331

#从pandas格式中提取NumPy格式,并将其转换为张量表示用于训练
n_train = train_data.shape[0]#1460
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)

#4.10.5. 训练
loss = nn.MSELoss()
in_features = train_features.shape[1] #331
print(in_features)

def get_net():
    net = nn.Sequential(nn.Linear(in_features,1))
    return net

#对于房价更关心相对误差(y_hat-y)/y, 这里用价格预测的对数来衡量差异.
def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值,将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds),torch.log(labels)))
    return rmse.item()
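
#补充说明(非原文):当相对误差较小时,log(y_hat)-log(y)=log(1+(y_hat-y)/y)≈(y_hat-y)/y,
#因此对数均方根误差近似衡量的就是相对误差。下面用一个小例子验证:
y_true = torch.tensor([100000.0])
y_pred = torch.tensor([105000.0])                      # 相对误差为5%
print((torch.log(y_pred) - torch.log(y_true)).item())  # ≈0.0488,接近0.05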


#训练函数将借助Adam优化器 (我们将在后面章节更详细地描述它)。 Adam优化器的主要吸引力在于它对初始学习率不那么敏感。
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(),lr = learning_rate,weight_decay = weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

#4.10.6. K折交叉验证:把训练数据分成K折,每次取其中一折作验证集,其余K-1折作训练集
#目的:调model的最佳超参数
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k #1460//k #单折数据size

    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        #1)第i折数据作为验证集
        if j == i:
            X_valid, y_valid = X_part, y_part
        #2)其余各折数据拼接起来作为训练集
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,batch_size):
    train_l_sum, valid_l_sum = 0, 0

    for i in range(k):
        net = get_net()
        data = get_k_fold_data(k, i, X_train, y_train)

        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]

        if i == 0:d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],legend=['train', 'valid'], yscale='log')
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, 'f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

#4.10.7. 模型选择(调最佳超参数)
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, 'f'平均验证log rmse: {float(valid_l):f}')
d2l.plt.show()





#4.10.8. 提交Kaggle预测:针对的是测试集
def train_and_pred(train_features, test_features, train_labels, test_data,num_epochs, lr, weight_decay, batch_size):
    net = get_net() #用前面K折交叉验证选出的超参数,在全部训练数据上重新训练一个新模型
    train_ls, _ = train(net, train_features, train_labels, None, None,num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')

    # 将网络应用于测试集。
    preds = net(test_features).detach().numpy()
    print(preds.shape)#(1459, 1)
    print(preds.reshape(1, -1)[0])#[119412.25 154692.89 198602.95 ... 208554.67 107001.15 240521.67]
    # 将其重新格式化以导出到Kaggle
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)


train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)
d2l.plt.show()

5.深度学习计算

5.1. 层和块

#5.1. 层和块

import torch
from torch import nn
from torch.nn import functional as F

##########方式1
#nn.Sequential定义了一种特殊的Module
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.rand(2, 20) #batch x size:2 x 20
print(net(X))


##########方式2
#5.1.1. 自定义块
class MLP(nn.Module):
    # 用模型参数声明层。这里,我们声明两个全连接的层
    def __init__(self):
        # 调用MLP的父类Module的构造函数来执行必要的初始化。
        # 这样,在类实例化时也可以指定其他函数参数,例如模型参数params(稍后将介绍)
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # 隐藏层
        self.out = nn.Linear(256, 10)  # 输出层

    # 定义模型的前向传播,即如何根据输入X返回所需的模型输出
    def forward(self, X):
        # 注意,这里我们使用ReLU的函数版本,其在nn.functional模块中定义。
        return self.out(F.relu(self.hidden(X)))

net = MLP()
print(net(X))

##########方式3
#5.1.2. 顺序块
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        #写法1:
        for idx, module in enumerate(args):
            # 这里,module是Module子类的一个实例。我们把它保存在'Module'类的成员
            # 变量_modules中。_module的类型是OrderedDict
            self._modules[str(idx)] = module
        # #写法2:
        # for block in args:
        #     self._modules[block] = block

    def forward(self, X):
        # OrderedDict保证了按照成员添加的顺序遍历它们
        for block in self._modules.values():
            X = block(X)
        return X

net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10)) #各种module
print(net(X))

##########方式4(自定义程度强)
#5.1.3. 在前向传播函数中执行代码
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # 不计算梯度的随机权重参数。因此其在训练期间保持不变
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        # 使用创建的常量参数以及relu和mm函数
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # 复用全连接层。这相当于两个全连接层共享参数
        X = self.linear(X)
        # 控制流
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = FixedHiddenMLP()
print(net(X))

##########方式5(自定义程度强)
#混合搭配各种组合块的方法
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP()) #嵌套块
print(chimera(X))


5.2. 参数管理

import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

#5.2.1. 参数访问
print(net[2].state_dict())

#5.2.1.1. 目标参数
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

print(net[2].weight.grad == None)

#5.2.1.2. 一次性访问所有参数¶
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

print(net.state_dict()['2.bias'].data)

#5.2.1.3. 从嵌套块收集参数
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # 在这里嵌套
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

print(rgnet)

print(rgnet[0][1][0].bias.data)

#5.2.2. 参数初始化
#5.2.2.1. 内置初始化
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
print(net[0].weight.data[0], net[0].bias.data[0])

def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
print(net[0].weight.data[0], net[0].bias.data[0])


def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

#5.2.2.2. 自定义初始化
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
print(net[0].weight[:2])

net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
print(net[0].weight.data[0])

#5.2.3. 参数绑定
# 我们需要给共享层一个名称,以便可以引用它的参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# 检查参数是否相同
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# 确保它们实际上是同一个对象,而不只是有相同的值
print(net[2].weight.data[0] == net[4].weight.data[0])

5.3. 延后初始化:

直到数据第一次通过模型传递时,框架才会动态地推断出每个层的大小


#5.3.1. 实例化网络
import tensorflow as tf

net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

#1)请注意,每个层对象都存在,但权重为空。 使用net.get_weights()将抛出一个错误,因为权重尚未初始化
print([net.layers[i].get_weights() for i in range(len(net.layers))]) #[[], []]

X = tf.random.uniform((2, 20))
net(X)
#2)将数据通过网络,最终使框架初始化参数
print([w.shape for w in net.get_weights()]) #[(20, 256), (256,), (256, 10), (10,)]
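
#PyTorch也提供类似的延后初始化机制(补充示例,非原文;这里假设使用PyTorch 1.8+提供的nn.LazyLinear):
import torch
from torch import nn

net_torch = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
print(net_torch[0].weight)        # 此时权重是UninitializedParameter,形状尚未确定
X_torch = torch.rand(2, 20)
net_torch(X_torch)                # 第一次前向传播后,框架根据输入推断出各层的输入维度
print(net_torch[0].weight.shape)  # torch.Size([256, 20])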

5.4. 自定义层

#5.4.1. 不带参数的层
import torch
import torch.nn.functional as F
from torch import nn


class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

layer = CenteredLayer()
print(layer(torch.FloatTensor([1, 2, 3, 4, 5])))

#将层作为组件合并到更复杂的模型中
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
print(Y.mean())

#5.4.2. 带参数的层
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

linear = MyLinear(5, 3)
print(linear.weight)
#使用自定义层直接执行前向传播计算
print(linear(torch.rand(2, 5)))

#使用自定义层构建模型
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
print(net(torch.rand(2, 64)))

5.5. 读写文件

#5.5.1. 加载和保存张量
import torch
from torch import nn
from torch.nn import functional as F
#张量
x = torch.arange(4)
torch.save(x, 'x-file')
x2 = torch.load('x-file')
print(x2)

#张量列表
y = torch.zeros(4)
torch.save([x, y],'x-files')
x2, y2 = torch.load('x-files')
print((x2, y2))

#张量字典
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
print(mydict2)

#5.5.2. 加载和保存模型参数
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
torch.save(net.state_dict(), 'mlp.params')

#实例化了原始多层感知机模型的一个备份,直接读取文件中存储的参数
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
print(clone.eval())

Y_clone = clone(X)
print(Y_clone == Y)

5.6. GPU

!nvidia-smi

#5.6.1. 计算设备
import torch
from torch import nn

print(torch.device('cpu'), torch.device('cuda'), torch.device('cuda:1')) #'cuda':指0号GPU  'cuda:1'指1号GPU
#查询可用gpu的数量
print(torch.cuda.device_count())

def try_gpu(i=0):  #@save
    """如果存在,则返回gpu(i),否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """返回所有可用的GPU,如果没有GPU,则返回[cpu(),]"""
    devices = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

print(try_gpu(), try_gpu(10), try_all_gpus())

#5.6.2. 张量与GPU
#查询张量所在的设备。 默认情况下,张量是在CPU上创建的
x = torch.tensor([1, 2, 3])
print(x.device)

#5.6.2.1. 存储在GPU上
X = torch.ones(2, 3, device=try_gpu()) #默认0号GPU
print(X)

# Y = torch.rand(2, 3, device=try_gpu(1))
# print(Y)

#5.6.2.2. 复制
#要计算X + Y,我们需要决定在哪里执行这个操作(必须同一个device)
Z = X.cuda(0) #X.cuda(i)会把X复制到第i号GPU(若X已在该GPU上则直接返回X本身);有多块GPU时可用X.cuda(1)把X传输到第二块GPU
print(X)
print(Z)

print(X + Z)

print(Z.cuda(0) is Z)#Z已经在0号GPU上,因此cuda(0)直接返回Z本身,不会复制并分配新内存
#5.6.2.3. 旁注

#5.6.3. 神经网络与GPU(key)
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu()) #1)注意to(device)的位置#2)涉及参数拷贝(输入、权重偏置参数等)3)做的是推理,前向传播(不是训练,反向传播算梯度)
print(net(X))
#确认模型参数存储在同一个GPU上
print(net[0].weight.data.device)
