Contents: Original code, Modified code, Code walkthrough

Original code:
https://github.com/Longcodedao/NAS-With-RL

Modified code:
After downloading the code from the link above, it runs with the modifications marked in the inline comments below:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
class Params:
NUM_EPOCHS = 50
ALPHA = 0.005
BATCH_SIZE = 64
HIDDEN_SIZE = 64 # Number of Hidden Units in Controller
BETA = 0.1 # The entropy bonus multiplier
INPUT_SIZE = 3
ACTION_SPACE = 2
NUM_STEPS = 4
GAMMA = 0.99
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))]
)
trainset = torchvision.datasets.MNIST(root='./data', train=True,
download=True, transform=transform)
testset = torchvision.datasets.MNIST(root='./data', train=False,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=2)
class Controller(nn.Module):
def __init__(self, search_space,
hidden_size=64, max_layer=4, device=''):
super(Controller, self).__init__()
self.search_space = search_space
self.DEVICE = device
self.hidden_size = hidden_size
self.length_search = len(search_space) # num_steps = max_layer * length_search_space
self.list_length = [len(space) for space in search_space.values()]
self.max_layer = max_layer
        self.total_layer = torch.randint(1, self.max_layer, (1,)).item()  # ---- added this line (total_layer is sampled once here instead of in forward)
self.lstm = nn.ModuleList()
self.fc = nn.ModuleList()
self.lstm.append(nn.LSTMCell(self.list_length[-1], self.hidden_size).to(self.DEVICE))
for i in range(1, self.length_search):
self.lstm.append(nn.LSTMCell(self.list_length[i - 1], self.hidden_size).to(self.DEVICE))
for i in range(0, self.length_search):
self.fc.append(nn.Linear(self.hidden_size, self.list_length[i]).to(self.DEVICE))
def init_hidden(self):
h_t = torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.DEVICE)
c_t = torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.DEVICE)
return (h_t, c_t)
def forward(self, input):
# self.total_layer = torch.randint(1, self.max_layer, (1,)).item()
outputs = {}
self.hidden = [self.init_hidden() for _ in range(self.length_search)]
for num_layer in range(self.max_layer):
for i, (key, val) in enumerate(self.search_space.items()):
h_t, c_t = self.hidden[i]
h_t, c_t = self.lstm[i](input, (h_t, c_t))
self.hidden[i] = (h_t, c_t)
output = self.fc[i](h_t)
# print(output)
input = output
if key not in outputs.keys():
outputs[key] = [output]
else:
outputs[key].extend([output])
            # print(outputs)
# for _ in range(self.length_search):
# h_t, c_t = self.hidden[i]
# h_t.detach_()
# c_t.detach_()
# self.hidden[i] = (h_t, c_t)
for i, (key, val) in enumerate(outputs.items()):
outputs[key] = torch.stack(outputs[key]).squeeze(1)
return outputs
# 0: nn.ReLU, 1: nn.Tanh, 2: nn.Sigmoid
search_space = {
"hidden_units": [8, 16, 32, 64],
"activation": [0, 1, 2]
}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
controller = Controller(search_space, max_layer=4, device=device)
print(f"Total Layer: {controller.total_layer}")
print(f"List Length: {controller.list_length}")
print(controller)
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
outputs = controller(input)
# print(outputs)
class NASModel(nn.Module):
def __init__(self, architectures, input_size, output_size):
super(NASModel, self).__init__()
self.architectures = architectures
self.length_layers = len(self.architectures['hidden_units'])
self.output_size = output_size
layers = []
for layer in range(self.length_layers):
hidden_units = self.architectures['hidden_units'][layer].item()
activation = self.architectures['activation'][layer].item()
# print(activation)
if (activation == 0):
activation = nn.ReLU()
elif (activation == 1):
activation = nn.Tanh()
elif (activation == 2):
activation = nn.Sigmoid()
if layer == 0:
layers.append(nn.Linear(input_size, hidden_units))
layers.append(activation)
else:
layers.append(nn.Linear(self.architectures['hidden_units'][layer - 1].item(),
hidden_units))
layers.append(activation)
layers.append(nn.Linear(self.architectures['hidden_units'][self.length_layers - 1].item(), self.output_size))
layers.append(nn.Softmax(dim=1))
# print(layers)
self.model = nn.Sequential(*layers)
def forward(self, x):
return self.model(x)
from torch.distributions import Categorical
from torch.nn.functional import one_hot, log_softmax, softmax, normalize
architecture = {}
episode_total_log_probs = {}
controller = Controller(search_space, max_layer=4, device=device)
episode_logits = controller(input)
print(f"Number of layers is: {controller.total_layer}")
for key, space in search_space.items():
logits = episode_logits[key]
    action_index = Categorical(logits=logits).sample().unsqueeze(0)  # ---- changed unsqueeze(1) to unsqueeze(0)
# print(action_index)
actions_space = torch.tensor([space] * controller.total_layer).to(device)
action = torch.gather(actions_space, 1, action_index).to(device)
    architecture[key] = action.squeeze(0)  # ---- changed squeeze(1) to squeeze(0)
# print(action_index.int().squeeze(1))
mask = one_hot(action_index, num_classes=len(space))
episode_log_probs = torch.sum(mask.float() * log_softmax(logits, dim=1), dim=1)
episode_total_log_probs[key] = episode_log_probs
print(architecture)
print(episode_total_log_probs)
model = NASModel(architecture, 784, 10)
print(model)
from torch.distributions import Categorical
from torch.nn.functional import one_hot, log_softmax, softmax, normalize
import torch.optim as optim
import tqdm
def play_episode(controller):
architecture = {}
episode_total_log_probs = {}
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
# print(controller)
episode_logits = controller(input)
for key, space in search_space.items():
logits = episode_logits[key]
        action_index = Categorical(logits=logits).sample().unsqueeze(0)  # ---- changed unsqueeze(1) to unsqueeze(0)
actions_space = torch.tensor([space] * controller.total_layer).to(device)
action = torch.gather(actions_space, 1, action_index).to(device)
        architecture[key] = action.squeeze(0)  # ---- changed squeeze(1) to squeeze(0)
# print(action_index.int().squeeze(1))
mask = one_hot(action_index, num_classes=len(space))
episode_log_probs = torch.sum(mask.float() * log_softmax(logits, dim=1), dim=1)
episode_total_log_probs[key] = episode_log_probs
model = NASModel(architecture, 784, 10).to(device)
print(f'{model}\n')
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
for epoch in range(10):
model.train()
running_loss = 0.0
for i, data in enumerate(trainloader):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
inputs = inputs.view(-1, 784)
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
running_loss /= len(trainloader)
print(f"Epoch {epoch + 1}: Loss = {running_loss}")
model.eval()
correct = 0
total = 0
with torch.no_grad():
        for data in testloader:
images, labels = data
images, labels = images.to(device), labels.to(device)
outputs = model(images.view(-1, 784))
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
acc = 100 * correct / total
print('Accuracy of the network on the 10000 test images: {}'.format(acc))
# compute the reward
reward = acc
reward = torch.tensor(reward, device=device).detach()
sum_weighted_log_probs = {}
sum_weighted_log_probs['hidden_units'] = torch.sum(-episode_total_log_probs['hidden_units'] * reward).unsqueeze(0)
sum_weighted_log_probs['activation'] = torch.sum(-episode_total_log_probs['activation'] * reward).unsqueeze(0)
sum_weighted_loss = sum_weighted_log_probs['hidden_units'] + \
sum_weighted_log_probs['activation']
return sum_weighted_loss, episode_total_log_probs, reward
controller = Controller(search_space, max_layer = 4, device = device)
print(controller)
optimizer = optim.Adam(controller.parameters(), lr = 0.001)
total_rewards = []
controller.train()
for epoch in range(10):
optimizer.zero_grad()
epoch_log_probs = torch.empty((0,), device = device)
for i in range(3):
(sum_weighted_loss, episode_logits,
reward) = play_episode(controller)
print(sum_weighted_loss)
epoch_log_probs = torch.cat((epoch_log_probs, sum_weighted_loss))
loss = torch.mean(epoch_log_probs)
loss.backward()
optimizer.step()
# for name, param in controller.named_parameters():
# print(name, param.grad)
print(f"Loss in {epoch} is: {loss}")
Code walkthrough:
# Date: 2024/1/23 21:34
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
from torch.distributions import Categorical
from torch.nn.functional import one_hot, log_softmax, softmax, normalize
import torch.optim as optim
import tqdm
class Params:
NUM_EPOCHS = 50
ALPHA = 0.005
BATCH_SIZE = 64
HIDDEN_SIZE = 64 # Number of Hidden Units in Controller
BETA = 0.1 # The entropy bonus multiplier
INPUT_SIZE = 3
ACTION_SPACE = 2
NUM_STEPS = 4
GAMMA = 0.99
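# Note: these constants mirror the original repo's hyperparameters, but in this script the values are
# hard-coded where they are used (batch_size=64, lr=0.005/0.001, hidden_size=64, range(10) epochs);
# BETA, GAMMA, NUM_STEPS and ACTION_SPACE are not referenced below.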
# Set up the data transforms
# Compose() chains several transforms; its argument is a list of transform objects
# ToTensor() converts the image to a tensor (invoked automatically via __call__)
# Normalize() takes a mean and std per channel: ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))]
)
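# A minimal worked example of the Normalize step (assuming a single-channel MNIST pixel):
# ToTensor() maps raw pixels to [0, 1]; Normalize((0.5,), (0.5,)) then computes (x - 0.5) / 0.5,
# so 0.0 -> -1.0, 0.5 -> 0.0, 1.0 -> 1.0, i.e. pixel values end up in [-1, 1].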
# Prepare the datasets
# train=True returns the training set, train=False the test set; download=True downloads the data if needed
train_set = torchvision.datasets.MNIST(root='./data', train=True,
download=True, transform=transform)
test_set = torchvision.datasets.MNIST(root='./data', train=False,
download=True, transform=transform)
# Wrap the datasets in DataLoaders
# dataset: the dataset to load; batch_size: samples per batch; shuffle=True reshuffles the data every epoch, shuffle=False keeps the order fixed
# drop_last (default False): if True, the last incomplete batch is dropped
# num_workers: number of subprocesses used for loading; 0 means data is loaded in the main process (default: 0)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=64, shuffle=False)
# Define the controller: an LSTM network
class Controller(nn.Module):
def __init__(self, search_space,
hidden_size=64, max_layer=4, device=''):
super(Controller, self).__init__()
        self.search_space = search_space  # the search space: candidate hidden-unit counts and activation types
        self.DEVICE = device  # run on GPU or CPU
        self.hidden_size = hidden_size  # number of hidden units in the controller
        # length_search is the size of the search space: 2  (num_steps = max_layer * length_search_space = 4 * 2)
        self.length_search = len(search_space)  # num_steps = max_layer * length_search_space
        self.list_length = [len(space) for space in search_space.values()]  # number of choices per category: [hidden-unit choices, activation choices] = [4, 3]
        self.max_layer = max_layer  # maximum number of layers: 4
        # total_layer: a random integer in [1, max_layer) (4 excluded), drawn as a 1-element tensor and read out with item(), giving an int
        self.total_layer = torch.randint(1, self.max_layer, (1,)).item()
        # Hold the submodules in ModuleLists
self.lstm = nn.ModuleList()
self.fc = nn.ModuleList()
        # Add the first LSTM cell: input_size = number of expected input features (here 3, the number of activation choices), hidden_size = number of features in the hidden state h
self.lstm.append(nn.LSTMCell(self.list_length[-1], self.hidden_size).to(self.DEVICE))
        # Add the remaining LSTM cells; self.lstm ends up with two cells, the first taking the number of activation choices as input features and the second the number of hidden-unit choices
for i in range(1, self.length_search):
self.lstm.append(nn.LSTMCell(self.list_length[i - 1], self.hidden_size).to(self.DEVICE))
        # Add self.length_search (2) fully connected layers
for i in range(0, self.length_search):
            # nn.Linear(in_features, out_features): applies a linear transformation to the incoming data
self.fc.append(nn.Linear(self.hidden_size, self.list_length[i]).to(self.DEVICE))
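        # A sketch of the resulting modules for this search space (hidden_units: 4 choices, activation: 3 choices) and hidden_size=64:
        #   self.lstm = [LSTMCell(3, 64), LSTMCell(4, 64)]
        #   self.fc   = [Linear(64, 4),   Linear(64, 3)]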
    # init_hidden initializes the hidden state (h_t) and cell state (c_t), the LSTM's internal states
    # used to track the sequence; both have shape (1, self.hidden_size) and are initialized to zeros.
def init_hidden(self):
h_t = torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.DEVICE)
c_t = torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.DEVICE)
return h_t, c_t
    # Forward pass
def forward(self, input):
        # outputs stores, for each category in the search space, the sequence of logits produced by the LSTM and fully connected layers.
outputs = {}
        # Initialize self.length_search hidden states.
self.hidden = [self.init_hidden() for _ in range(self.length_search)]
        # For every layer of the (maximum) architecture
for num_layer in range(self.max_layer):
            # Iterate over the entries of self.search_space in order, named (key, val) here; fetch the hidden state (h_t, c_t) for index i and run the corresponding LSTM cell.
for i, (key, val) in enumerate(self.search_space.items()):
h_t, c_t = self.hidden[i]
                # The LSTM cell takes input and the current hidden state (h_t, c_t); its output is the new hidden and cell state, stored back at index i.
h_t, c_t = self.lstm[i](input, (h_t, c_t))
self.hidden[i] = (h_t, c_t)
                # The new hidden state h_t goes through the fully connected layer self.fc[i] to produce output, which becomes the input of the next LSTM cell.
output = self.fc[i](h_t)
# print(output)
input = output
                # Append the fully connected output to outputs under key; create the entry if it does not exist yet.
if key not in outputs.keys():
outputs[key] = [output]
else:
outputs[key].extend([output])
            # print(outputs)
# for _ in range(self.length_search):
# h_t, c_t = self.hidden[i]
# h_t.detach_()
# c_t.detach_()
# self.hidden[i] = (h_t, c_t)
        # Tidy up outputs: stack the list of tensors for each key and squeeze the singleton dimension, so each key maps to a single, easy-to-handle tensor.
for i, (key, val) in enumerate(outputs.items()):
outputs[key] = torch.stack(outputs[key]).squeeze(1)
return outputs
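# A sketch of the forward output for max_layer=4 with this search space (assuming the (1, 3) dummy input used below):
# outputs['hidden_units'] has shape (4, 4) (one row of 4 logits per layer) and
# outputs['activation'] has shape (4, 3) (one row of 3 logits per layer).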
# The searched network: builds a DNN model from a sampled architecture
class NASModel(nn.Module):
def __init__(self, architectures, input_size, output_size):
super(NASModel, self).__init__()
        self.architectures = architectures  # the search-space categories and the action chosen for each, e.g.
# {'hidden_units': tensor([32, 64, 64, 8], device='cuda:0'),
# 'activation': tensor([1, 2, 0, 1], device='cuda:0')}
        self.length_layers = len(self.architectures['hidden_units'])  # number of entries in hidden_units, i.e. the number of hidden layers
        self.output_size = output_size  # input size is 784, output size is 10
layers = []
        # For each hidden layer, look up its number of units and its activation function
for layer in range(self.length_layers):
hidden_units = self.architectures['hidden_units'][layer].item()
activation = self.architectures['activation'][layer].item()
# print(activation)
if activation == 0:
activation = nn.ReLU()
elif activation == 1:
activation = nn.Tanh()
elif activation == 2:
activation = nn.Sigmoid()
            # First layer: in_features = input_size, out_features = number of units of the first hidden layer
if layer == 0:
layers.append(nn.Linear(input_size, hidden_units))
layers.append(activation)
            # Other layers: in_features = number of units of the previous hidden layer, out_features = number of units of the current layer
else:
layers.append(nn.Linear(self.architectures['hidden_units'][layer - 1].item(),
hidden_units))
layers.append(activation)
        # Final linear layer: in_features = number of units of the last hidden layer, out_features = output_size
layers.append(nn.Linear(self.architectures['hidden_units'][self.length_layers - 1].item(), self.output_size))
        # Add a Softmax layer, which maps the multi-class outputs to a probability distribution: values in [0, 1] that sum to 1
layers.append(nn.Softmax(dim=1))
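        # Note: nn.CrossEntropyLoss (used below) already applies log-softmax internally, so feeding it
        # softmax outputs is unusual; it is kept here only to match the original code.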
# print(layers)
        # Assemble the layers with nn.Sequential
self.model = nn.Sequential(*layers)
def forward(self, x):
        # Run the input through the assembled model and return the output
return self.model(x)
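# A minimal usage sketch (values are illustrative): with
#   architecture = {'hidden_units': torch.tensor([32, 64]), 'activation': torch.tensor([0, 2])}
# NASModel(architecture, 784, 10) builds roughly
#   Sequential(Linear(784, 32), ReLU(), Linear(32, 64), Sigmoid(), Linear(64, 10), Softmax(dim=1))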
def play_episode(controller):
    # Dictionaries for the search results: the chosen numbers of hidden units and activation types
    architecture = {}  # the action chosen per category, e.g. {'hidden_units': tensor([64, 8, 8, 64]), 'activation': tensor([0, 0, 1, 1])}
    # The log-probability of each chosen action is computed and stored in episode_total_log_probs, e.g.
    episode_total_log_probs = {}
    # {'hidden_units': tensor([[-2.5390, 0.0000, 0.0000, -3.0257]], grad_fn=),
    #  'activation': tensor([[-2.1069, -2.2470, 0.0000]], grad_fn=)}
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
    # The controller's output is a dict; each key-value pair holds the logits produced for one search-space
    # category by the LSTM and fully connected layers, i.e. the distribution over the choices at every step.
episode_logits = controller(input)
# episode_logits: {'hidden_units': tensor([[ 0.0253, -0.0500, -0.0533, -0.0923],
# [-0.0050, -0.0620, -0.0510, -0.0730],
# [-0.0128, -0.0644, -0.0466, -0.0669],
# [-0.0160, -0.0668, -0.0416, -0.0650]], grad_fn=),
# 'activation': tensor([[-0.0373, 0.0469, -0.0961],
# [-0.0274, 0.0363, -0.0952],
# [-0.0214, 0.0308, -0.0930],
# [-0.0175, 0.0282, -0.0909]], grad_fn=)}
    # For every element of the search space
    # (number of hidden units, type of activation)
for key, space in search_space.items():
logits = episode_logits[key]
        # Sample one action from the corresponding distribution; Categorical is parameterized by probs or logits (but not both).
action_index = Categorical(logits=logits).sample().unsqueeze(0)
# action_index: tensor([[0, 0, 3, 0]])
        # Build an action space for each layer, whose values are the candidate values for this search-space key
actions_space = torch.tensor([space] * controller.total_layer).to(device)
# actions_space: tensor([[ 8, 16, 32, 64],
# [ 8, 16, 32, 64],
# [ 8, 16, 32, 64]])
        # Gather the chosen actions from the action space using action_index; the first pass yields the hidden-unit actions, the second the activation actions
action = torch.gather(actions_space, 1, action_index).to(device)
        # action: e.g. tensor([[ 8,  8, 64,  8]]) on the first iteration and tensor([[0, 0, 0, 2]]) on the second
        # Each action is stored in the architecture dict
architecture[key] = action.squeeze(0)
# print(action_index.int().squeeze(1))
        # Use one_hot and log_softmax to compute the one-hot mask and the log-probability of each chosen action.
mask = one_hot(action_index, num_classes=len(space))
episode_log_probs = torch.sum(mask.float() * log_softmax(logits, dim=1), dim=1)
# episode_log_probs: tensor([[-3.3254, 0.0000, -1.1439]], grad_fn=)
        # The log-probability of each action is stored in episode_total_log_probs.
episode_total_log_probs[key] = episode_log_probs
# {'hidden_units': tensor([[-4.1869, 0.0000, 0.0000, -1.3777]], grad_fn=),
# 'activation': tensor([[-3.3254, 0.0000, -1.1439]], grad_fn=)}
    # Build the DNN from the sampled architecture
model = NASModel(architecture, 784, 10).to(device)
print(f'{model}\n')
    # Loss function: CrossEntropyLoss
criterion = nn.CrossEntropyLoss()
    # To use torch.optim you construct an optimizer object, which holds the current state and updates the parameters from the computed gradients.
    # An Optimizer takes an iterable of the parameters to optimize, plus optimizer-specific options such as learning rate and weight decay.
    # Optimizer: SGD
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
    # Train the DNN
for epoch in range(10):
model.train()
running_loss = 0.0
for i, data in enumerate(train_loader):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
            # Zero the gradients of all optimized parameters before each weight update
optimizer.zero_grad()
# forward + backward + optimize
inputs = inputs.view(-1, 784)
outputs = model(inputs)
            loss = criterion(outputs, labels)  # compute the loss
            loss.backward()  # backpropagate
            optimizer.step()  # update the parameters with the optimizer
# print statistics
running_loss += loss.item()
running_loss /= len(train_loader)
print(f"Epoch {epoch + 1}: Loss = {running_loss}")
    model.eval()  # switch to evaluation mode and start the test pass
    # Use the loss or accuracy on held-out data to judge how well the searched model trained
    # Note: a validation set is not the same as a test set; a validation set is used during training to prevent overfitting
    # and to tune hyperparameters (like practice exams), while the test set is used only after training is finished (the final exam)
    correct = 0  # number of correctly classified test images
    total = 0  # total number of test images
    # We only evaluate here and do not need gradients, hence the following context manager
with torch.no_grad():
for data in test_loader:
images, labels = data
images, labels = images.to(device), labels.to(device)
outputs = model(images.view(-1, 784))
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
acc = 100 * correct / total
print('Accuracy of the network on the 10000 test images: {}'.format(acc))
# compute the reward
reward = acc
reward = torch.tensor(reward, device=device).detach()
sum_weighted_log_probs = {}
sum_weighted_log_probs['hidden_units'] = torch.sum(-episode_total_log_probs['hidden_units'] * reward).unsqueeze(0)
sum_weighted_log_probs['activation'] = torch.sum(-episode_total_log_probs['activation'] * reward).unsqueeze(0)
sum_weighted_loss = sum_weighted_log_probs['hidden_units'] + \
sum_weighted_log_probs['activation']
return sum_weighted_loss, episode_total_log_probs, reward
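# In other words, play_episode returns the (negated) REINFORCE objective for one sampled architecture:
#   loss = -(sum of log-probs of the sampled hidden_units and activation actions) * reward,
# where the reward is the test accuracy of the trained child network. Minimizing this loss pushes the
# controller to assign higher probability to architectures that achieved high accuracy.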
# 0: nn.ReLU, 1: nn.Tanh, 2: nn.Sigmoid
search_space = {
"hidden_units": [8, 16, 32, 64],
"activation": [0, 1, 2]
}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
controller = Controller(search_space, max_layer=4, device=device)
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
outputs = controller(input)
optimizer = optim.Adam(controller.parameters(), lr=0.001)
total_rewards = []
controller.train()
for epoch in range(10):
optimizer.zero_grad()
epoch_log_probs = torch.empty((0,), device=device)
for i in range(3):
(sum_weighted_loss, episode_logits,
reward) = play_episode(controller)
print(sum_weighted_loss)
epoch_log_probs = torch.cat((epoch_log_probs, sum_weighted_loss))
loss = torch.mean(epoch_log_probs)
loss.backward()
optimizer.step()
# for name, param in controller.named_parameters():
# print(name, param.grad)
print(f"Loss in {epoch} is: {loss}")