Copyright notice: this is an original article by CSDN blogger "难受啊!马飞…", licensed under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.
Original link: https://blog.csdn.net/qq_33328642/article/details/123788966
Let's start with the complete code.
The code was provided by a student from Dalian University of Technology. She is meticulous and explains the theory very clearly; her video is on Bilibili. I have to admit I didn't fully follow the code at first.
Enough talk, here is the complete code.
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
env = gym.make('SpaceInvaders-v0').unwrapped
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
######################################################################
# Replay Memory
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))
class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
######################################################################
# DQN algorithm
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)

        def conv2d_size_out(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w, 8, 4), 4, 2), 3, 1)
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h, 8, 4), 4, 2), 3, 1)
        linear_input_size = convw * convh * 64
        self.l1 = nn.Linear(linear_input_size, 512)
        self.l2 = nn.Linear(512, outputs)

    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.l1(x.view(x.size(0), -1)))
        return self.l2(x.view(-1, 512))
######################################################################
# Input extraction
resize = T.Compose([T.ToPILImage(),
                    T.Grayscale(num_output_channels=1),
                    T.Resize((84, 84), interpolation=InterpolationMode.BICUBIC),
                    T.ToTensor()])

def get_screen():
    # Transpose it into torch order (CHW).
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0)
######################################################################
# Training
# Parameters and network initialization
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 10000
TARGET_UPDATE = 10
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape
# Get number of actions from gym action space
n_actions = env.action_space.n
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(100000)
steps_done = 0
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
episode_durations = []
def plot_durations():
    plt.figure(1)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    plt.pause(0.001)  # pause a bit so that plots are updated
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    # Compute the loss (MSE here; the Huber loss would be nn.SmoothL1Loss)
    criterion = nn.MSELoss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def random_start(skip_steps=30, m=4):
    env.reset()
    state_queue = deque([], maxlen=m)
    next_state_queue = deque([], maxlen=m)
    done = False
    for i in range(skip_steps):
        if (i + 1) <= m:
            state_queue.append(get_screen())
        elif m < (i + 1) <= 2 * m:
            next_state_queue.append(get_screen())
        else:
            state_queue.append(next_state_queue[0])
            next_state_queue.append(get_screen())
        action = env.action_space.sample()
        _, _, done, _ = env.step(action)
        if done:
            break
    return done, state_queue, next_state_queue
######################################################################
# Start Training
num_episodes = 10000
m = 4
for i_episode in range(num_episodes):
    # Initialize the environment and state
    done, state_queue, next_state_queue = random_start()
    if done:
        continue
    state = torch.cat(tuple(state_queue), dim=1)
    for t in count():
        reward = 0
        m_reward = 0
        # One action is carried out every m frames
        action = select_action(state)
        for i in range(m):
            _, reward, done, _ = env.step(action.item())
            if not done:
                next_state_queue.append(get_screen())
            else:
                break
            m_reward += reward
        if not done:
            next_state = torch.cat(tuple(next_state_queue), dim=1)
        else:
            next_state = None
            m_reward = -150
        m_reward = torch.tensor([m_reward], device=device)
        memory.push(state, action, next_state, m_reward)
        state = next_state
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
        torch.save(policy_net.state_dict(), 'weights/policy_net_weights_{0}.pth'.format(i_episode))
print('Complete')
env.close()
torch.save(policy_net.state_dict(), 'weights/policy_net_weights.pth')
The code uses namedtuple() to define a Transition, which stores one agent-environment interaction (s, a, r, s_):
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
This namedtuple is very simple.
Here is an example:
Student = namedtuple('Student', ('name', 'gender'))
s = Student('小花', '女')  # assign values to the fields
# The fields can then be accessed in several ways.
# Method 1: by field name
print(s.name)    # 小花
print(s.gender)  # 女
# Method 2: by index
print(s[0])      # 小花
print(s[1])      # 女
# A namedtuple can also be iterated over:
for i in s:
    print(i)
# 小花
# 女
class ReplayMemory(object):
    def __init__(self, capacity):
        # deque is a double-ended list optimized for appends and pops at both ends,
        # suitable for queues and stacks
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        # random.sample draws batch_size random entries from memory
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
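To see how these pieces fit together, here is a minimal usage sketch of my own (not from the original post): push a few dummy transitions into a small ReplayMemory and draw a random mini-batch from it.

# A minimal usage sketch (mine, not from the post): exercise push() and sample()
import random
from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
    def push(self, *args):
        self.memory.append(Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)

memory = ReplayMemory(capacity=10)
for i in range(5):
    memory.push(i, i + 1, i + 2, float(i))  # dummy (state, action, next_state, reward)
print(len(memory))       # 5
print(memory.sample(2))  # two randomly chosen Transition tuples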
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)   # first conv layer (4 stacked frames in)
        self.bn1 = nn.BatchNorm2d(32)                            # batch normalization after the first conv layer
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)  # second conv layer
        self.bn2 = nn.BatchNorm2d(64)                            # batch normalization after the second conv layer
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)  # third conv layer
        self.bn3 = nn.BatchNorm2d(64)                            # batch normalization after the third conv layer

        def conv2d_size_out(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w, 8, 4), 4, 2), 3, 1)  # 84 in -> 7 out (width)
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h, 8, 4), 4, 2), 3, 1)  # 84 in -> 7 out (height)
        linear_input_size = convw * convh * 64  # final feature map is 7*7*64; flattened, that is 3136
        self.l1 = nn.Linear(linear_input_size, 512)  # fully connected layer: 3136 -> 512 (honestly, hard-coding this is a bit crude)
        self.l2 = nn.Linear(512, outputs)            # output layer: one Q-value per action

    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))   # conv block 1 + ReLU
        x = F.relu(self.bn2(self.conv2(x)))   # conv block 2 + ReLU
        x = F.relu(self.bn3(self.conv3(x)))   # conv block 3 + ReLU
        x = F.relu(self.l1(x.view(x.size(0), -1)))  # flatten the conv output into one row per sample
        return self.l2(x.view(-1, 512))       # -1 lets PyTorch infer the batch size; the data always has 512 columns here
This is a standard PyTorch pattern for building a network, which should look familiar; the comments above walk through each layer.
One thing worth checking is how the tensor shapes flow through the network. Using the same DQN class (with the x = x.to(device) line in forward commented out so it can be run standalone on the CPU), we can push a random batch through it:
policy_net = DQN(84, 84, 2)  # the Q-network; 2 output actions, just for this check
x = torch.rand(32, 4, 84, 84)  # a fake batch: 32 samples of 4 stacked 84x84 frames
xout = policy_net(x)
print(xout.size())
# torch.Size([32, 2])
print(xout)
tensor([[ 3.4981e-02, 3.1048e-02],
[ 1.4112e-01, -5.2676e-02],
[-3.3868e-01, 3.9583e-02],
[ 7.5908e-02, -1.2230e-01],
[ 1.4027e-01, -1.7528e-02],
[-1.0966e-02, 6.2111e-02],
[-2.2511e-02, -6.1829e-02],
[ 3.2599e-02, -8.9155e-02],
[ 9.7833e-02, -5.0325e-02],
[-6.4633e-02, -8.8093e-02],
[-4.3771e-02, 1.5452e-01],
[-1.7478e-01, -1.3224e-01],
[ 1.9658e-02, 8.1575e-03],
[-1.6989e-01, -6.6487e-03],
[-1.6566e-01, -1.0833e-01],
[-9.5961e-02, 1.1235e-02],
[ 1.0005e-01, -1.1150e-02],
[ 1.8165e-02, 9.9491e-03],
[-2.3947e-01, 9.7802e-02],
[-5.2116e-02, 4.8583e-02],
[ 2.2504e-02, 3.8262e-04],
[-1.1822e-01, -2.0696e-01],
[-1.4129e-01, -1.9254e-01],
[-2.2170e-01, -1.2232e-01],
[ 3.3542e-02, 3.3005e-03],
[ 1.5150e-01, 1.5330e-01],
[-2.3675e-01, -2.4939e-01],
[-1.0502e-01, 7.2696e-02],
[-1.3213e-01, 1.5113e-01],
[ 6.1988e-02, 2.5367e-02],
[-4.2924e-01, -4.0167e-02],
[ 5.1474e-02, 2.6885e-01]], grad_fn=<AddmmBackward0>)
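As a sanity check of the 3136 figure mentioned in the comments, here is a small worked calculation of my own using the same conv2d_size_out formula: an 84-pixel side shrinks to 20, then 9, then 7, and the flattened feature map is 7 * 7 * 64 = 3136.

# Worked size check (my own sketch) for the conv stack above
def conv2d_size_out(size, kernel_size, stride):
    return (size - (kernel_size - 1) - 1) // stride + 1

w = conv2d_size_out(84, 8, 4)  # (84 - 8) // 4 + 1 = 20
w = conv2d_size_out(w, 4, 2)   # (20 - 4) // 2 + 1 = 9
w = conv2d_size_out(w, 3, 1)   # (9 - 3) // 1 + 1 = 7
print(w, 7 * 7 * 64)           # 7 3136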
resize = T.Compose([T.ToPILImage(),
                    T.Grayscale(num_output_channels=1),
                    T.Resize((84, 84), interpolation=InterpolationMode.BICUBIC),
                    T.ToTensor()])
# Compose chains several transforms together: here ToPILImage, Grayscale, Resize and ToTensor.
# ToPILImage converts a (C,H,W) tensor or an (H,W,C) numpy.ndarray into a PIL.Image without changing the values.
# ToTensor converts the PIL.Image back into a torch.FloatTensor.
def get_screen():
    # Grab the game screen; this becomes the state used as training data.
    # Transpose it into torch order (CHW).
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    # env.render acts as the rendering engine and returns the current frame;
    # transpose moves the channel dimension to the front.
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    # ascontiguousarray copies the array into contiguous memory, which is faster to process.
    screen = torch.from_numpy(screen)  # create a tensor from the numpy.ndarray
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0)  # unsqueeze(0) adds a batch dimension, turning CHW into BCHW
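To see the shapes without launching the game, here is a small sketch of my own that runs the same preprocessing on a fake 210x160x3 frame (a typical Atari frame size, my assumption) instead of a real env.render() call.

# A minimal sketch (mine): verify the preprocessing shapes with a fake frame
import numpy as np
import torch
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode

resize = T.Compose([T.ToPILImage(),
                    T.Grayscale(num_output_channels=1),
                    T.Resize((84, 84), interpolation=InterpolationMode.BICUBIC),
                    T.ToTensor()])

fake_frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)  # HWC, like an Atari frame
screen = fake_frame.transpose((2, 0, 1))                  # to CHW
screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
screen = torch.from_numpy(screen)
state = resize(screen).unsqueeze(0)                       # add the batch dimension -> BCHW
print(state.shape)  # torch.Size([1, 1, 84, 84])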
# Parameters and network initialization
BATCH_SIZE = 32      # number of transitions sampled from the replay memory per update
GAMMA = 0.99         # discount factor
EPS_START = 1.0      # initial value of the epsilon-greedy parameter
EPS_END = 0.1        # minimum value of the epsilon-greedy parameter
EPS_DECAY = 10000    # decay constant controlling how quickly epsilon falls
TARGET_UPDATE = 10   # update the target net every TARGET_UPDATE episodes
init_screen = get_screen()  # grab one frame; shape [1, 1, 84, 84] (batch, channel, height, width)
_, _, screen_height, screen_width = init_screen.shape  # extract the frame height and width
n_actions = env.action_space.n  # number of actions in the game's action space
# Initialize the models
policy_net = DQN(screen_height, screen_width, n_actions).to(device)  # Q (main) network
target_net = DQN(screen_height, screen_width, n_actions).to(device)  # target network
target_net.load_state_dict(policy_net.state_dict())  # at the start, the target net has the same weights as the main net
target_net.eval()  # the target net is not trained directly; it is only used for evaluation
optimizer = optim.RMSprop(policy_net.parameters())  # optimize the policy net with RMSprop
memory = ReplayMemory(100000)  # replay memory with the given capacity
steps_done = 0
There is not much to explain here; it should be easy to follow.
policy_net = DQN(screen_height, screen_width, n_actions).to(device)  # Q (main) network
target_net = DQN(screen_height, screen_width, n_actions).to(device)  # target network
A junior student of mine once asked me what these two lines mean.
They simply instantiate the two models. The constructor takes arguments here because the size of the fully connected layer depends on the input image size and the number of actions.
Normally, when we write a model, initialization is not this involved.
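For contrast, here is a tiny sketch of my own (TinyNet is a made-up example, not part of the post) showing the "normal" case, where the network does not depend on the input size and the constructor needs no arguments.

# A minimal sketch (mine): a network whose layer sizes are fixed needs no constructor arguments
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

net = TinyNet()  # no arguments needed
# DQN, by contrast, must be built as DQN(screen_height, screen_width, n_actions)
# because the size of its first linear layer is computed from h and w.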
# Action-selection function; the key quantity is the exploration/exploitation threshold eps in [0, 1]
def select_action(state):
    global steps_done
    sample = random.random()  # random float in [0, 1)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)  # decays from EPS_START towards EPS_END
    steps_done += 1
    if sample > eps_threshold:  # decide between the greedy action and a random action
        # eps_threshold starts near 1 (so the agent mostly explores with random actions at first)
        # and shrinks over time, so the agent exploits the learned policy more and more
        with torch.no_grad():  # no gradients are needed at inference time
            return policy_net(state).max(1)[1].view(1, 1)  # greedy (best) action
    else:
        # otherwise explore: pick a random action
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
        # random.randrange(N) returns a random integer in [0, N), where N is the number of actions
# A quick plot of how the epsilon threshold decays (illustrative values, not the exact hyperparameters above)
plt.figure(1)
ax = plt.subplot(111)
x = np.linspace(0, 1000, 1000)  # 1000 evenly spaced points between 0 and 1000
print(x)
r1 = []
for i in range(1000):
    r = 0.1 + (0.99 - 0.1) * \
        math.exp(-1. * (i / 1000))
    r1.append(r)
print(r1)
ax.plot(x, r1)
plt.show()
episode_durations = []  # list storing the duration of each training episode

def plot_durations():
    plt.figure(1)
    plt.clf()  # clear the current figure but keep the window open so it can be reused;
               # this way you don't have to close all figures before re-running
    durations_t = torch.tensor(episode_durations, dtype=torch.float)  # convert to a tensor
    plt.title('Training...')   # figure title
    plt.xlabel('Episode')      # x-axis label
    plt.ylabel('Duration')     # y-axis label
    plt.plot(durations_t.numpy())  # plot the raw durations
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    plt.pause(0.001)  # pause a bit so that plots are updated
There is not much else to this function, except perhaps the unfold call used for the 100-episode moving average; a small example is shown below.
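Here is a short sketch of my own showing what durations_t.unfold(0, 100, 1).mean(1) does, scaled down to a window of 3 so the output is easy to read: unfold builds sliding windows along dimension 0, and mean(1) averages each window.

# A small sketch (mine) of unfold + mean as a moving average
import torch

t = torch.arange(6, dtype=torch.float)  # tensor([0., 1., 2., 3., 4., 5.])
windows = t.unfold(0, 3, 1)             # sliding windows of length 3, step 1
print(windows)
# tensor([[0., 1., 2.],
#         [1., 2., 3.],
#         [2., 3., 4.],
#         [3., 4., 5.]])
print(windows.mean(1))                  # tensor([1., 2., 3., 4.])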
def optimize_model():
    if len(memory) < BATCH_SIZE:  # wait until the replay memory holds at least one batch
        return
    transitions = memory.sample(BATCH_SIZE)  # randomly sample BATCH_SIZE transitions from the memory
    batch = Transition(*zip(*transitions))  # zip regroups the fields; the * operators unpack the argument lists
    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                  device=device, dtype=torch.bool)
    # map applies the lambda (s is not None) to every next_state in the batch;
    # tuple() turns the result into an immutable tuple
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)    # concatenate the 32 states into one batch tensor
    action_batch = torch.cat(batch.action)  # concatenate the 32 actions, shape [32, 1]
    reward_batch = torch.cat(batch.reward)  # concatenate the 32 rewards, shape [32]

    # Feed the state batch through the policy net: it outputs one Q-value per action, shape [32, n_actions];
    # gather then picks, for each row, the Q-value of the action that was actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    next_state_values = torch.zeros(BATCH_SIZE, device=device)  # a 32-dimensional tensor of zeros
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # max(1)[0] takes the maximum Q-value per row, i.e. per next state
    expected_state_action_values = reward_batch + (next_state_values * GAMMA)  # the TD target
    # Compute the loss (MSE here; the original comment says "Huber loss", which would be nn.SmoothL1Loss)
    criterion = nn.MSELoss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # clip gradients to [-1, 1]
    optimizer.step()
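One small mismatch worth flagging: the comment in the code says "Huber loss", but nn.MSELoss is what is actually used. If you want the Huber loss that the official PyTorch DQN tutorial uses, the drop-in replacement would be nn.SmoothL1Loss; a minimal sketch of my own:

# A minimal sketch (mine): SmoothL1Loss is PyTorch's Huber-style loss
import torch
import torch.nn as nn

criterion = nn.SmoothL1Loss()   # Huber loss: quadratic near zero, linear far from zero
pred = torch.tensor([[1.0], [2.0]])
target = torch.tensor([[1.5], [0.0]])
print(criterion(pred, target))  # a scalar loss tensor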
Here comes the important part. When I teach, I always say that to really understand a piece of code or an algorithm, you have to trace where the data flows and how the tensor shapes change along the way.
The first two lines that deserve a closer look are the ones below (they also tie into the demo later in this post), and an example makes them much clearer.
transitions = memory.sample(BATCH_SIZE)  # randomly sample BATCH_SIZE transitions from the memory
batch = Transition(*zip(*transitions))   # zip regroups the fields; the * operators unpack the argument lists
The first line draws a random batch of samples from memory (32 by default).
Now for the batch line. Let's work through a concrete example; one look and it is clear.
import torch
import random
from collections import namedtuple, deque

# Create a deque with a maximum length of 100, just like in the code above
memory = deque([], maxlen=100)
# Define the same Transition namedtuple as above
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
# Instantiate a few Transitions
s1 = Transition(2, 3, 4, 5)
s2 = Transition(1, 2, 3, 4)
s3 = Transition(1, 4, 5, 2)
s4 = Transition(2, 5, 7, 3)
# Append them to memory
memory.append(s1)
memory.append(s2)
memory.append(s3)
memory.append(s4)
print(memory)
# The raw memory looks like this:
# deque([Transition(state=2, action=3, next_state=4, reward=5), Transition(state=1, action=2, next_state=3, reward=4), Transition(state=1, action=4, next_state=5, reward=2), Transition(state=2, action=5, next_state=7, reward=3)], maxlen=100)
# Randomly sample a batch of 2
m2 = random.sample(memory, 2)
# The sample looks like this:
# [Transition(state=1, action=4, next_state=5, reward=2), Transition(state=2, action=3, next_state=4, reward=5)]
# Now the interesting line:
batch = Transition(*zip(*m2))
print(batch)
# Transition(state=(1, 2), action=(4, 3), next_state=(5, 4), reward=(2, 5))
# So Transition(*zip(*transitions)) regroups the individual s, a, r, s_ values so that each field is collected together.

# Continuing with the same data, let's follow the next transformation:
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
print(non_final_mask)
# Output: tensor([True, True])
# non_final_mask is a boolean tensor telling us, for each sample, whether its next state is a non-final state.
If you follow the data through these transformations, you can see how the rest of the batch is handled in the same way.
Now take a look at this statement:
state_action_values = policy_net(state_batch).gather(1, action_batch)  # pick one column per row, out of the n_actions columns
Here gather does not mean "gather" in the everyday sense.
It is more like the table lookup in a Q-table: it reads off the Q-value of the chosen action.
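A tiny sketch of my own makes the lookup concrete: for each row of Q-values, gather(1, actions) picks the entry in the column given by the chosen action (the numbers below are made up).

# A small sketch (mine) of gather as a per-row Q-value lookup
import torch

q_values = torch.tensor([[0.1, 0.9],
                         [0.4, 0.2],
                         [0.7, 0.3]])    # 3 states, 2 actions
actions = torch.tensor([[1], [0], [1]])  # action chosen in each state
print(q_values.gather(1, actions))
# tensor([[0.9000],
#         [0.4000],
#         [0.3000]])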
def random_start(skip_steps=30, m=4):
    env.reset()  # reset the environment; after each terminal state the agent has to start over
    state_queue = deque([], maxlen=m)       # current state: m = 4 means a state is a stack of four frames
    next_state_queue = deque([], maxlen=m)  # next state
    done = False  # whether the episode has ended
    for i in range(skip_steps):
        if (i + 1) <= m:
            # for the first m frames, fill the current-state queue
            state_queue.append(get_screen())
        elif m < (i + 1) <= 2 * m:
            # for frames m+1 to 2m, fill the next-state queue
            next_state_queue.append(get_screen())
        else:
            # beyond 2m frames, shift the oldest next-state frame into the current state
            state_queue.append(next_state_queue[0])
            # and keep appending fresh frames to the next state
            next_state_queue.append(get_screen())
            # both containers are deques with maxlen=m, so old frames are dropped automatically
        action = env.action_space.sample()  # sample a random action
        _, _, done, _ = env.step(action)    # step the env: returns next observation, reward, done flag, debug info
        if done:
            break
    return done, state_queue, next_state_queue
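Because both frame containers are deques with maxlen=m, appending a new frame automatically drops the oldest one, which gives exactly the sliding window of frames we want. A tiny sketch of my own:

# A tiny sketch (mine): deque(maxlen=4) behaves as a sliding window
from collections import deque

frames = deque([], maxlen=4)
for i in range(6):
    frames.append(i)
    print(list(frames))
# [0]
# [0, 1]
# [0, 1, 2]
# [0, 1, 2, 3]
# [1, 2, 3, 4]
# [2, 3, 4, 5]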
# Start Training
num_episodes = 10000
m = 4  # each state is a stack of 4 frames
for i_episode in range(num_episodes):  # run 10000 episodes
    # Initialize the environment and state
    done, state_queue, next_state_queue = random_start()
    if done:
        continue
    state = torch.cat(tuple(state_queue), dim=1)  # concatenate the 4 frames along the channel dimension
    for t in count():
        reward = 0
        m_reward = 0
        # One action is carried out every m frames
        action = select_action(state)  # choose an action based on the current state
        for i in range(m):
            _, reward, done, _ = env.step(action.item())  # interact with the environment, get the reward and the done flag
            if not done:  # if this is not a terminal state,
                next_state_queue.append(get_screen())  # grab a frame and add it to the next state
            else:  # if the episode has ended (the game is over), stop stepping
                break
            m_reward += reward  # accumulate the reward over the m frames
        if not done:  # if the episode has not ended,
            next_state = torch.cat(tuple(next_state_queue), dim=1)
        else:  # if it has ended,
            next_state = None  # there is no next state: the agent died,
            m_reward = -150    # so the reward is set to -150 as a penalty
        m_reward = torch.tensor([m_reward], device=device)
        memory.push(state, action, next_state, m_reward)  # push this transition into memory
        state = next_state  # the next state becomes the current state
        optimize_model()  # optimize the model
        if done:  # if the episode ended,
            episode_durations.append(t + 1)  # record how long it lasted
            plot_durations()  # plot the durations
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:  # every TARGET_UPDATE episodes, refresh the target net
        target_net.load_state_dict(policy_net.state_dict())
        torch.save(policy_net.state_dict(), 'weights/policy_net_weights_{0}.pth'.format(i_episode))  # save the model
print('Complete')
env.close()  # close the environment
torch.save(policy_net.state_dict(), 'weights/policy_net_weights.pth')
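After training, you will probably want to watch the agent play with exploration switched off. Here is a rough sketch of my own (not from the original post), assuming the DQN class, get_screen, random_start, env, device, m and the other constants defined above are still available, and that the weights were saved to the path used above.

# A rough evaluation sketch (mine): load the saved weights and act greedily
eval_net = DQN(screen_height, screen_width, n_actions).to(device)
eval_net.load_state_dict(torch.load('weights/policy_net_weights.pth', map_location=device))
eval_net.eval()

done, state_queue, next_state_queue = random_start()
state = torch.cat(tuple(state_queue), dim=1)
while not done:
    with torch.no_grad():
        action = eval_net(state).max(1)[1].view(1, 1)  # greedy action, no epsilon
    for _ in range(m):
        _, _, done, _ = env.step(action.item())
        if done:
            break
        next_state_queue.append(get_screen())
    if not done:
        state = torch.cat(tuple(next_state_queue), dim=1)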
Running the full program just to follow these details can be cumbersome,
so I wrote a small demo of my own to trace the data flow:
import random
import torch
from collections import namedtuple, deque

state_que = deque([], maxlen=4)
memory = deque([], maxlen=100)
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
st1 = torch.rand(2, 2)
st2 = torch.rand(2, 2)
st3 = torch.rand(2, 2)
st4 = torch.rand(2, 2)
a1 = torch.ones(1)
a2 = torch.ones(1)
a3 = torch.ones(1)
a4 = torch.ones(1)
# Mimic get_screen, which produces frames of shape (1, 1, 84, 84); here the "images" are just 2x2
nst1 = torch.rand(2, 2)
nst1 = nst1.unsqueeze(0)
nst1 = nst1.unsqueeze(0)
nst2 = torch.rand(2, 2)
nst2 = nst2.unsqueeze(0)
nst2 = nst2.unsqueeze(0)
nst3 = torch.rand(2, 2)
nst3 = nst3.unsqueeze(0)
nst3 = nst3.unsqueeze(0)
nst4 = torch.rand(2, 2)
nst4 = nst4.unsqueeze(0)
nst4 = nst4.unsqueeze(0)
# Pack the variables into Transitions
s1 = Transition(st1, a1, nst1, 5)
s2 = Transition(st2, a2, nst2, 4)
s3 = Transition(st3, a3, nst3, 2)
s4 = Transition(st4, a4, nst4, 3)
# Add the frames to state_que
state_que.append(nst1)
state_que.append(nst2)
state_que.append(nst3)
state_que.append(nst4)
print('state_que', state_que)
# Convert to a tuple and concatenate along the channel dimension
print('convert to tuple and concatenate')
state = torch.cat(tuple(state_que), dim=1)
print('state', state)
print('statesize', state.size())
memory.append(s1)
memory.append(s2)
memory.append(s3)
memory.append(s4)
# print(memory)
m2 = random.sample(memory, 2)
print('m2', m2)
print()
batch = Transition(*zip(*m2))
print('zip*-----------------------')
print('batch:000', batch.state)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.state)), dtype=torch.bool)
print(non_final_mask)
state_batch = torch.cat(batch.next_state)
print('next_state_batch', state_batch)
print('state_batch_size = ', state_batch.size())
action_batch = torch.cat(batch.action)
print('action_batch', action_batch)