We previously obtained the action for every State by looking it up in a Q-Table, but when there are too many States we cannot store them all. The alternative is to approximate the Q-function with a function (fitted by a neural network). Concretely there are 2 ways to do this: either feed the network a state-action pair (s, a) and have it output a single Q-value, or feed it only the state s and have it output one Q-value for every action.
We continue with the second approach.
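For reference, this is the objective that the learn() method further down implements: the prediction network Q(s, a; θ) (the eval net) is regressed onto a bootstrapped target computed with a periodically frozen copy of itself (the target net, with parameters θ⁻):

$$
L(\theta) = \mathbb{E}_{(s,a,r,s') \sim \text{memory}} \Big[ \big( r + \gamma \max_{a'} Q(s', a'; \theta^{-}) - Q(s, a; \theta) \big)^{2} \Big]
$$

Here γ is the discount factor (GAMMA in the code below) and θ⁻ is copied from θ every TARGET_REPLACE_ITER steps.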
Experience Replay
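The idea in isolation: instead of learning from each transition immediately (and only once), we store transitions (s, a, r, s') in a fixed-size memory and train on random minibatches drawn from it; this breaks the correlation between consecutive samples and lets old experience be reused. A minimal sketch (the DQN class below implements the same idea with a fixed-size numpy array that overwrites the oldest rows):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity=2000):
        # a deque with maxlen drops the oldest transition automatically once full
        self.buffer = deque(maxlen=capacity)

    def push(self, s, a, r, s_):
        self.buffer.append((s, a, r, s_))

    def sample(self, batch_size=32):
        # uniform random minibatch (sampled without replacement)
        return random.sample(self.buffer, batch_size)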
CartPole is one of the environments in OpenAI Gym.
Note: an action here does not increase the velocity by a fixed (linear) amount.
Reward: a reward of +1 is given for every step taken, including the terminating step.
Starting State: all four observations start as uniform random values in (-0.05, 0.05).
Episode Termination: the episode ends when the pole angle exceeds ±12°, the cart position exceeds ±2.4, or (for CartPole-v0) the episode length reaches 200 steps; unwrapping the env, as we do below, removes the 200-step limit.
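The termination thresholds are also exposed as attributes on the unwrapped environment (the reward shaping at the end of this post reuses them); a quick way to inspect them:

import gym
env = gym.make('CartPole-v0').unwrapped
print(env.action_space)             # Discrete(2): 0 = push left, 1 = push right
print(env.observation_space.shape)  # (4,): cart position, cart velocity, pole angle, pole angular velocity
print(env.x_threshold)              # 2.4, the cart-position limit
print(env.theta_threshold_radians)  # about 0.21 rad (12 degrees), the pole-angle limit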
'''
A quick test of the CartPole environment in gym.
'''
import gym

env = gym.make('CartPole-v0')
env = env.unwrapped
N_ACTIONS = env.action_space.n             # env.action_space is Discrete(2)
N_STATES = env.observation_space.shape[0]  # env.observation_space.shape is (4,)

# dummy_policy
# if the pole tilts left, push the cart left; otherwise push it right
def get_action(s):
    if s[2] < 0:
        return 0
    else:
        return 1

s = env.reset()  # s holds the 4 observation values
for i in range(10):
    s = env.reset()
    print("test", i, "begins")
    reward = 0
    while True:
        env.render()  # draw the current state s
        a = get_action(s)
        s_, r, done, info = env.step(a)
        s = s_
        reward += r
        if done:
            break
    print("test", i, "ends with reward", reward)
This policy tops out at a reward of around 50.
# Both the Eval and the Target networks below use this same Net architecture.
# It has 2 layers and outputs N_ACTIONS values (one Q-value per action).
class Net(nn.Module):
    # declare the layers that make up the network
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 50)
        self.fc1.weight.data.normal_(0, 0.1)  # weight initialization
        self.out = nn.Linear(50, N_ACTIONS)
        self.out.weight.data.normal_(0, 0.1)  # weight initialization

    # define how the layers are wired together
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value
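As a quick sanity check (assuming torch is imported and that the env, N_STATES and N_ACTIONS from the test script above are still defined): feeding one state through the network returns one Q-value per action, and the DQN agent below simply takes the argmax of this vector as its greedy action.

net = Net()
s = env.reset()                                    # old gym API: reset() returns the 4-element observation
q_values = net(torch.FloatTensor(s).unsqueeze(0))  # shape [1, N_ACTIONS]
print(q_values)                                    # one (untrained) Q-value per action
print(int(q_values.argmax(1)))                     # index of the greedy action: 0 or 1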
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym

MAX_ITER = 400             # number of training episodes
TARGET_REPLACE_ITER = 100  # copy eval_net's parameters into target_net every TARGET_REPLACE_ITER learn steps
MEMORY_CAPACITY = 2000     # store up to 2000 transitions
LR = 0.01                  # learning rate
EPSILON = 0.9              # epsilon-greedy: probability of taking the greedy action
GAMMA = 0.9                # discount factor
BATCH_SIZE = 32

# environment-related variables
env = gym.make('CartPole-v0')
env = env.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
# Both the Eval (prediction) and the Target networks use this same Net architecture.
# It has 2 layers and outputs N_ACTIONS values (one Q-value per action).
class Net(nn.Module):
    # declare the layers that make up the network
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 50)
        self.fc1.weight.data.normal_(0, 0.1)  # weight initialization
        self.out = nn.Linear(50, N_ACTIONS)
        self.out.weight.data.normal_(0, 0.1)  # weight initialization

    # define how the layers are wired together
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value
class DQN(object):
    def __init__(self):
        # build the eval net, the target net, and the replay memory
        self.eval_net = Net()
        self.target_net = Net()
        self.learn_step_counter = 0  # copy eval_net's parameters into target_net every TARGET_REPLACE_ITER learn steps
        self.memory_counter = 0      # used to overwrite old memories
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))  # each row is (s, a, r, s'), hence 2*N_STATES + 2 values
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)  # only eval_net is trained
        self.loss_func = nn.MSELoss()

    # epsilon-greedy action selection
    def choose_action(self, x):
        x = torch.unsqueeze(torch.FloatTensor(x), 0)  # shape [N_STATES] -> [1, N_STATES]
        if np.random.uniform() < EPSILON:
            # greedy: pick the action with the largest predicted Q-value
            actions_value = self.eval_net.forward(x)
            action = torch.max(actions_value, 1)[1].data.numpy()[0]
        else:
            # explore: sample uniformly from the (discrete) action space
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))        # concatenate everything into a single row
        index = self.memory_counter % MEMORY_CAPACITY  # overwrite the oldest experience
        self.memory[index, :] = transition
        self.memory_counter += 1

    # learn() is called once per step (i.e. once per chosen action)
    def learn(self):
        # target network update: copy eval_net's weights over every TARGET_REPLACE_ITER steps
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # sample a batch of transitions from the replay memory
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)  # BATCH_SIZE random indices from [0, MEMORY_CAPACITY)
        b_memory = self.memory[sample_index, :]  # each row is [s, a, r, s']
        # split each row back into s, a, r, s'
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, N_STATES + 1:N_STATES + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # q_eval: Q-value of the action actually taken in the experience
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()     # target_net is inference-only; its weights are refreshed only every TARGET_REPLACE_ITER steps
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)  # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()  # gradients in PyTorch accumulate, so clear them first
        loss.backward()             # compute gradients
        self.optimizer.step()
dqn = DQN()
print('\nCollecting experience...')
for i in range(MAX_ITER):  # train for MAX_ITER (= 400) episodes
    print(f"i={i} memory_counter = {dqn.memory_counter}")
    # reset the state at the start of each episode and clear the cumulative reward
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = dqn.choose_action(s)
        s_, r, done, info = env.step(a)

        # Build our own shaped reward out of 2 parts:
        # r1: based on the cart's distance from the centre (the farther off-centre, the smaller)
        # r2: based on the pole's angle (the more it tilts, the smaller)
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        dqn.store_transition(s, a, r, s_)
        ep_r += r  # update the cumulative reward

        if dqn.memory_counter > MEMORY_CAPACITY:
            print("Start Learning")
            dqn.learn()
            # episodes spent purely filling the memory are not reported
            if done:
                print('Ep: ', i, '| Ep_r: ', round(ep_r, 2))

        if done:
            break
        s = s_
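Once training finishes, it is instructive to run the learned policy greedily (no epsilon-exploration, and scored with the environment's own +1-per-step reward rather than the shaped one) and compare it with the ~50 ceiling of the hand-written policy above. A minimal sketch that reuses the dqn and env objects from the training script; because the env is unwrapped there is no built-in 200-step limit, so the episode length is capped manually:

for ep in range(5):
    s = env.reset()
    total = 0
    for t in range(2000):  # manual cap, since the unwrapped env never times out
        env.render()
        with torch.no_grad():
            # always take the action with the highest predicted Q-value
            a = int(dqn.eval_net(torch.FloatTensor(s).unsqueeze(0)).argmax(1))
        s, r, done, info = env.step(a)
        total += r
        if done:
            break
    print("eval", ep, "ends with reward", total)
env.close()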