Full code:
https://github.com/ColinFred/Reinforce_Learning_Pytorch/tree/main/RL/DQN
List the available environments:
from gym import envs
print(envs.registry.all())
ValuesView(├──CartPole: [ v0, v1 ]
├──MountainCar: [ v0 ]
├──MountainCarContinuous: [ v0 ]
├──Pendulum: [ v1 ]
├──Acrobot: [ v1 ]
├──LunarLander: [ v2 ]
├──LunarLanderContinuous: [ v2 ]
├──BipedalWalker: [ v3 ]
├──BipedalWalkerHardcore: [ v3 ]
├──CarRacing: [ v1 ]
├──Blackjack: [ v1 ]
├──FrozenLake: [ v1 ]
├──FrozenLake8x8: [ v1 ]
├──CliffWalking: [ v0 ]
├──Taxi: [ v3 ]
├──Reacher: [ v2 ]
├──Pusher: [ v2 ]
├──Thrower: [ v2 ]
├──Striker: [ v2 ]
├──InvertedPendulum: [ v2 ]
As before, we use the CartPole-v1 environment, but rewrite the reward so that it reflects how close the cart is to the centre of the track and how upright the pole is.
import time
import gym

env = gym.make('CartPole-v1')

# print(env.action_space)           # the action space
# print(env.observation_space)      # the state (observation) space
# print(env.observation_space.high)
# print(env.observation_space.low)

NUM_ACTIONS = env.action_space.n
NUM_STATES = env.observation_space.shape[0]
ENV_A_SHAPE = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample().shape

RL = DQN(n_action=NUM_ACTIONS, n_state=NUM_STATES, learning_rate=0.01)  # choose the algorithm

total_steps = 0
for episode in range(1000):
    state, info = env.reset(return_info=True)
    ep_r = 0
    while True:
        env.render()                                   # update the rendering
        action = RL.choose_action(state)               # choose an action
        state_, reward, done, info = env.step(action)  # take the action, observe next state and reward

        x, x_dot, theta, theta_dot = state_            # reshape the reward
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2                               # consider both cart position and pole angle

        RL.store_transition(state, action, reward, state_)  # store the transition
        RL.learn()                                     # learn from a sampled minibatch
        ep_r += reward

        if total_steps % C == 0:                       # every C steps, sync the target network
            RL.update_target_network()

        if done:
            print('episode: ', episode,
                  'ep_r: ', round(ep_r, 2))
            break

        state = state_
        total_steps += 1
        time.sleep(0.05)
The code:
1. Experience replay
Play the game for a while and store the transitions in memory; then randomly sample a batch (batch_memory) from memory for mini-batch training.
from collections import namedtuple
from random import sample

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))  # one stored transition


class ReplayMemory(object):  # a fixed-size buffer of transitions
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def Push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity  # overwrite the oldest entry when full

    def Sample(self, batch_size):
        return sample(self.memory, batch_size)  # uniform random minibatch

    def __len__(self):
        return len(self.memory)
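A quick usage check of the buffer (a minimal sketch, not part of the original repo; the tensor shapes mimic what store_transition pushes later):

import torch

memory = ReplayMemory(capacity=100)
for i in range(5):
    s = torch.FloatTensor([[0.0, 0.0, 0.0, 0.0]])    # shape (1, 4), like a CartPole state
    a = torch.LongTensor([1])
    s_ = torch.FloatTensor([[0.1, 0.0, 0.0, 0.0]])
    r = torch.FloatTensor([1.0])
    memory.Push(s, a, s_, r)

batch = memory.Sample(3)                 # a list of 3 Transition namedtuples
print(len(memory), batch[0].reward)      # 5  tensor([1.])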
2. The DNN network
The input dimension is the number of state variables; the output is one Q-value Q(s, a) per action for the given state s. The initial weights are drawn from a normal distribution with mean 0 and standard deviation 0.1.
import torch
import torch.nn as nn
import torch.nn.functional as F


class DNN(nn.Module):  # the Q-network used by DQN and Double DQN
    def __init__(self, n_state, n_action):  # fully-connected hidden layers
        super(DNN, self).__init__()
        self.input_layer = nn.Linear(n_state, 64)
        self.input_layer.weight.data.normal_(0, 0.1)
        self.middle_layer = nn.Linear(64, 32)
        self.middle_layer.weight.data.normal_(0, 0.1)
        self.middle_layer_2 = nn.Linear(32, 32)
        self.middle_layer_2.weight.data.normal_(0, 0.1)
        self.adv_layer = nn.Linear(32, n_action)
        self.adv_layer.weight.data.normal_(0, 0.1)

    def forward(self, state):
        x = F.relu(self.input_layer(state))
        x = F.relu(self.middle_layer(x))
        x = F.relu(self.middle_layer_2(x))
        out = self.adv_layer(x)  # one Q-value per action
        return out
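A quick shape check of the network (a sketch; n_state=4 and n_action=2 are the CartPole-v1 dimensions):

import torch

net = DNN(n_state=4, n_action=2)
single = torch.randn(4)          # one state, as used in choose_action
batch = torch.randn(32, 4)       # a batch of states, as used in learn()
print(net(single).shape)         # torch.Size([2])    -> one Q-value per action
print(net(batch).shape)          # torch.Size([32, 2])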
3. DQN. The hardest part is the learn() function, which assumes some familiarity with PyTorch tensor operations (see the small gather example after the class).
import numpy as np
import torch.optim as optim
from random import randrange


class DQN:
    def __init__(self, n_action, n_state, learning_rate):
        self.n_action = n_action
        self.n_state = n_state
        self.memory = ReplayMemory(capacity=MEMORY_CAPACITY)
        self.memory_counter = 0
        self.model_policy = DNN(self.n_state, self.n_action)   # network being trained
        self.model_target = DNN(self.n_state, self.n_action)   # periodically synced target network
        self.model_target.load_state_dict(self.model_policy.state_dict())
        self.model_target.eval()
        self.optimizer = optim.Adam(self.model_policy.parameters(), lr=learning_rate)

    def store_transition(self, s, a, r, s_):
        state = torch.FloatTensor([s])
        action = torch.LongTensor([a])
        reward = torch.FloatTensor([r])
        next_state = torch.FloatTensor([s_])
        self.memory.Push(state, action, next_state, reward)

    def choose_action(self, state):
        state = torch.FloatTensor(state)
        if np.random.rand() <= EPISILO:  # exploit with probability EPISILO (epsilon-greedy)
            with torch.no_grad():
                q_value = self.model_policy(state)
            action = q_value.max(0)[1].view(1, 1).item()
        else:                            # explore: random action
            action = torch.tensor([randrange(self.n_action)], dtype=torch.long).item()
        return action

    def learn(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.Sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))          # transpose the batch of Transitions
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).unsqueeze(1)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)

        # Q(s, a) of the action actually taken in each transition
        state_action_values = self.model_policy(state_batch).gather(1, action_batch)
        # target: r + gamma * max_a' Q_target(s', a')
        next_action_batch = torch.unsqueeze(self.model_target(next_state_batch).max(1)[1], 1)
        next_state_values = self.model_target(next_state_batch).gather(1, next_action_batch)
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch.unsqueeze(1)

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model_policy.parameters():
            param.grad.data.clamp_(-1, 1)               # gradient clipping
        self.optimizer.step()

    def update_target_network(self):
        self.model_target.load_state_dict(self.model_policy.state_dict())
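The least obvious call in learn() is gather, which picks out Q(s, a) for the action actually stored in each transition. A small standalone illustration with made-up numbers:

import torch

q_all = torch.tensor([[1.0, 2.0],      # Q-values for 3 states, 2 actions each
                      [3.0, 4.0],
                      [5.0, 6.0]])
actions = torch.tensor([[1], [0], [1]])   # action taken in each transition
print(q_all.gather(1, actions))           # tensor([[2.], [3.], [6.]])
print(q_all.max(1)[1])                    # greedy actions per state: tensor([1, 0, 1])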
DDQN reduces over-estimation by decoupling two steps: selecting the action for the target Q-value, and evaluating that target Q-value. Instead of taking the maximum Q-value over actions directly in the target network, we first find the action with the largest Q-value according to the current (policy) network, and then use that action to read the target Q-value from the target network. In code, this amounts to changing the line in learn() that selects the next action,
next_action_batch = torch.unsqueeze(self.model_target(next_state_batch).max(1)[1], 1)
to
next_action_batch = torch.unsqueeze(self.model_policy(next_state_batch).max(1)[1], 1)
A very simple change.
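To make the decoupling explicit, here is a sketch of the two target computations side by side; the helper functions and their names are illustrative, not part of the original code:

import torch

def vanilla_dqn_targets(model_target, next_state_batch, reward_batch, gamma):
    # Vanilla DQN: the target net both selects and evaluates the next action.
    with torch.no_grad():
        next_values = model_target(next_state_batch).max(1)[0].unsqueeze(1)
    return next_values * gamma + reward_batch.unsqueeze(1)

def double_dqn_targets(model_policy, model_target, next_state_batch, reward_batch, gamma):
    # Double DQN: the policy net SELECTS the action, the target net EVALUATES it.
    with torch.no_grad():
        next_actions = model_policy(next_state_batch).max(1)[1].unsqueeze(1)   # selection
        next_values = model_target(next_state_batch).gather(1, next_actions)   # evaluation
    return next_values * gamma + reward_batch.unsqueeze(1)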
Dueling DQN starts from the observation that Q(s, a) depends on both the state and the action, but the two dependencies are not equally strong. We want Q(s, a) to reflect two kinds of differences:
1. For a given state s, it should clearly separate the effect of choosing different actions.
2. Across different states, it should clearly separate how good the states themselves are.
So we split Q into V and A, with Q = V + A.
V evaluates the current state and is a single scalar; A evaluates the actions and has the same dimension as Q.
That is, we change the DNN network to:
class DNN(nn.Module):  # D3QN: the dueling architecture (combined with the Double DQN target)
    def __init__(self, n_state, n_action):  # fully-connected hidden layers
        super(DNN, self).__init__()
        self.input_layer = nn.Linear(n_state, 64)
        self.input_layer.weight.data.normal_(0, 0.1)
        self.middle_layer = nn.Linear(64, 32)
        self.middle_layer.weight.data.normal_(0, 0.1)
        self.middle_layer_2 = nn.Linear(32, 32)
        self.middle_layer_2.weight.data.normal_(0, 0.1)
        self.adv_layer = nn.Linear(32, n_action)      # advantage head A(s, a)
        self.adv_layer.weight.data.normal_(0, 0.1)
        self.value_layer = nn.Linear(32, 1)           # state-value head V(s)
        self.value_layer.weight.data.normal_(0, 0.1)

    def forward(self, state):
        x = F.relu(self.input_layer(state))
        x = F.relu(self.middle_layer(x))
        x = F.relu(self.middle_layer_2(x))
        value = self.value_layer(x)
        adv = self.adv_layer(x)
        # centre the advantages per state so that the V/A split is unique
        out = value + adv - adv.mean(dim=-1, keepdim=True)
        return out
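The dueling head relies on broadcasting: value has shape (batch, 1) and adv has shape (batch, n_action), so their sum still yields one Q-value per action. A quick shape check (a sketch, assuming the CartPole-v1 dimensions):

import torch

net = DNN(n_state=4, n_action=2)         # the dueling version defined above
states = torch.randn(32, 4)
q = net(states)
print(q.shape)                           # torch.Size([32, 2]): V is broadcast over the actions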
Q = V + A
In practice the advantages are centred, i.e. the mean of A is subtracted:
Q = V + A - mean(A)
Without this, the decomposition is not unique. If Q = [11, 12, 13, 14, 15], there are infinitely many valid splits, e.g. V = 10 with A = [1, 2, 3, 4, 5], or V = 9 with A = [2, 3, 4, 5, 6], and so on.
After subtracting the mean, V and A are unique: V = 13 and A = [-2, -1, 0, 1, 2].
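A tiny numeric check of this argument: shifting a constant between V and A leaves Q unchanged, but after centring A both splits collapse to the same unique pair.

import torch

q = torch.tensor([11., 12., 13., 14., 15.])

# Two different (V, A) pairs that produce the same Q:
v1, a1 = 10.0, torch.tensor([1., 2., 3., 4., 5.])
v2, a2 = 9.0,  torch.tensor([2., 3., 4., 5., 6.])
print(torch.allclose(v1 + a1, q), torch.allclose(v2 + a2, q))   # True True

# After centring the advantages, both collapse to the same unique pair:
print(v1 + a1.mean(), a1 - a1.mean())   # tensor(13.)  tensor([-2., -1., 0., 1., 2.])
print(v2 + a2.mean(), a2 - a2.mean())   # tensor(13.)  tensor([-2., -1., 0., 1., 2.])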
Tuning the hyperparameters
MEMORY_CAPACITY = 1000
C = 50
BATCH_SIZE = 32
LR = 0.01
GAMMA = 0.90
EPISILO = 0.9
TEST_EPISODE = 30
Run the agent with DQN, Double DQN, and Dueling DQN respectively to get a direct comparison of their performance.
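A minimal sketch of such a comparison: run TEST_EPISODE episodes with each trained agent and report the average return. The evaluate helper and the agent variable names are illustrative, not part of the original code.

def evaluate(agent, env, n_episodes=TEST_EPISODE):
    # Run n_episodes full episodes with the given agent and return the mean episode reward.
    returns = []
    for _ in range(n_episodes):
        state, info = env.reset(return_info=True)
        ep_r, done = 0.0, False
        while not done:
            action = agent.choose_action(state)        # note: still epsilon-greedy here
            state, reward, done, info = env.step(action)
            ep_r += reward
        returns.append(ep_r)
    return sum(returns) / len(returns)

# print('DQN        :', evaluate(dqn_agent, env))
# print('Double DQN :', evaluate(ddqn_agent, env))
# print('Dueling DQN:', evaluate(d3qn_agent, env))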