This article mainly discusses DDQN (Double DQN), a variant of DQN whose key change is to decouple the computation of the target value $y_j$.
In the earlier (Nature) DQN, $y_j$ is computed as:
$y_j = R_j + \gamma \max_{a'} Q'(\phi(S'_j), a', w')$
Here the action inside the max is simply whichever action yields the largest Q value when $\phi(S'_j)$ is fed into the target network $Q'$. DDQN makes one change: instead of taking the maximum directly in the target network, it first finds, in the current $Q$ network, the action with the largest Q value, and only then uses the target network to evaluate that action.
Previously (Nature DQN), the action used in the target was effectively
$A'_j = \arg\max_{a'} Q'(\phi(S'_j), a', w')$
Now (DDQN) it becomes
$A'_j = \arg\max_{a'} Q(\phi(S'_j), a', w)$
So the target $y_j$ finally becomes:
$y_j = R_j + \gamma Q'(\phi(S'_j), \arg\max_{a'} Q(\phi(S'_j), a', w), w')$
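As a concrete illustration with made-up numbers: suppose that for $\phi(S'_j)$ the current network gives $Q(\phi(S'_j),a_1,w)=2.0$ and $Q(\phi(S'_j),a_2,w)=1.0$, while the target network gives $Q'(\phi(S'_j),a_1,w')=0.5$ and $Q'(\phi(S'_j),a_2,w')=1.5$. Nature DQN would bootstrap from $\max_{a'}Q'(\phi(S'_j),a',w')=1.5$, whereas DDQN first selects $a_1=\arg\max_{a'}Q(\phi(S'_j),a',w)$ with the current network and then evaluates that action with the target network, giving $Q'(\phi(S'_j),a_1,w')=0.5$ and hence $y_j=R_j+0.5\gamma$. Action selection and action evaluation are thus handled by two different networks.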
The rest of the algorithm flow is essentially unchanged.
Algorithm inputs: number of iterations $T$, state feature dimension $n$, action set $A$, step size $\alpha$, discount factor $\gamma$, exploration rate $\epsilon$, the current $Q$ network structure, the target $Q'$ network structure, and the mini-batch size $m$ for batch gradient descent.
Output: the $Q$ network parameters.
The algorithm is essentially the same as Nature DQN; the only step that changes is the computation of $y_j$.
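As a minimal sketch of that single changed step, the target can also be computed in a vectorized way. This assumes `current_q_net` and `target_q_net` each map a batch of states to one Q value per action, and that `rewards`, `next_states`, and `dones` are batched float tensors; these names are illustrative and do not appear in the full program below, which does the same thing with a per-sample loop inside `train_Q_network`.

import torch

def double_dqn_target(current_q_net, target_q_net, rewards, next_states, dones, gamma=0.9):
    # Double DQN: select the greedy action with the current network,
    # then evaluate that action with the target network.
    with torch.no_grad():
        greedy_actions = current_q_net(next_states).argmax(dim=1, keepdim=True)  # argmax_{a'} Q(phi(S'_j), a', w)
        next_q = target_q_net(next_states).gather(1, greedy_actions).squeeze(1)  # Q'(phi(S'_j), a', w')
    return rewards + gamma * (1.0 - dones) * next_q  # y_j, with no bootstrap term for terminal states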
The following PyTorch code is adapted from the TensorFlow version:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 09:46:42 2019
@author: asus
"""
import gym
import torch
from collections import deque
import torch.nn.functional as F
import numpy as np
import random
GAMMA = 0.9                # discount factor
INITIAL_EPSILON = 0.5      # starting value of epsilon for epsilon-greedy exploration
FINAL_EPSILON = 0.01       # final value of epsilon
REPLAY_SIZE = 10000        # experience replay buffer size
BATCH_SIZE = 32            # size of minibatch

ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 300      # Step limitation in an episode
TEST = 10       # Number of test episodes run every 100 training episodes
class MODEL(torch.nn.Module):
    def __init__(self, env):
        super(MODEL, self).__init__()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # simple two-layer network: state -> 20 hidden units -> one Q value per action
        self.fc1 = torch.nn.Linear(self.state_dim, 20)
        self.fc1.weight.data.normal_(0, 0.6)
        self.fc2 = torch.nn.Linear(20, self.action_dim)
        self.fc2.weight.data.normal_(0, 0.2)

    def create_Q_network(self, x):
        # Q values of all actions for the given state(s)
        x = F.relu(self.fc1(x))
        Q_value = self.fc2(x)
        return Q_value

    def forward(self, x, action_input):
        # Q value of the action actually taken (action_input is one-hot)
        Q_value = self.create_Q_network(x)
        Q_action = torch.mul(Q_value, action_input).sum(dim=1)
        return Q_action

class DQN():
    def __init__(self, env):
        self.target_Q_net = MODEL(env)    # target network Q'(w')
        self.current_Q_net = MODEL(env)   # current network Q(w)
        self.replay_buffer = deque()
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.optimizer = torch.optim.Adam(params=self.current_Q_net.parameters(), lr=0.0001)
        self.loss = torch.nn.MSELoss()

    def perceive(self, state, action, reward, next_state, done):
        # store the transition with a one-hot encoded action and train once enough samples exist
        one_hot_action = np.zeros(self.current_Q_net.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append((state, one_hot_action, reward, next_state, done))
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()
        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = torch.FloatTensor([data[3] for data in minibatch])
        # Step 2: calculate y (Double DQN target)
        y_batch = []
        with torch.no_grad():
            # select the greedy action for S'_j with the CURRENT network ...
            current_a = self.current_Q_net.create_Q_network(next_state_batch)
            max_current_action_batch = torch.argmax(current_a, dim=1)
            # ... and evaluate that action with the TARGET network
            Q_value_batch = self.target_Q_net.create_Q_network(next_state_batch)
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                max_current_action = max_current_action_batch[i]
                y_batch.append(reward_batch[i] + GAMMA * Q_value_batch[i, max_current_action].item())
        # Step 3: gradient step on the current network only
        y = self.current_Q_net(torch.FloatTensor(state_batch), torch.FloatTensor(action_batch))
        y_batch = torch.FloatTensor(y_batch)
        cost = self.loss(y, y_batch)
        self.optimizer.zero_grad()
        cost.backward()
        self.optimizer.step()

    def egreedy_action(self, state):
        # epsilon-greedy exploration; epsilon decays linearly at every call
        Q_value = self.current_Q_net.create_Q_network(torch.FloatTensor(state))
        if random.random() <= self.epsilon:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000
            return random.randint(0, self.current_Q_net.action_dim - 1)
        else:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000
            return torch.argmax(Q_value).item()

    def action(self, state):
        # greedy action from the target network (used for evaluation)
        return torch.argmax(self.target_Q_net.create_Q_network(torch.FloatTensor(state))).item()

    def update_target_params(self):
        # copy the current network's weights into the target network via a checkpoint file
        torch.save(self.current_Q_net.state_dict(), 'net_params.pkl')
        self.target_Q_net.load_state_dict(torch.load('net_params.pkl'))

def main():
    # initialize OpenAI Gym env and dqn agent (old Gym API: reset() returns obs, step() returns 4 values)
    env = gym.make(ENV_NAME)
    agent = DQN(env)
    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define reward for agent
            # print(reward)  # debug output of the raw environment reward
            reward = -1 if done else 0.1
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            agent.update_target_params()


if __name__ == '__main__':
    main()
References:
https://www.cnblogs.com/pinard/p/9778063.html