1、Double DQN算法:
Selection using DQN:
$$a^{\star}=\operatorname*{argmax}_{a}Q(s_{t+1},a;\mathbf{w}).$$
Evaluation using target network:
$$y_{t}=r_{t}+\gamma\cdot Q(s_{t+1},a^{\star};\mathbf{w}^{-}).$$
2、Target Network算法:
Selection using target network:
$$a^{\star}=\operatorname*{argmax}_{a}Q(s_{t+1},a;\mathbf{w}^{-}).$$
Evaluation using target network:
$$y_{t}=r_{t}+\gamma\cdot Q(s_{t+1},a^{\star};\mathbf{w}^{-}).$$
3、对比
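两者的区别在于动作选择步骤:Double DQN 使用 DQN(参数 w)选择动作 a⋆,Target Network 使用 target network(参数 w⁻)选择动作 a⋆;两者都用 target network 对 a⋆ 进行评估。Double DQN 更优,因为它的 TD 目标不会大于 Target Network 算法的 TD 目标,从而缓解了最大化操作带来的高估问题: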
$$Q(s_{t+1},a^{\star};\mathbf{w}^{-})\leq\max_{a}Q(s_{t+1},a;\mathbf{w}^{-}).$$
4、算法实现:
class QNet(nn.Module):
    """Fully connected Q-value network.

    Maps an observation feature vector of size ``dim_obs`` to ``num_act``
    values, one per action, through two ReLU hidden layers of width 64
    and 32.
    """

    def __init__(self, dim_obs, num_act):
        super().__init__()
        # Attribute names and creation order are kept stable: they define
        # the state_dict keys and the order random init consumes the RNG.
        self.fc1 = nn.Linear(dim_obs, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_act)

    def forward(self, obs):
        """Return the raw (un-activated) per-action values for ``obs``."""
        hidden = obs
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # The output layer has no activation.
        return self.fc3(hidden)
class DoubleDQN:
    """Double DQN agent.

    Holds an online network (``model``) and a target network
    (``target_model``). When building the TD target, the online network
    selects the next action and the target network evaluates it.
    """

    def __init__(self, dim_obs=None, num_act=None, discount=0.9):
        """Create both networks and start the target as a copy of the online one.

        Args:
            dim_obs: Size of the observation feature vector.
            num_act: Number of discrete actions.
            discount: Discount factor applied to the next-state value.
        """
        self.discount = discount
        self.model = QNet(dim_obs, num_act)
        self.target_model = QNet(dim_obs, num_act)
        self.target_model.load_state_dict(self.model.state_dict())

    def get_action(self, obs):
        """Return the argmax of the online network's output for ``obs``.

        NOTE(review): ``argmax()`` is taken over the flattened output, so
        this presumably expects a single observation - confirm with callers.
        """
        qvals = self.model(obs)
        return qvals.argmax()

    def compute_loss(self, s_batch, a_batch, r_batch, d_batch, next_s_batch):
        """Return the MSE between the Double DQN TD target and Q(s, a).

        Args:
            s_batch: Batch of current states.
            a_batch: Batch of action indices, one per row (``gather``
                requires an int64 index tensor).
            r_batch: Batch of rewards.
            d_batch: Batch of done flags; ``1`` removes the bootstrap term.
            next_s_batch: Batch of next states.

        Returns:
            Scalar loss tensor; gradients flow only into ``self.model``
            through the current-state Q-values.
        """
        # Q(s, a; w) for the actions actually taken. squeeze(1) drops only
        # the gathered column, so a batch of one keeps its batch dimension.
        qvals = self.model(s_batch).gather(1, a_batch.unsqueeze(1)).squeeze(1)

        # Selection: a* = argmax_a Q(s', a; w) with the ONLINE network.
        # Detached because the target must not contribute gradients.
        best_actions = self.model(next_s_batch).detach().argmax(dim=1, keepdim=True)

        # Evaluation: Q(s', a*; w-) with the TARGET network. The next-state
        # value is kept out of the gradient computation to avoid divergence.
        next_qvals = (
            self.target_model(next_s_batch).detach().gather(1, best_actions).squeeze(1)
        )

        targets = r_batch + self.discount * next_qvals * (1 - d_batch)
        loss = F.mse_loss(targets, qvals)
        return loss
class TargetNetworkDQN:
    """DQN agent with a target network.

    Holds an online network (``model``) and a target network
    (``target_model``). The target network both selects and evaluates the
    next action when building the TD target; the online network supplies
    the current-state Q-values that are trained.
    """

    def __init__(self, dim_obs=None, num_act=None, discount=0.9):
        """Create both networks and start the target as a copy of the online one.

        Args:
            dim_obs: Size of the observation feature vector.
            num_act: Number of discrete actions.
            discount: Discount factor applied to the next-state value.
        """
        self.discount = discount
        self.model = QNet(dim_obs, num_act)
        self.target_model = QNet(dim_obs, num_act)
        self.target_model.load_state_dict(self.model.state_dict())

    def get_action(self, obs):
        """Return the argmax of the online network's output for ``obs``.

        NOTE(review): ``argmax()`` is taken over the flattened output, so
        this presumably expects a single observation - confirm with callers.
        """
        qvals = self.model(obs)
        return qvals.argmax()

    def compute_loss(self, s_batch, a_batch, r_batch, d_batch, next_s_batch):
        """Return the MSE between the target-network TD target and Q(s, a).

        Args:
            s_batch: Batch of current states.
            a_batch: Batch of action indices, one per row (``gather``
                requires an int64 index tensor).
            r_batch: Batch of rewards.
            d_batch: Batch of done flags; ``1`` removes the bootstrap term.
            next_s_batch: Batch of next states.

        Returns:
            Scalar loss tensor; gradients flow only into ``self.model``
            through the current-state Q-values.
        """
        # Q(s, a; w) must come from the ONLINE network: it is the quantity
        # being trained. squeeze(1) drops only the gathered column, so a
        # batch of one keeps its batch dimension.
        qvals = self.model(s_batch).gather(1, a_batch.unsqueeze(1)).squeeze(1)

        # max_a Q(s', a; w-) from the target network. The next-state value
        # is kept out of the gradient computation to avoid divergence.
        next_qvals, _ = self.target_model(next_s_batch).detach().max(dim=1)

        targets = r_batch + self.discount * next_qvals * (1 - d_batch)
        loss = F.mse_loss(targets, qvals)
        return loss