The basic theory of the Double Deep Q-Learning Network (DDQN) comes from this paper; for the theoretical background, see this note and this note. The core reinforcement-learning formula is given below:
$$Y_{t}^{DoubleQ} = R_{t+1}+\gamma \hat{Q}\left(S_{t+1},\mathop{argmax}_{a}Q\left(S_{t+1},a\right)\right)$$
The algorithm uses two neural networks with the same structure but different parameters.
The first is the $Q$ network, the same $Q$ network as in DQN; this is the network that is actually trained. The $\hat{Q}$ network has the same architecture as $Q$, but its parameters are a copy taken several steps earlier; it is used to compute the evaluation value. In the formula above, the action is selected with $\mathop{argmax}_{a}Q\left(S_{t+1},a\right)$ by the network being trained, while its value is evaluated by the older $\hat{Q}$ network; decoupling action selection from evaluation is what reduces DQN's overestimation of Q-values.
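To make the formula concrete, here is a minimal NumPy sketch for a single transition (the values and the names `q_online` / `q_target` are illustrative, not taken from the code below), comparing the Double-DQN target with a vanilla DQN target that uses a single network for both selection and evaluation:

```python
import numpy as np

gamma = 0.95
reward = 1.0

# Hypothetical Q-value estimates for the next state S_{t+1}
q_online = np.array([0.2, 1.5, 0.7])  # from the Q network being trained
q_target = np.array([0.3, 0.9, 1.1])  # from the older target network Q_hat

# Vanilla DQN: one network both selects and evaluates the action
y_dqn = reward + gamma * np.max(q_online)

# Double DQN: the online network selects, the target network evaluates
a_star = np.argmax(q_online)                  # argmax_a Q(S_{t+1}, a)
y_double = reward + gamma * q_target[a_star]  # Q_hat(S_{t+1}, a_star)

print(y_dqn, y_double)  # about 2.425 vs 1.855: decoupling selection from evaluation curbs overestimation
```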
The basic skeleton of the code comes from this blog post. The agent implementation (`Agent.py`, imported by the training script later) is as follows:
```python
import tensorflow as tf
from tensorflow import keras
from collections import deque
import numpy as np
import random

MAX_LEN = 10000
BATCH_SIZE = 64
GAMMA = 0.95
EXPLORATION_DECAY = 0.995
EXPLORATION_MIN = 0.1


class Agent(object):
    def __init__(self, input_space, output_space, lr=0.001, exploration=0.9, update_model_step=10):
        self._model = keras.Sequential()
        self._model.add(keras.layers.Dense(input_shape=(input_space,), units=24, activation=tf.nn.relu))
        self._model.add(keras.layers.Dense(units=24, activation=tf.nn.relu))
        self._model.add(keras.layers.Dense(units=output_space, activation='linear'))
        self._model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr))
        self._replayBuffer = deque(maxlen=MAX_LEN)
        self._exploration = exploration
        self._output_space = output_space  # size of the action space, used for random exploration
        self._target_model = keras.models.clone_model(self._model)
        self.update_target_model()  # clone_model does not copy weights, so sync them once here
        self._update_model_step = update_model_step  # minimum number of steps between model updates
        self._cur_step = 0  # number of times the current model has been used for prediction

    def update_target_model(self):
        self._target_model.set_weights(self._model.get_weights())

    @property
    def exploration(self):
        return self._exploration

    def add_data(self, state, action, reward, state_next, done):
        self._replayBuffer.append((state, action, reward, state_next, done))

    def act(self, state):
        # epsilon-greedy: explore with probability self._exploration
        if np.random.uniform() <= self._exploration:
            return np.random.randint(0, self._output_space)
        action = self._model.predict(state)
        return np.argmax(action[0])

    def train_from_buffer(self):
        if len(self._replayBuffer) < BATCH_SIZE:
            return
        batch = random.sample(self._replayBuffer, BATCH_SIZE)
        for state, action, reward, state_next, done in batch:
            new_action = np.argmax(self._model.predict(state_next)[0])
            q_update = reward
            if not done:
                # Double DQN formula: the online network selects the action, the target network evaluates it
                q_update = reward + GAMMA * self._target_model.predict(state_next)[0][new_action]
                # q_update += GAMMA * np.amax(self._model.predict(state_next)[0])  # the commented-out line is vanilla DQN
            q_values = self._model.predict(state)
            q_values[0][action] = q_update
            self._model.fit(state, q_values, verbose=0)
        self._exploration *= EXPLORATION_DECAY
        self._exploration = max(EXPLORATION_MIN, self._exploration)
```
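The `train_from_buffer` loop above calls `predict` and `fit` once per transition, which is slow. As an optional sketch (the method name `train_from_buffer_batched` is mine; the update rule and the `Agent` attributes are unchanged), the same Double-DQN update can be vectorized over the whole minibatch and added inside the `Agent` class:

```python
    def train_from_buffer_batched(self):
        """Drop-in alternative to train_from_buffer: one batched update per call."""
        if len(self._replayBuffer) < BATCH_SIZE:
            return
        batch = random.sample(self._replayBuffer, BATCH_SIZE)
        states = np.vstack([b[0] for b in batch])
        actions = np.array([b[1] for b in batch])
        rewards = np.array([b[2] for b in batch], dtype=np.float32)
        states_next = np.vstack([b[3] for b in batch])
        dones = np.array([b[4] for b in batch], dtype=bool)

        q_values = self._model.predict(states, verbose=0)          # training targets, edited in place
        next_online = self._model.predict(states_next, verbose=0)  # used only to select actions
        next_target = self._target_model.predict(states_next, verbose=0)

        best_actions = np.argmax(next_online, axis=1)               # argmax_a Q(S_{t+1}, a)
        rows = np.arange(BATCH_SIZE)
        targets = rewards + GAMMA * next_target[rows, best_actions] * (~dones)
        q_values[rows, actions] = targets

        self._model.fit(states, q_values, verbose=0)
        self._exploration = max(EXPLORATION_MIN, self._exploration * EXPLORATION_DECAY)
```

The training script below drives the agent on CartPole-v1 and synchronizes the target network once per episode.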
```python
import gym
from Agent import Agent
import numpy as np
import matplotlib.pyplot as plt


def train():
    env = gym.make("CartPole-v1")
    input_space = env.observation_space.shape[0]
    output_space = env.action_space.n
    print(input_space, output_space)
    agent = Agent(input_space, output_space)
    run = 0
    x = []
    y = []
    while run < 100:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, -1])
        step = 0
        while True:
            step += 1
            # env.render()
            action = agent.act(state)
            state_next, reward, done, _ = env.step(action)
            reward = reward if not done else -reward
            state_next = np.reshape(state_next, [1, -1])
            agent.add_data(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " +
                      str(agent.exploration) + ", score: " + str(step))
                # the target network is updated once per episode here; adjust the frequency as needed
                agent.update_target_model()
                x.append(run)
                y.append(step)
                break
            agent.train_from_buffer()
    plt.plot(x, y)
    plt.show()


if __name__ == "__main__":
    train()
```
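After training, it can be handy to watch the learned policy run greedily (no ε-exploration). The helper below is a minimal sketch under the same classic `gym` API used above (4-tuple `step`, `reset` returning only the observation); the name `evaluate` and the direct use of `agent._model` are my own shortcuts, not part of the original code:

```python
def evaluate(agent, episodes=5):
    env = gym.make("CartPole-v1")
    for ep in range(episodes):
        state = np.reshape(env.reset(), [1, -1])
        score = 0
        while True:
            # bypass the epsilon-greedy act(): always take the highest-valued action
            action = int(np.argmax(agent._model.predict(state, verbose=0)[0]))
            state, reward, done, _ = env.step(action)
            state = np.reshape(state, [1, -1])
            score += 1
            if done:
                print("Eval run {}: score {}".format(ep + 1, score))
                break
    env.close()
```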
The learning rate is 0.001, training runs for 100 episodes, and the batch size is 64; the target network is synchronized once per episode. Fluctuations in the score are normal, and the agent usually gains a clear advantage after roughly 15 episodes.