Most of the DDPG implementations available online are still written in the TF 1.X style, which makes them less elegant and less intuitive than they could be. Implementing DDPG with TF 2.X, however, comes with plenty of pitfalls of its own: you end up filling holes while trying not to dig new ones for yourself, and one careless slip would have sunk this post entirely. To help readers who, like me, keep stumbling into these traps, I am recording the points that need attention during the implementation. While experimenting I also found that the eager execution mode that TF 2.X enables by default has a huge negative impact on running speed, so I also note how to disable eager execution and how to solve the resulting problem of the checkpoint manager reporting the default session as None when saving the model.
from collections import deque
import gym
import numpy as np
import os
import pickle
import random
import tensorflow as tf
from tensorflow.keras import Input
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
tf.compat.v1.disable_eager_execution()  # disable eager execution (run in graph mode)
class DDPGTrainer():
    def __init__(
        self,
        n_features,
        n_actions,
        sample_size=128,
        tau=0.99,
        gamma=0.95,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        a_lr=0.0001,
        c_lr=0.001
    ):
        self.tau = tau
        self.memory_buffer = deque(maxlen=4000)
        self.sample_size = sample_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.n_features = n_features
        self.n_actions = n_actions
        self.actor, self.critic = self.build_model()
        self.target_actor, self.target_critic = self.build_model()
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
    def build_model(self):
        s_input = Input([self.n_features])
        a_input = Input([1])
        # actor
        x = Dense(units=40, activation='relu')(s_input)
        x = Dense(units=40, activation='relu')(x)
        x = Dense(units=1, activation='tanh')(x)
        action = Lambda(lambda x: x * self.n_actions)(x)  # scale the tanh output to the action bound
        actor = tf.keras.models.Model(inputs=s_input, outputs=action)
        # critic
        x = K.concatenate([s_input, a_input], axis=-1)
        x = Dense(40, activation='relu')(x)
        x = Dense(40, activation='relu')(x)
        q_a_value = Dense(1, activation='linear')(x)
        critic = tf.keras.models.Model(inputs=[s_input, a_input], outputs=q_a_value)
        actor.add_loss(-critic([s_input, action]))  # maximize Q(s, a); note the minus sign
        ### train only the actor
        critic.trainable = False
        actor.compile(optimizer=tf.keras.optimizers.Adam(self.a_lr))
        critic.trainable = True
        ### train only the critic
        actor.trainable = False
        critic.trainable = True  # the actor's graph contains the critic, so changing actor.trainable also changes critic.trainable
        critic.compile(optimizer=tf.keras.optimizers.Adam(self.c_lr), loss='mse')
        actor.trainable = True
        return actor, critic
    def OU(self, x, mu=0, theta=0.15, sigma=0.2):
        '''One step of Ornstein-Uhlenbeck noise'''
        return theta * (mu - x) + sigma * np.random.randn(1)  # shape: [1]
    def choose_action(self, state):
        action = self.actor.predict(state)[0][0]  # shape: []
        noise = max(self.epsilon, 0) * self.OU(action)
        action = np.clip(action + noise, -self.n_actions, self.n_actions)  # shape: [1]
        return action
    def store(self, state, action, reward, next_state, done):
        sample = (state, action, reward, next_state, done)
        self.memory_buffer.append(sample)
    def update_epsilon(self):
        if self.epsilon >= self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    def update_model(self):
        samples = random.sample(self.memory_buffer, self.sample_size)
        states = np.array([sample[0] for sample in samples])
        actions = np.array([sample[1] for sample in samples])
        rewards = np.array([sample[2] for sample in samples])
        next_states = np.array([sample[3] for sample in samples])
        dones = np.array([sample[4] for sample in samples])
        next_actions = self.target_actor.predict(next_states)
        q_a_next = self.target_critic.predict([next_states, next_actions])  # q_a_next.shape: [self.sample_size, 1]
        y = rewards + self.gamma * q_a_next[:, 0] * ~dones  # y.shape: [self.sample_size]
        self.critic.fit([states, actions], y[:, None], verbose=0)  # expand y to match the critic's [batch, 1] output
        self.actor.fit(states, verbose=0)  # no explicit target: the actor's loss is the add_loss term -Q(s, actor(s))
    def update_target_model(self):
        # soft update: the target networks keep a fraction tau of their old weights
        actor_weights = self.actor.get_weights()
        critic_weights = self.critic.get_weights()
        actor_target_weights = self.target_actor.get_weights()
        critic_target_weights = self.target_critic.get_weights()
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_target_weights[i] * self.tau + (1 - self.tau) * actor_weights[i]
        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = critic_target_weights[i] * self.tau + (1 - self.tau) * critic_weights[i]
        self.target_actor.set_weights(actor_target_weights)
        self.target_critic.set_weights(critic_target_weights)
    def save(self, checkpoint_path='pendulum'):
        # relies on self.ckpt_manager, which is created in load()
        self.ckpt_manager.save()
        with open(f'{checkpoint_path}/epsilon.pkl', 'wb') as f:
            pickle.dump(self.epsilon, f)
    def load(self, checkpoint_path='pendulum'):
        ckpt = tf.train.Checkpoint(
            actor=self.actor,
            critic=self.critic,
            target_actor=self.target_actor,
            target_critic=self.target_critic,
            actor_optimizer=self.actor.optimizer,
            critic_optimizer=self.critic.optimizer,
        )
        self.ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)
        if os.path.exists(f'{checkpoint_path}/epsilon.pkl'):
            with open(f'{checkpoint_path}/epsilon.pkl', 'rb') as f:
                self.epsilon = pickle.load(f)
        if self.ckpt_manager.latest_checkpoint:
            status = ckpt.restore(self.ckpt_manager.latest_checkpoint)
            status.run_restore_ops()  # with eager execution disabled, the restore ops must be run explicitly
When building the model, keep in mind that because the actor's loss is computed through the critic, toggling actor.trainable also changes critic.trainable. The shapes of the training data also have to match: for example, the target y passed to critic.fit must have the same shape as the critic's output.
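To make this concrete, here is a minimal standalone sketch of the freeze-then-compile pattern and of the target-shape requirement. The toy_actor / toy_critic models and their layer sizes are throwaway illustrations introduced only for this example, not the trainer above; the sketch merely mirrors the add_loss trick used in build_model.
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
tf.compat.v1.disable_eager_execution()
# toy "actor": 3 features -> 1 action
s_in = Input([3])
a_out = Dense(1, activation='tanh')(s_in)
toy_actor = tf.keras.models.Model(inputs=s_in, outputs=a_out)
# toy "critic": scores the action alone, just to keep the example small
a_in = Input([1])
q_out = Dense(1)(a_in)
toy_critic = tf.keras.models.Model(inputs=a_in, outputs=q_out)
# route the actor's loss through the critic, as in build_model above
toy_actor.add_loss(-toy_critic(a_out))
toy_critic.trainable = False  # freeze the critic *before* compiling the actor ...
toy_actor.compile(optimizer=tf.keras.optimizers.Adam(1e-4))
toy_critic.trainable = True   # ... then restore the flag; compile() already captured the frozen state
toy_critic.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
# the target for critic.fit must match the critic's [batch, 1] output
y = np.zeros(8)                                           # shape [8]
toy_critic.fit(np.zeros((8, 1)), y[:, None], verbose=0)   # y[:, None] has shape [8, 1]
toy_actor.fit(np.zeros((8, 3)), verbose=0)                # no target: the add_loss term is the loss
The point is that whatever critic.trainable happens to be at the moment actor.compile runs is what the actor's training graph keeps, which is exactly what the two "train only ..." blocks in build_model rely on.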
Under the static graph, in order to save the model with the checkpoint manager, a default session has to be created before the model is built; otherwise the checkpoint manager complains that the session is None when saving. Once no further saving is needed, the session can be closed. The concrete code is as follows:
session = tf.compat.v1.InteractiveSession()  # with eager execution disabled, ckpt_manager.save() needs a default session
env = gym.make('Pendulum-v0')
model = DDPGTrainer(env.observation_space.shape[0], env.action_space.high[0])
model.load()  # does nothing if no checkpoint exists yet
try:
    for episode in range(200):
        next_state = env.reset()
        reward_sum = 0
        for step in range(200):
            env.render()
            state = next_state
            action = model.choose_action(state[None])
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            model.store(state, action, reward, next_state, done)
            if len(model.memory_buffer) > model.sample_size:
                model.update_model()
                model.update_target_model()
                model.update_epsilon()
        print(f'episode {episode} total reward: {reward_sum}')
        model.save()
finally:
    env.close()
    session.close()