DDPG Code Implementation
Table of Contents
- DDPG Code Implementation
  - Code and Explanations
    - 1. Hyperparameter Settings
    - 2. Implementing the ReplayBuffer
    - 3. Implementing the Agent Class
      - 3.1.__init__
        - Creating the policy network (actor)
        - Creating the value network (critic)
        - Copying network parameters
        - __init__
      - 3.2.ema_update
      - 3.3.get_action
      - 3.4.replay
      - 3.5.train
      - 3.6.test_episode
      - 3.7.saveModel
      - 3.8.loadModel
    - 4. Main Program
  - Training Results
Code and Explanations
1. Hyperparameter Settings
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', default=True)          # train (True) or test (False)
parser.add_argument('--random_seed', type=int, default=0)           # seed for gym / numpy / tensorflow
parser.add_argument('--render', type=bool, default=False)           # render the environment while training
parser.add_argument('--train_episodes', type=int, default=5000)     # number of training episodes
parser.add_argument('--test_episodes', type=int, default=10)        # number of test episodes
parser.add_argument('--max_steps', type=int, default=200)           # max steps per episode
parser.add_argument('--gamma', type=float, default=0.9)             # reward discount factor
parser.add_argument('--lr_a', type=float, default=0.001)            # actor learning rate
parser.add_argument('--lr_c', type=float, default=0.002)            # critic learning rate
parser.add_argument('--batch_size', type=int, default=128)          # replay-buffer sample size
parser.add_argument('--var', type=float, default=2)                 # std of the exploration noise
parser.add_argument('--tau', type=float, default=0.01)              # soft-update rate for the target networks
args = parser.parse_args()

ALG_NAME = 'DDPG'
ENV_ID = 'Pendulum-v0'
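If you want to try different settings without editing the script (for example in a notebook), argparse also accepts an explicit argument list; a minimal sketch using the flags defined above (the values are just examples):

args = parser.parse_args(['--train_episodes', '2000', '--batch_size', '64'])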
2. Implementing the ReplayBuffer
- The ReplayBuffer implementation is the same as the one used in DQN (the code link is at the end of the article). A short usage example follows the code below.
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.capacity = capacity      # maximum number of transitions to keep
        self.buffer = []
        self.position = 0             # index of the next slot to overwrite

    def push(self, state, action, reward, next_state, done):
        # Append until the buffer is full, then overwrite the oldest entries in a ring.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = int((self.position + 1) % self.capacity)

    def sample(self, batch_size=args.batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
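A quick standalone check of the buffer with dummy transitions (a hypothetical usage sketch, not part of the original script):

buffer = ReplayBuffer(capacity=100)
for _ in range(5):
    buffer.push(np.zeros(3), np.zeros(1), 0.0, np.zeros(3), 0)
print(len(buffer))                               # 5
states, actions, rewards, next_states, dones = buffer.sample(batch_size=2)
print(states.shape, dones.shape)                 # (2, 3) (2,)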
3. Implementing the Agent Class
- The Agent class implements 8 methods (a minimal class skeleton is sketched after this list):
- __init__: initializes the agent.
- ema_update: performs the soft update of the target network parameters.
- get_action: selects an action.
- replay: updates the parameters of the value network and the policy network.
- train: runs the training loop and collects the experience needed to train the model.
- test_episode: tests the trained model.
- saveModel: saves the model.
- loadModel: loads the model.
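For orientation, this is the shape of the class the following subsections fill in, assuming all eight methods are assembled into a single Agent class (a sketch only; the method bodies are given below):

class Agent:
    def __init__(self, env): ...
    def ema_update(self): ...
    def get_action(self, state, greedy=False): ...
    def replay(self): ...
    def train(self, train_episodes): ...
    def test_episode(self, test_episodes): ...
    def saveModel(self): ...
    def loadModel(self): ...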
3.1.__init__
Creating the policy network (actor)
import tensorflow as tf
import tensorlayer as tl

def get_actor(input_state_shape, name=''):
    # Defined inside __init__, so W_init, b_init and self are taken from that scope.
    input_layer = tl.layers.Input(input_state_shape, name='A_input')
    layer = tl.layers.Dense(n_units=64, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(input_layer)
    layer = tl.layers.Dense(n_units=64, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l2')(layer)
    layer = tl.layers.Dense(n_units=self.action_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(layer)
    layer = tl.layers.Lambda(lambda x: self.action_range * x)(layer)   # rescale the tanh output to the env's action range
    return tl.models.Model(inputs=input_layer, outputs=layer, name='Actor' + name)
- W_init and b_init are the weight and bias initializers.
- The last Dense layer uses a tanh activation to squash the output into [-1, 1]; the result is then scaled to the action range of the specific environment before being returned (a small numeric sketch follows).
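To make the squash-and-scale step concrete, a small numpy sketch (the raw values are made up; 2.0 is Pendulum-v0's action bound):

import numpy as np

raw = np.array([-3.0, 0.0, 3.0])        # pre-activation outputs (made up)
squashed = np.tanh(raw)                 # now inside [-1, 1]
action_range = 2.0                      # Pendulum-v0 actions live in [-2, 2]
print(squashed * action_range)          # [-1.990...  0.  1.990...]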
Creating the value network (critic)

def get_critic(input_state_shape, input_action_shape, name=''):
    # Also defined inside __init__; the critic takes both the state and the action as input.
    state_input = tl.layers.Input(input_state_shape, name='C_s_input')
    action_input = tl.layers.Input(input_action_shape, name='C_a_input')
    layer = tl.layers.Concat(1)([state_input, action_input])
    layer = tl.layers.Dense(n_units=64, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(layer)
    layer = tl.layers.Dense(n_units=64, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l2')(layer)
    layer = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(layer)
    return tl.models.Model(inputs=[state_input, action_input], outputs=layer, name='Critic' + name)
- tl.layers.Concat(concat_dim) joins arrays along the given dimension (concat_dim). Taking Pendulum-v0 as an example (state dimension 3, action dimension 1):

print(state_input)                                        # tf.Tensor([[1. 1. 1.]], shape=(1, 3), dtype=float32)
print(action_input)                                       # tf.Tensor([[1.]], shape=(1, 1), dtype=float32)
print(tl.layers.Concat(1)([state_input, action_input]))   # tf.Tensor([[1. 1. 1. 1.]], shape=(1, 4), dtype=float32)
Copying network parameters

def copy_para(from_model, to_model):
    # Copy every trainable weight of from_model into to_model (used to initialize the target networks).
    for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
        j.assign(i)
__init__
def __init__(self, env):
    self.env = env
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.action_range = env.action_space.high

    W_init = tf.random_normal_initializer(mean=0, stddev=0.3)
    b_init = tf.constant_initializer(0.1)

    # get_actor, get_critic and copy_para (shown above) are defined here inside __init__.
    self.actor = get_actor([None, self.state_dim])
    self.critic = get_critic([None, self.state_dim], [None, self.action_dim])
    self.actor.train()
    self.critic.train()

    # The target networks start as exact copies and are only updated softly via ema_update.
    self.actor_target = get_actor([None, self.state_dim], name='_target')
    copy_para(self.actor, self.actor_target)
    self.actor_target.eval()
    self.critic_target = get_critic([None, self.state_dim], [None, self.action_dim], name='_target')
    copy_para(self.critic, self.critic_target)
    self.critic_target.eval()

    self.ema = tf.train.ExponentialMovingAverage(decay=1 - args.tau)   # used for the soft updates
    self.actor_opt = tf.optimizers.Adam(args.lr_a)
    self.critic_opt = tf.optimizers.Adam(args.lr_c)
    self.replay_buffer = ReplayBuffer()
- tf.random_normal_initializer: returns an initializer that generates tensors from a normal distribution.
- tf.constant_initializer: returns an initializer that generates tensors with a constant value.
- Its main parameter is value, which can be a Python scalar, a list or tuple of values, or an N-dimensional NumPy array. All elements of the initialized variable are set according to value.
- tf.train.ExponentialMovingAverage: a moving-average helper, used here to update the target network parameters.
- shadow = (1 - τ) · shadow + τ · value, i.e. the averaged (target) parameters move slowly toward the current network parameters (an equivalent explicit soft update is sketched after this list).
- decay (= 1 - τ) is usually set very close to 1.
- A moving average suppresses periodic disturbances and works well when the values fluctuate at high frequency.
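Conceptually, the EMA-based ema_update below does the same thing as writing the soft update by hand; a minimal sketch of that explicit version (not the author's code, just the formula above spelled out):

# Explicit soft update: target <- (1 - tau) * target + tau * online
tau = args.tau
online_weights = self.actor.trainable_weights + self.critic.trainable_weights
target_weights = self.actor_target.trainable_weights + self.critic_target.trainable_weights
for target_w, w in zip(target_weights, online_weights):
    target_w.assign((1 - tau) * target_w + tau * w)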
3.2.ema_update
def ema_update(self):
    # Apply the moving average to the online parameters, then copy the averages into the target networks.
    paras = self.actor.trainable_weights + self.critic.trainable_weights
    self.ema.apply(paras)
    for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
        i.assign(self.ema.average(j))
- actor.trainable_weights and critic.trainable_weights are both flat lists whose elements are the TensorFlow variable objects holding the parameters.
- The "+" operation on two lists concatenates them:

a = [1, 2, 3]
b = [4, 5, 6, 7]
print(a + b)    # [1, 2, 3, 4, 5, 6, 7]
3.3.get_action
import numpy as np

def get_action(self, state, greedy=False):
    action = self.actor(np.array([state]))[0]
    if greedy:
        return action                       # no exploration noise when evaluating
    # Add Gaussian exploration noise, then clip back into the legal action range.
    return np.clip(
        np.random.normal(action, args.var), -self.action_range, self.action_range
    ).astype(np.float32)
- np.random.normal draws a random sample from a normal distribution.
- Here, action is the mean of that distribution and args.var is the standard deviation.
- Sampling with np.random.normal adds randomness (exploration) to the action produced by the actor.
- np.clip keeps the resulting action inside the legal range (a tiny numeric sketch follows).
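A tiny numpy illustration of the noise-then-clip step (the action value is made up; 2.0 matches both the initial args.var and Pendulum-v0's action bound):

import numpy as np

action = np.array([0.5])                 # hypothetical deterministic actor output
noisy = np.random.normal(action, 2.0)    # Gaussian noise with std = args.var
print(np.clip(noisy, -2.0, 2.0))         # always inside [-2, 2]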
3.4.replay
def replay(self):
    args.var *= .9995   # slowly decay the exploration noise
    states, actions, rewards, next_states, done = self.replay_buffer.sample(args.batch_size)
    rewards = rewards[:, np.newaxis]
    done = done[:, np.newaxis]

    # Critic update: minimize the TD error against the target networks.
    with tf.GradientTape() as tape:
        next_actions = self.actor_target(next_states)
        next_q = self.critic_target([next_states, next_actions])
        target = rewards + (1 - done) * args.gamma * next_q
        q = self.critic([states, actions])
        td_error = tf.losses.mean_squared_error(target, q)
    critic_grads = tape.gradient(td_error, self.critic.trainable_weights)
    self.critic_opt.apply_gradients(zip(critic_grads, self.critic.trainable_weights))

    # Actor update: gradient ascent on Q(s, pi(s)) by descending on its negative mean.
    with tf.GradientTape() as tape:
        actions = self.actor(states)
        q = self.critic([states, actions])
        actor_loss = -tf.reduce_mean(q)
    actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
    self.actor_opt.apply_gradients(zip(actor_grads, self.actor.trainable_weights))

    self.ema_update()
- DPG gradient: $g = \frac{\partial q(s, \pi(s;\theta); w)}{\partial \theta}$
- actor_loss = -tf.reduce_mean(q)
- Unlike PG, DPG differentiates the q function directly with respect to θ.
- The minus sign is there because TensorFlow optimizers perform gradient descent, i.e. they make the loss smaller; descending on -q is equivalent to ascending on q, which is what we actually want to maximize.
- reduce_mean averages q over the sampled batch, giving a mini-batch estimate of the expected Q value (a small numeric check of the critic's target line in replay() follows this list).
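The critic side is not discussed in the bullets above, so here is a toy numeric check of its target line, `target = rewards + (1 - done) * args.gamma * next_q` (all numbers are made up):

import numpy as np

gamma = 0.9
rewards = np.array([[1.0], [0.5]])    # shape (batch, 1), as after rewards[:, np.newaxis]
done    = np.array([[0.0], [1.0]])    # 1 marks a terminal transition
next_q  = np.array([[2.0], [3.0]])    # critic_target output for the next states

target = rewards + (1 - done) * gamma * next_q
print(target)   # 2.8 and 0.5 -- the terminal transition keeps only its immediate reward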
3.5.train
import time
import matplotlib.pyplot as plt

def train(self, train_episodes):
    t0 = time.time()
    if args.train:
        self.loadModel()   # resume from a previous checkpoint if one exists
        all_episode_reward = []
        for episode in range(train_episodes):
            state = self.env.reset().astype(np.float32)
            episode_reward = 0
            for step in range(args.max_steps):
                if args.render:
                    self.env.render()
                action = self.get_action(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = np.array(next_state, dtype=np.float32)
                done = 1 if done is True else 0
                self.replay_buffer.push(state, action, reward, next_state, done)
                # Start learning once the buffer holds at least one full batch.
                if len(self.replay_buffer) >= args.batch_size:
                    self.replay()
                state = next_state
                episode_reward += reward
                if done:
                    break
            # Keep an exponentially smoothed reward curve for plotting.
            if episode == 0:
                all_episode_reward.append(episode_reward)
            else:
                all_episode_reward.append(all_episode_reward[-1] * 0.9 + episode_reward * 0.1)
            print(
                'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                    episode + 1, train_episodes, episode_reward,
                    time.time() - t0
                )
            )
            if episode % 100 == 0:
                self.saveModel()
        plt.plot(all_episode_reward)
        plt.show()
    else:
        self.loadModel()
        self.test_episode(args.test_episodes)
3.6.test_episode
Converting frames to a GIF
from matplotlib import animation
import matplotlib.pyplot as plt

def display_frames_as_gif(frames, path):
    # Turn a list of rendered RGB frames into a GIF saved at `path`.
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    anim.save(path, writer='pillow', fps=30)
test_episode
import os

def test_episode(self, test_episodes):
    t0 = time.time()
    for episode in range(test_episodes):
        state = self.env.reset().astype(np.float32)
        episode_reward = 0
        frames = []
        for step in range(args.max_steps):
            self.env.render()
            frames.append(self.env.render(mode='rgb_array'))   # keep the RGB frame for the GIF
            next_state, reward, done, info = self.env.step(self.get_action(state, greedy=True))
            next_state = next_state.astype(np.float32)
            state = next_state
            episode_reward += reward
            if done:
                break
        print(
            'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                episode + 1, test_episodes, episode_reward,
                time.time() - t0
            )
        )
        dir_path = os.path.join('testVideo', '_'.join([ALG_NAME, ENV_ID]))
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        display_frames_as_gif(frames, os.path.join(dir_path, str(episode) + '.gif'))
3.7.saveModel
import os

def saveModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if not os.path.exists(path):
        os.makedirs(path)
    tl.files.save_weights_to_hdf5(os.path.join(path, 'actor.hdf5'), self.actor)
    tl.files.save_weights_to_hdf5(os.path.join(path, 'actor_target.hdf5'), self.actor_target)
    tl.files.save_weights_to_hdf5(os.path.join(path, 'critic.hdf5'), self.critic)
    tl.files.save_weights_to_hdf5(os.path.join(path, 'critic_target.hdf5'), self.critic_target)
    print('Saved weights.')
3.8.loadModel
def loadModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if os.path.exists(path):
        print('Loading DDPG network parameters ...')
        tl.files.load_hdf5_to_weights_in_order(os.path.join(path, 'actor.hdf5'), self.actor)
        tl.files.load_hdf5_to_weights_in_order(os.path.join(path, 'actor_target.hdf5'), self.actor_target)
        tl.files.load_hdf5_to_weights_in_order(os.path.join(path, 'critic.hdf5'), self.critic)
        tl.files.load_hdf5_to_weights_in_order(os.path.join(path, 'critic_target.hdf5'), self.critic_target)
        print('Loaded weights!')
    else:
        print('No model file found, please train the model first...')
4. Main Program
import gym

if __name__ == '__main__':
    env = gym.make(ENV_ID)
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    agent = Agent(env)
    agent.train(train_episodes=args.train_episodes)
    env.close()
Training Results
After 2000 episodes (the smoothed episode-reward curve plotted by plt.plot above):