Contents
- DDPG implementation with tf2.x
  - 1. Dependencies
  - 2. DDPG implementation
  - 3. Testing in a gym environment
- DDPG implementation with tf1.x
  - 1. Dependencies
  - 2. DDPG implementation
  - 3. Testing in a gym environment
- References
DDPG implementation with tf2.x
1. Dependencies
- tensorflow==2.7.0
- gym==0.24.0
- python==3.8.0
- Tested on Windows/Mac (a quick environment check is sketched below)
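Before running the code it can help to confirm that the installed versions match the list above; this small check is my addition for convenience, not part of the original article:

import sys
import gym
import tensorflow as tf

# print the interpreter and library versions to compare against the dependency list
print("python     :", sys.version.split()[0])
print("tensorflow :", tf.__version__)
print("gym        :", gym.__version__)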
2. DDPG implementation
import tensorflow as tf
from tensorflow import keras
import numpy as np


class DDPG(keras.Model):
    def __init__(self, a_dim, s_dim, a_bound, batch_size=32, tau=0.002, gamma=0.95,
                 a_lr=0.0001, c_lr=0.001, memory_capacity=9000):
        super().__init__()
        self.batch_size = batch_size
        self.tau = tau                      # soft-update coefficient for the target networks
        self.gamma = gamma                  # discount factor
        self.a_lr = a_lr                    # actor learning rate
        self.c_lr = c_lr                    # critic learning rate
        self.memory_capacity = memory_capacity
        # replay buffer: each row stores one transition (s, a, r, s_)
        self.memory = np.zeros((memory_capacity, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.memory_full = False
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound[1]
        # unused in the tf2.x eager implementation (left over from the graph-style version)
        s = keras.Input(shape=(s_dim,))
        s_ = keras.Input(shape=(s_dim,))
        # eval/target networks for actor and critic; targets start with identical weights
        self.actor = self._build_actor(trainable=True, name="a/eval")
        self.actor_ = self._build_actor(trainable=False, name="a/target")
        self.actor_.set_weights(self.actor.get_weights())
        self.critic = self._build_critic(trainable=True, name="d/eval")
        self.critic_ = self._build_critic(trainable=False, name="d/target")
        self.critic_.set_weights(self.critic.get_weights())
        self.a_opt = keras.optimizers.Adam(self.a_lr)
        self.c_opt = keras.optimizers.Adam(self.c_lr)
        self.mse = keras.losses.MeanSquaredError()

    def _build_actor(self, trainable, name):
        data = keras.Input(shape=(self.s_dim,))
        x = keras.layers.Dense(30, activation="relu", trainable=trainable)(data)
        x = keras.layers.Dense(30, activation="relu", trainable=trainable)(x)
        x = keras.layers.Dense(self.a_dim, trainable=trainable)(x)
        a = self.a_bound * tf.math.tanh(x)      # squash the output into [-a_bound, a_bound]
        model = keras.Model(data, a, name=name)
        return model

    def _build_critic(self, trainable, name):
        data = keras.Input(shape=(self.a_dim + self.s_dim,))
        x = keras.layers.Dense(30, activation="relu", trainable=trainable)(data)
        x = keras.layers.Dense(30, activation="relu", trainable=trainable)(x)
        q = keras.layers.Dense(1, trainable=trainable)(x)
        model = keras.Model(data, q, name=name)
        return model

    def param_replace(self):
        # soft update: target = (1 - tau) * target + tau * eval
        actor_weights = self.actor.get_weights()
        critic_weights = self.critic.get_weights()
        actor_target_weights = self.actor_.get_weights()
        critic_target_weights = self.critic_.get_weights()
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_target_weights[i] * (1 - self.tau) + self.tau * actor_weights[i]
        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = critic_target_weights[i] * (1 - self.tau) + self.tau * critic_weights[i]
        self.actor_.set_weights(actor_target_weights)
        self.critic_.set_weights(critic_target_weights)

    def act(self, s):
        a = self.actor.predict(np.reshape(s, (-1, self.s_dim)), verbose=0)[0]
        return a

    def sample_memory(self):
        indices = np.random.choice(self.memory_capacity, size=self.batch_size)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
        return bs, ba, br, bs_

    def learn(self):
        bs, ba, br, bs_ = self.sample_memory()
        # update the actor (policy network): maximize Q(s, actor(s)), i.e. minimize -Q
        with tf.GradientTape() as tape:
            a = self.actor(bs)
            q = self.critic(tf.concat([bs, a], 1))
            actor_loss = tf.reduce_mean(-q)
        grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.a_opt.apply_gradients(zip(grads, self.actor.trainable_variables))
        # update the critic: regress Q(s, a) towards r + gamma * Q'(s_, actor'(s_))
        with tf.GradientTape() as tape:
            a_ = self.actor_(bs_)
            q_ = br + self.gamma * self.critic_(tf.concat([bs_, a_], 1))
            q = self.critic(tf.concat([bs, ba], 1))
            critic_loss = self.mse(q_, q)
        grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.c_opt.apply_gradients(zip(grads, self.critic.trainable_variables))
        return actor_loss.numpy(), critic_loss.numpy()

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.memory_capacity   # overwrite the oldest memories once full
        self.memory[index, :] = transition
        self.pointer += 1
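Before wiring the class into gym, a quick smoke test with random transitions can confirm that the networks build and that one update step runs. This is a minimal sketch of my own, not from the original article; the dimensions match the Pendulum-v1 setup used in the next section:

import numpy as np

agent = DDPG(a_dim=1, s_dim=3, a_bound=[-2, 2], batch_size=32, memory_capacity=100)

# fill the small replay buffer with random transitions
for _ in range(100):
    s = np.random.uniform(-1, 1, 3)
    a = np.random.uniform(-2, 2, 1)
    r = np.random.uniform(-1, 0)
    s_ = np.random.uniform(-1, 1, 3)
    agent.store_transition(s, a, r, s_)

a_loss, c_loss = agent.learn()      # one gradient step on actor and critic
agent.param_replace()               # one soft update of the target networks
print("actor loss: %.4f, critic loss: %.4f" % (a_loss, c_loss))
print("sample action:", agent.act(np.zeros(3)))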
3. Testing in a gym environment
import gym
import numpy as np

model = DDPG(a_dim=1, s_dim=3, a_bound=[-2, 2], batch_size=128, tau=0.01, gamma=0.9, memory_capacity=10000)

env = gym.make('Pendulum-v1')
RENDER = False
env = env.unwrapped
max_ep_step = 200
var = 3                  # std of the Gaussian exploration noise

for ep in range(200):
    s = env.reset(seed=1)    # gym 0.24: reset returns the observation; the same seed is reused every episode
    ep_reward = 0
    for step in range(max_ep_step):
        if RENDER:
            env.render()
        a = model.act(s)
        a = np.clip(np.random.normal(a, var), -2, 2)   # add exploration noise, then clip to the action bounds
        s_, r, done, info = env.step(a)
        model.store_transition(s, a, r / 10, s_)
        if model.pointer > 10000:        # start training once the replay buffer is full
            var *= .9995                 # gradually decay the exploration noise
            model.learn()
            model.param_replace()
        s = s_
        ep_reward += r
        if step == max_ep_step - 1:
            print('Episode:', ep, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            if ep_reward > -300:
                RENDER = True
            break
env.close()
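The original script stops after training. Since the actor is a plain keras.Model, the learned policy can be kept with the standard Keras weight API; a minimal sketch of mine, with a placeholder checkpoint path:

# save the trained actor (policy) and reload it later for evaluation
model.actor.save_weights("./ddpg_pendulum_actor")

# in a new session: rebuild the agent, then restore the actor weights
new_model = DDPG(a_dim=1, s_dim=3, a_bound=[-2, 2])
new_model.actor.load_weights("./ddpg_pendulum_actor")
print(new_model.act(np.zeros(3)))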
DDPG implementation with tf1.x
1. Dependencies
- tensorflow==1.7.0
- gym==0.25.0
- python==3.6.0
- Tested on Windows (a note on running this under TensorFlow 2.x follows below)
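The code below uses the tf1.x graph/session API. If only TensorFlow 2.x is installed, it can usually still be run through the v1 compatibility layer; this is my assumption and not part of the original setup, a minimal sketch:

# run tf1.x-style code on a TensorFlow 2.x installation (replaces "import tensorflow as tf")
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()   # restores placeholders, sessions and tf1.x graph semantics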
2. DDPG implementation
import tensorflow as tf
import numpy as np

LR_A = 0.001               # actor learning rate
LR_C = 0.001               # critic learning rate
GAMMA = 0.9                # discount factor
TAU = 0.01                 # soft-update coefficient
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32


class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        # replay buffer: each row stores one transition (s, a, r, s_)
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # soft update: target = (1 - TAU) * target + TAU * eval
        self.soft_replace = [[tf.assign(ta, (1 - TAU) * ta + TAU * ea), tf.assign(tc, (1 - TAU) * tc + TAU * ec)]
                             for ta, ea, tc, ec in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        # critic update: minimize the TD error against the target networks
        q_target = self.R + GAMMA * q_
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        # actor update: deterministic policy gradient; the negative learning rate turns descent into ascent on Q
        self.policy_grads = tf.gradients(ys=self.a, xs=self.ae_params, grad_ys=tf.gradients(q, self.a)[0])
        self.atrain = tf.train.AdamOptimizer(-LR_A).apply_gradients(zip(self.policy_grads, self.ae_params))

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        self.sess.run(self.soft_replace)
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY   # overwrite the oldest memories once full
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')   # scale the tanh output to the action bound

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)
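The class above keeps everything in a live tf.Session and never writes anything to disk. If you want to keep a trained model, the usual tf1.x route is tf.train.Saver; a minimal sketch of mine, assuming ddpg is the agent instance created in the test script below and using a placeholder checkpoint path:

# create the saver after the graph is built, then write all session variables to disk
saver = tf.train.Saver()
ckpt_path = saver.save(ddpg.sess, './ddpg_tf1.ckpt')
print('model saved to', ckpt_path)

# to reload: rebuild the same graph (construct DDPG again in a fresh process), then
# saver.restore(ddpg.sess, './ddpg_tf1.ckpt')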
3. Testing in a gym environment
import numpy as np
import gym
from RL_brain import DDPG   # the DDPG class above, saved as RL_brain.py

MAX_EPISODES = 200
MAX_EP_STEPS = 200
MEMORY_CAPACITY = 10000
RENDER = False
ENV_NAME = 'Pendulum-v1'

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3                  # std of the Gaussian exploration noise
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)   # add exploration noise, then clip to the action bounds
        # newer gym versions return a 5-tuple here; with the old 4-tuple API use: s_, r, done, info = env.step(a)
        s_, r, done, truncated, info = env.step(a)
        ddpg.store_transition(s, a, r / 10, s_)
        if ddpg.pointer > MEMORY_CAPACITY:   # start training once the replay buffer is full
            var *= .9995                     # gradually decay the exploration noise
            ddpg.learn()
        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            if ep_reward > -300:
                RENDER = True
            break
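After training, the learned policy can be checked by rolling it out greedily, i.e. without exploration noise. This evaluation loop is an addition of mine, not part of the referenced book's code, and assumes the same 5-value step return used above (drop the extra values on older gym versions):

# greedy evaluation: act with the deterministic policy only
s = env.reset()
eval_reward = 0
for _ in range(MAX_EP_STEPS):
    env.render()
    a = ddpg.choose_action(s)     # no noise; the actor output is already bounded by tanh * a_bound
    s, r, done, truncated, info = env.step(a)
    eval_reward += r
print('greedy episode reward: %i' % int(eval_reward))
env.close()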
References
- tf1.x implementation: from the book 《强化学习》 (Reinforcement Learning), by 邹伟, 鬲玲, 刘昱杓, Tsinghua University Press;
- tf2.x implementation: morvan's reinforcement learning robot arm, gitee link: https://gitee.com/TIPE/robotArm