DDPG: An Introduction (2)
DDPG does not output action probabilities; it outputs a concrete action directly, which makes it suited to predicting continuous actions.
Formula derivation
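As a quick sketch of the updates the code below implements (standard DDPG; the notation here is mine): the critic is trained to minimize the TD error against a target computed from the target networks,

$$y = r + \gamma\, Q'\big(s',\, \mu'(s')\big), \qquad L(\theta^{Q}) = \mathbb{E}\big[(y - Q(s, a))^{2}\big],$$

and the actor (the deterministic policy $\mu$) is updated by gradient ascent along the deterministic policy gradient

$$\nabla_{\theta^{\mu}} J \approx \mathbb{E}\big[\nabla_{a} Q(s, a)\big|_{a=\mu(s)}\, \nabla_{\theta^{\mu}} \mu(s)\big].$$

The target networks $\mu'$ and $Q'$ track the learned networks through a soft update $\theta' \leftarrow \tau\theta + (1-\tau)\theta'$ (or a periodic hard copy), which corresponds to the soft/hard REPLACEMENT option in the code.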
The code implements the Pendulum game from gym, which uses continuous actions.
Introduction to the Pendulum environment
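In Pendulum-v0 the observation is the 3-dimensional vector [cos(theta), sin(theta), theta_dot], the action is a single torque bounded to [-2, 2], and the reward penalizes the angle away from upright, the angular velocity, and the control effort, so the agent has to swing the pendulum up and keep it balanced. A minimal sketch for inspecting the spaces (assuming the same old gym 0.8-style API as the script below):

import gym

env = gym.make('Pendulum-v0')
print(env.observation_space)       # Box(3,): [cos(theta), sin(theta), theta_dot]
print(env.observation_space.high)  # [1. 1. 8.]
print(env.action_space)            # Box(1,): torque
print(env.action_space.high)       # [2.]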
Code walkthrough
""" Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. DDPG is Actor Critic based algorithm. Pendulum example. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ Using: tensorflow 1.0 gym 0.8.0 """ import tensorflow as tf import numpy as np import gym import time np.random.seed(1) tf.set_random_seed(1) ##################### hyper parameters #################### MAX_EPISODES = 200 MAX_EP_STEPS = 200 lr_a = 0.001 # learning rate for actor lr_c = 0.001 # learning rate for critic gamma = 0.9 # reward discount REPLACEMENT = [ dict(name='soft', tau=0.01), dict(name='hard', rep_iter_a=600, rep_iter_c=500) ][0] # you can try different target replacement strategies MEMORY_CAPACITY = 10000 BATCH_SIZE = 32 RENDER = True OUTPUT_GRAPH = True ENV_NAME = 'Pendulum-v0' ############################### Actor #################################### class Actor(object): def __init__(self, sess, action_dim, action_bound, learning_rate, replacement): self.sess = sess self.a_dim = action_dim self.action_bound = action_bound self.lr = learning_rate self.replacement = replacement self.t_replace_counter = 0 with tf.variable_scope('Actor'): # 这个网络用于及时更新参数 # input s, output a self.a = self._build_net(S, scope='eval_net', trainable=True) ##这个网络不及时更新参数, 用于预测action # input s_, output a, get a_ for critic self.a_ = self._build_net(S_, scope='target_net', trainable=False) self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') if self.replacement['name'] == 'hard': self.t_replace_counter = 0 self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)] else: self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e) for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, scope, trainable):#根据state预测action的网络 with tf.variable_scope(scope): init_w = tf.random_normal_initializer(0., 0.3) init_b = tf.constant_initializer(0.1) net = tf.layers.dense(s, 30, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable) with tf.variable_scope('a'): actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, bias_initializer=init_b, name='a', trainable=trainable) scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound return scaled_a def learn(self, s): # batch update self.sess.run(self.train_op, feed_dict={S: s}) if self.replacement['name'] == 'soft': self.sess.run(self.soft_replace) else: if self.t_replace_counter % self.replacement['rep_iter_a'] == 0: self.sess.run(self.hard_replace) self.t_replace_counter += 1 def choose_action(self, s): s = s[np.newaxis, :] # single state return self.sess.run(self.a, feed_dict={S: s})[0] # single action def add_grad_to_graph(self, a_grads): with tf.variable_scope('policy_grads'): # ys = policy; # xs = policy's parameters; # a_grads = the gradients of the policy to get more Q # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) with tf.variable_scope('A_train'): opt = tf.train.AdamOptimizer(-self.lr) # (- learning rate) for ascent policy self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))#对eval_net的参数更新 ############################### Critic #################################### 
class Critic(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.replacement = replacement

        with tf.variable_scope('Critic'):
            # Input (s, a), output q
            self.a = tf.stop_gradient(a)    # stop critic update flowing into the actor
            # eval_net: parameters are updated at every training step
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

            # target_net: parameters are not updated immediately; used to evaluate the actor
            # Input (s_, a_), output q_ for q_target
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)   # target_q is based on a_ from Actor's target_net

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')

        with tf.variable_scope('target_q'):
            self.target_q = R + self.gamma * self.q_                                    # compute the TD target

        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))    # compute the loss

        with tf.variable_scope('C_train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)         # train the critic

        with tf.variable_scope('a_grad'):
            # dQ/da for each sample, shape (None, a_dim); taken w.r.t. self.a (the stop_gradient
            # output that actually feeds eval_net), otherwise the gradient would be None
            self.a_grads = tf.gradients(self.q, self.a)[0]

        if self.replacement['name'] == 'hard':
            self.t_replace_counter = 0
            self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
        else:
            self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
                                     for t, e in zip(self.t_params, self.e_params)]

    def _build_net(self, s, a, scope, trainable):
        # Q network: computes Q(s, a)
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.1)
            init_b = tf.constant_initializer(0.1)

            with tf.variable_scope('l1'):
                n_l1 = 30
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)
        return q

    def learn(self, s, a, r, s_):
        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
        if self.replacement['name'] == 'soft':
            self.sess.run(self.soft_replacement)
        else:
            if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
                self.sess.run(self.hard_replacement)
            self.t_replace_counter += 1

#####################  Memory  ####################

class Memory(object):
    def __init__(self, capacity, dims):
        self.capacity = capacity
        self.data = np.zeros((capacity, dims))
        self.pointer = 0

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.capacity  # replace the old memory with new memory
        self.data[index, :] = transition
        self.pointer += 1

    def sample(self, n):
        assert self.pointer >= self.capacity, 'Memory has not been filled'
        indices = np.random.choice(self.capacity, size=n)
        return self.data[indices, :]

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

state_dim = env.observation_space.shape[0]   # 3
action_dim = env.action_space.shape[0]       # 1, the continuous action is one-dimensional
action_bound = env.action_space.high         # [2.]

# all placeholders for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')

sess = tf.Session()

# Create actor and critic.
# They are actually connected to each other; details can be seen in TensorBoard.
actor = Actor(sess, action_dim, action_bound, lr_a, REPLACEMENT)
critic = Critic(sess, state_dim, action_dim, lr_c, gamma, REPLACEMENT, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)   # feed the dQ/da produced by the critic into the Actor's graph

sess.run(tf.global_variables_initializer())

M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1)

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

var = 3  # control exploration

t1 = time.time()

for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0

    for j in range(MAX_EP_STEPS):

        if RENDER:
            env.render()

        # Add exploration noise
        a = actor.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        M.store_transition(s, a, r / 10, s_)

        if M.pointer > MEMORY_CAPACITY:     # start training only once the replay memory is full
            var *= .9995                    # decay the action randomness
            b_M = M.sample(BATCH_SIZE)
            b_s = b_M[:, :state_dim]
            b_a = b_M[:, state_dim: state_dim + action_dim]
            b_r = b_M[:, -state_dim - 1: -state_dim]
            b_s_ = b_M[:, -state_dim:]

            critic.learn(b_s, b_a, b_r, b_s_)
            actor.learn(b_s)

        s = s_
        ep_reward += r

        if j == MAX_EP_STEPS - 1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            if ep_reward > -300:
                RENDER = True
            break

print('Running time: ', time.time() - t1)