Full code: https://github.com/zle1992/Reinforcement_Learning_Game
Paper: "Continuous control with deep reinforcement learning" https://arxiv.org/pdf/1509.02971.pdf
Deep Deterministic Policy Gradient (DDPG)
How DDPG differs from AC:
AC:
  Actor: updates its parameters with the td_error, which comes from the Critic.
  Critic: updates its gradients from the Bellman equation of the state-value function V(s).
DDPG:
  Actor: maximizes q and outputs the action.
  Critic: updates its gradients from the Bellman equation of the action-value function Q(s, a) and outputs the q value.
DDPG can only produce continuous action outputs (it is built for continuous action spaces). A minimal sketch of the two loss shapes follows.
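To make the contrast concrete, here is a minimal NumPy sketch of the two actor losses and the two critic targets; the function names are illustrative, not from the repo.

import numpy as np

GAMMA = 0.9

# AC: stochastic actor, trained with log-probabilities weighted by the td_error;
# the td_error comes from a critic that models V(s).
def ac_actor_loss(log_prob_a, td_error):
    return -np.mean(log_prob_a * td_error)

def ac_critic_target(r, v_s_next):
    return r + GAMMA * v_s_next          # compared against V(s)

# DDPG: deterministic actor, trained to maximize the critic's Q(s, mu(s)),
# so its loss is simply -mean(q).
def ddpg_actor_loss(q):
    return -np.mean(q)

def ddpg_critic_target(r, q_s_next_a_next):
    return r + GAMMA * q_s_next_a_next   # compared against Q(s, a); a_ comes from the target actor

print(ddpg_actor_loss(np.array([1.0, 2.0])))  # -1.5
print(ddpg_critic_target(1.0, 2.0))           # 2.8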
Putting the logic together:
1. DDPG is an AC-style model; one training sample is (S, R, S_, A).
2. Actor
   input: S
   output: a
   loss: maximize q
   (q comes from the Critic)
3. Critic
   input: S, A
   output: q
   loss: R + GAMMA * q_ - q
Now the question: how do we get q_? The Critic network could take (S_, a_) and produce q_, but we cannot reuse the same trainable network for its own target, so we exploit a time lag and keep a second, non-trainable copy, Critic2 (the target critic).
Critic2 in turn needs a_. How do we get that? The Actor network, fed S_, can output a_; likewise we use a non-trainable copy, Actor2 (the target actor), to produce a_.
Flow (a small sketch of the soft target update follows this list):
a  = actor(s,  train)
a_ = actor(s_, not_train)
q  = critic(s,  a,  train)
q_ = critic(s_, a_, not_train)
a_loss = -mean(q)                 # i.e. maximize q
c_loss = MSE(R + GAMMA * q_, q)
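The non-trainable target networks are kept close to the trainable ones by a soft (Polyak) update with rate TAU; this is what soft_replace does in the code below. A stand-alone sketch of that update, assuming toy scalar weights and the same TAU value used in game.py:

import numpy as np

TAU = 0.01  # soft-replacement rate, same value passed to DDPG4Pendulum below

def soft_replace(target_params, eval_params, tau=TAU):
    """Move each target weight a small step toward its eval counterpart."""
    return [(1 - tau) * t + tau * e for t, e in zip(target_params, eval_params)]

# toy example: one scalar weight per network
target_w = [np.array(0.0)]
eval_w = [np.array(1.0)]
for _ in range(3):
    target_w = soft_replace(target_w, eval_w)
print(target_w)  # creeps toward eval_w: roughly 0.0297 after three updates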
Code:
DDPG.py
import os
import numpy as np
import tensorflow as tf
from abc import ABCMeta, abstractmethod
np.random.seed(1)
tf.set_random_seed(1)

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# With the level set to DEBUG, all log messages below are printed to the console.

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class DDPG(object):
    """Base DDPG agent; subclasses must provide the actor and critic networks."""
    __metaclass__ = ABCMeta

    def __init__(self,
                 n_actions,
                 n_features,
                 reward_decay,
                 lr_a,
                 lr_c,
                 memory_size,
                 output_graph,
                 log_dir,
                 model_dir,
                 TAU,
                 a_bound,
                 ):
        super(DDPG, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.gamma = reward_decay
        self.memory_size = memory_size
        self.output_graph = output_graph
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.log_dir = log_dir
        self.model_dir = model_dir
        # total learning steps
        self.learn_step_counter = 0
        self.TAU = TAU  # soft replacement rate
        self.a_bound = a_bound

        self.s = tf.placeholder(tf.float32, [None] + self.n_features, name='s')
        self.s_next = tf.placeholder(tf.float32, [None] + self.n_features, name='s_next')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a_net(self.s, scope='eval', trainable=True)
            a_ = self._build_a_net(self.s_next, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            q = self._build_c_net(self.s, self.a, scope='eval', trainable=True)
            q_ = self._build_c_net(self.s_next, a_, scope='target', trainable=False)

        # network parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        with tf.variable_scope('train_op_actor'):
            # maximize q  <=>  minimize -mean(q); only the eval actor is updated
            self.loss_actor = -tf.reduce_mean(q)
            self.train_op_actor = tf.train.AdamOptimizer(self.lr_a).minimize(self.loss_actor, var_list=self.ae_params)

        with tf.variable_scope('train_op_critic'):
            # Bellman target uses the target networks' q_; only the eval critic is updated
            q_target = self.r + self.gamma * q_
            self.loss_critic = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.train_op_critic = tf.train.AdamOptimizer(self.lr_c).minimize(self.loss_critic, var_list=self.ce_params)

        # soft target replacement: target <- (1 - TAU) * target + TAU * eval
        self.soft_replace = [tf.assign(t, (1 - self.TAU) * t + self.TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]

        self.sess = tf.Session()
        if self.output_graph:
            tf.summary.FileWriter(self.log_dir, self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.cost_his = [0]
        self.cost = 0

        self.saver = tf.train.Saver()

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        checkpoint = tf.train.get_checkpoint_state(self.model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Loading Successfully")
            self.learn_step_counter = int(checkpoint.model_checkpoint_path.split('-')[-1]) + 1

    @abstractmethod
    def _build_a_net(self, s, scope, trainable):
        raise NotImplementedError

    @abstractmethod
    def _build_c_net(self, s, a, scope, trainable):
        raise NotImplementedError

    def learn(self, data):
        # soft target replacement
        self.sess.run(self.soft_replace)

        batch_memory_s = data['s']
        batch_memory_a = data['a']
        batch_memory_r = data['r']
        batch_memory_s_ = data['s_']

        _, cost = self.sess.run(
            [self.train_op_actor, self.loss_actor],
            feed_dict={
                self.s: batch_memory_s,
            })

        _, cost = self.sess.run(
            [self.train_op_critic, self.loss_critic],
            feed_dict={
                self.s: batch_memory_s,
                self.a: batch_memory_a,
                self.r: batch_memory_r,
                self.s_next: batch_memory_s_,
            })

        self.cost_his.append(cost)
        self.cost = cost
        self.learn_step_counter += 1
        # save the network every 10000 iterations
        if self.learn_step_counter % 10000 == 0:
            self.saver.save(self.sess, self.model_dir, global_step=self.learn_step_counter)

    def choose_action(self, s):
        # deterministic action for a single state
        return self.sess.run(self.a, {self.s: s[np.newaxis, :]})[0]
game.py
import sys
import gym
import numpy as np
import tensorflow as tf
sys.path.append('./')
sys.path.append('model')

from util import Memory, StateProcessor
from DDPG import DDPG
from ACNetwork import ACNetwork
np.random.seed(1)
tf.set_random_seed(1)

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# With the level set to DEBUG, all log messages below are printed to the console.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class DDPG4Pendulum(DDPG):
    """DDPG agent for Pendulum-v0."""
    def __init__(self, **kwargs):
        super(DDPG4Pendulum, self).__init__(**kwargs)

    def _build_a_net(self, s, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
        with tf.variable_scope(scope):
            e1 = tf.layers.dense(inputs=s,
                                 units=30,
                                 bias_initializer=b_initializer,
                                 kernel_initializer=w_initializer,
                                 activation=tf.nn.relu,
                                 trainable=trainable)
            a = tf.layers.dense(inputs=e1,
                                units=self.n_actions,
                                bias_initializer=b_initializer,
                                kernel_initializer=w_initializer,
                                activation=tf.nn.tanh,
                                trainable=trainable)

        # tanh output in [-1, 1], scaled to the action bound
        return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c_net(self, s, a, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', self.n_features + [n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.n_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            # first layer mixes state and action before estimating q
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            q = tf.layers.dense(inputs=net,
                                units=1,
                                bias_initializer=b_initializer,
                                kernel_initializer=w_initializer,
                                activation=None,
                                trainable=trainable)

        return q


batch_size = 32
memory_size = 10000
env = gym.make('Pendulum-v0')  # continuous action space

n_features = [env.observation_space.shape[0]]
n_actions = env.action_space.shape[0]
a_bound = env.action_space.high
env = env.unwrapped
MAX_EP_STEPS = 200


def run():
    RL = DDPG4Pendulum(
        n_actions=n_actions,
        n_features=n_features,
        reward_decay=0.9,
        lr_a=0.001,
        lr_c=0.002,
        memory_size=memory_size,
        TAU=0.01,
        output_graph=False,
        log_dir='Pendulum/log/DDPG4Pendulum/',
        a_bound=a_bound,
        model_dir='Pendulum/model_dir/DDPG4Pendulum/'
        )

    memory = Memory(n_actions, n_features, memory_size=memory_size)

    var = 3  # controls exploration
    step = 0

    for episode in range(2000):
        # initial observation
        observation = env.reset()
        ep_r = 0

        for j in range(MAX_EP_STEPS):
            # RL chooses an action based on the observation
            action = RL.choose_action(observation)
            action = np.clip(np.random.normal(action, var), -2, 2)  # add randomness to the action for exploration
            # RL takes the action and gets the next observation and reward
            observation_, reward, done, info = env.step(action)

            memory.store_transition(observation, action, reward / 10, observation_)

            if step > memory_size:
                # env.render()
                var *= .9995  # decay the action randomness
                data = memory.sample(batch_size)
                RL.learn(data)

            # swap observation
            observation = observation_
            ep_r += reward
            if episode > 200:
                env.render()  # render on the screen
            # break the inner loop at the end of this episode
            if j == MAX_EP_STEPS - 1:
                print('step: ', step,
                      'episode: ', episode,
                      'ep_r: ', round(ep_r, 2),
                      'var:', var,
                      )
                break
            step += 1

    # end of game
    print('game over')
    env.close()


def main():
    run()


if __name__ == '__main__':
    main()
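One detail worth noting in run(): because the actor is deterministic, exploration comes from Gaussian noise added around its output, clipped to the Pendulum torque range [-2, 2], with the noise scale decaying once learning starts. A tiny stand-alone sketch of that schedule (the action value is made up for illustration):

import numpy as np

var = 3.0      # initial exploration scale, as in run()
action = 0.5   # pretend this is the deterministic actor output
for step in range(3):
    noisy = np.clip(np.random.normal(action, var), -2, 2)  # Pendulum torque lies in [-2, 2]
    var *= .9995  # same decay rate as the training loop
    print(step, round(float(noisy), 3), round(var, 6))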