Introduction
In this section, we will walk through an implementation of the dueling DQN. It is essentially the same as the regular DQN, except that the fully connected layer is split into two streams, a value stream and an advantage stream, which are finally aggregated to compute the Q function. We will also see how to train an agent with the dueling DQN to win a car racing game.
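Before diving into the code, here is the aggregation rule in isolation: the Q value is the state value plus the advantage, with the mean advantage subtracted so that the two streams stay identifiable. A minimal NumPy sketch with made-up numbers:

import numpy as np

# Dummy stream outputs for a batch of two states and three actions (illustrative values only)
value = np.array([[1.0], [0.5]])               # V(s), shape (batch, 1)
advantage = np.array([[2.0, 0.0, 1.0],
                      [1.0, 3.0, 2.0]])        # A(s, a), shape (batch, actions)

# Aggregation layer: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
q_values = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q_values)                                # shape (batch, actions)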
Code implementation
1. Environment wrapper functions
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tensorflow as tf
import gym
from gym.spaces import Box
from scipy.misc import imresize
import random
import cv2
import time
import logging
import os
import sys
Define the EnvWrapper class along with a few environment wrapper functions:
class EnvWrapper:

    # First we define the __init__ method and initialize the variables
    def __init__(self, env_name, debug=False):

        # environment name
        self.env_name = env_name

        # initialize the gym environment
        self.env = gym.make(env_name)

        # get the action space
        self.action_space = self.env.action_space

        # get the observation space
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 4))

        # initialize frame_num for storing the frame count
        self.frame_num = 0

        # initialize monitor for recording the game screen
        self.monitor = self.env.monitor

        # initialize frames
        self.frames = np.zeros((84, 84, 4), dtype=np.uint8)

        # initialize a boolean called debug; when set to true, the last few frames are displayed
        self.debug = debug

        if self.debug:
            cv2.startWindowThread()
            cv2.namedWindow("Game")

    # Next we define a step function, where we perform an action in the environment,
    # receive a reward and move to the next state. It takes an action as input and
    # returns the preprocessed frame as the next state.
    def step(self, a):
        ob, reward, done, xx = self.env.step(a)
        return self.process_frame(ob), reward, done, xx

    # We define a helper function called reset for resetting the environment;
    # after resetting, it returns the preprocessed game screen
    def reset(self):
        self.frame_num = 0
        return self.process_frame(self.env.reset())

    # Another helper function for rendering the environment
    def render(self):
        return self.env.render()

    # Now we define the process_frame function for preprocessing the frame
    def process_frame(self, frame):

        # convert the image to grayscale
        state_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # resize the image
        state_resized = cv2.resize(state_gray, (84, 110))

        # crop it to 84 x 84
        gray_final = state_resized[16:100, :]

        if self.frame_num == 0:
            self.frames[:, :, 0] = gray_final
            self.frames[:, :, 1] = gray_final
            self.frames[:, :, 2] = gray_final
            self.frames[:, :, 3] = gray_final
        else:
            self.frames[:, :, 3] = self.frames[:, :, 2]
            self.frames[:, :, 2] = self.frames[:, :, 1]
            self.frames[:, :, 1] = self.frames[:, :, 0]
            self.frames[:, :, 0] = gray_final

        # increment the frame_num counter
        self.frame_num += 1

        if self.debug:
            cv2.imshow('Game', gray_final)

        return self.frames.copy()
After preprocessing, the game screen looks as shown in the following figure:
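As a quick sanity check of the wrapper, we can create it, reset it, and take one random step; both observations should come back as 84 x 84 x 4 stacks of grayscale frames. This is only a sketch, assuming the older gym release used throughout this chapter (which still exposes env.monitor) with the Atari environments installed:

# Sketch: sanity-check EnvWrapper (assumes the old gym API used in this chapter)
wrapped_env = EnvWrapper('Seaquest-v0')

state = wrapped_env.reset()
print(state.shape)                 # (84, 84, 4): four stacked grayscale frames

next_state, reward, done, _ = wrapped_env.step(wrapped_env.action_space.sample())
print(next_state.shape)            # same stacked shape for the next state
print(reward)
print(done)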
2. Dueling network
Now, let's build the dueling DQN. We first build three convolutional layers followed by two fully connected layers, where the final fully connected layer is split into two separate streams, the value stream and the advantage stream. An aggregation layer then combines the two streams to compute the Q value. The layer dimensions, which can also be read off the weight shapes in the code below, are: 32 filters of 8 x 8 with stride 4, 64 filters of 4 x 4 with stride 2, and 64 filters of 3 x 3 with stride 1 for the convolutional layers; 512 units in each of the two fully connected streams; a single-unit value output; and an advantage output with one unit per action:
class QNetworkDueling():

    # We define the __init__ method to initialize all layers.
    # Note: the weight_variable, bias_variable, print_num_of_parameters and variables()
    # helpers used below are assumed to be provided by a base QNetwork class that is
    # not shown in this listing.
    def __init__(self, input_size, output_size, name):
        self.name = name
        self.input_size = input_size
        self.output_size = output_size

        with tf.variable_scope(self.name):

            # three convolutional layers
            self.W_conv1 = self.weight_variable([8, 8, 4, 32])
            self.B_conv1 = self.bias_variable([32])
            self.stride1 = 4

            self.W_conv2 = self.weight_variable([4, 4, 32, 64])
            self.B_conv2 = self.bias_variable([64])
            self.stride2 = 2

            self.W_conv3 = self.weight_variable([3, 3, 64, 64])
            self.B_conv3 = self.bias_variable([64])
            self.stride3 = 1

            # fully connected layer 1
            self.W_fc4a = self.weight_variable([7*7*64, 512])
            self.B_fc4a = self.bias_variable([512])

            # fully connected layer 2
            self.W_fc4b = self.weight_variable([7*7*64, 512])
            self.B_fc4b = self.bias_variable([512])

            # value stream
            self.W_fc5a = self.weight_variable([512, 1])
            self.B_fc5a = self.bias_variable([1])

            # advantage stream
            self.W_fc5b = self.weight_variable([512, self.output_size])
            self.B_fc5b = self.bias_variable([self.output_size])

        # print the number of parameters in the network
        self.print_num_of_parameters()

    # Now we define the __call__ method to perform the convolutional operations
    def __call__(self, input_tensor):
        if type(input_tensor) == list:
            input_tensor = tf.concat(1, input_tensor)

        with tf.variable_scope(self.name):

            # perform the convolutional operations on the three layers
            self.h_conv1 = tf.nn.relu( tf.nn.conv2d(input_tensor, self.W_conv1, strides=[1, self.stride1, self.stride1, 1], padding='VALID') + self.B_conv1 )
            self.h_conv2 = tf.nn.relu( tf.nn.conv2d(self.h_conv1, self.W_conv2, strides=[1, self.stride2, self.stride2, 1], padding='VALID') + self.B_conv2 )
            self.h_conv3 = tf.nn.relu( tf.nn.conv2d(self.h_conv2, self.W_conv3, strides=[1, self.stride3, self.stride3, 1], padding='VALID') + self.B_conv3 )

            # flatten the convolutional output
            self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 7*7*64])

            # feed the flattened convolutional output into the two fully connected layers
            self.h_fc4a = tf.nn.relu(tf.matmul(self.h_conv3_flat, self.W_fc4a) + self.B_fc4a)
            self.h_fc4b = tf.nn.relu(tf.matmul(self.h_conv3_flat, self.W_fc4b) + self.B_fc4b)

            # compute the value stream and the advantage stream
            self.h_fc5a_value = tf.identity(tf.matmul(self.h_fc4a, self.W_fc5a) + self.B_fc5a)
            self.h_fc5b_advantage = tf.identity(tf.matmul(self.h_fc4b, self.W_fc5b) + self.B_fc5b)

            # combine the value and advantage streams to get the Q value
            self.h_fc6 = self.h_fc5a_value + ( self.h_fc5b_advantage - tf.reduce_mean(self.h_fc5b_advantage, reduction_indices=[1,], keep_dims=True) )

        return self.h_fc6
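The 7*7*64 size of the flattened convolutional output used by the fully connected layers follows from the 84 x 84 input and the three VALID convolutions above. A small stand-alone helper (not part of the agent) makes the arithmetic explicit:

# Sketch: where the 7*7*64 flattened size comes from
def conv_output_size(size, kernel, stride):
    # output width/height of a VALID convolution on a square input
    return (size - kernel) // stride + 1

size = 84                                        # preprocessed frame width/height
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
    size = conv_output_size(size, kernel, stride)
    print(size)                                  # 20, then 9, then 7

print(size * size * 64)                          # 3136 == 7*7*64, the flattened length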
3. Replay memory
Next, we build the experience replay buffer, which is used to store all of the agent's experience. We sample minibatches of experience from the replay buffer to train the network:
class ReplayMemoryFast:

    # First we define the __init__ method and initialize the buffer size
    def __init__(self, memory_size, minibatch_size):

        # maximum number of samples to store
        self.memory_size = memory_size

        # minibatch size
        self.minibatch_size = minibatch_size

        self.experience = [None]*self.memory_size
        self.current_index = 0
        self.size = 0

    # Next we define a store function for storing the experiences
    def store(self, observation, action, reward, newobservation, is_terminal):

        # store the experience as a tuple (current state, action, reward, next state, is it a terminal state)
        self.experience[self.current_index] = (observation, action, reward, newobservation, is_terminal)
        self.current_index += 1
        self.size = min(self.size+1, self.memory_size)

        # if the index exceeds the memory size, wrap it around by subtracting the memory size
        if self.current_index >= self.memory_size:
            self.current_index -= self.memory_size

    # We define a sample function for sampling a minibatch of experience
    def sample(self):
        if self.size < self.minibatch_size:
            return []

        # first we randomly sample some indices
        samples_index = np.floor(np.random.random((self.minibatch_size,))*self.size)

        # select the experience at the sampled indices
        samples = [self.experience[int(i)] for i in samples_index]

        return samples
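A minimal sketch of how the buffer is exercised: store a handful of transitions with dummy contents and draw a minibatch once enough samples are available:

import numpy as np

# Sketch: exercise ReplayMemoryFast with dummy transitions
memory = ReplayMemoryFast(memory_size=100, minibatch_size=4)

for t in range(10):
    state = np.zeros((84, 84, 4), dtype=np.uint8)         # dummy current state
    next_state = np.zeros((84, 84, 4), dtype=np.uint8)    # dummy next state
    memory.store(state, t % 3, 1.0, next_state, False)    # (s, a, r, s', terminal)

minibatch = memory.sample()
print(len(minibatch))       # 4; sample() returns [] until minibatch_size items are stored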
4. Training the network
Now, let's see how to train the network:
class DQN(object):

    # First we define the DQN class and initialize all variables in the __init__ method
    def __init__(self, state_size,
                       action_size,
                       session,
                       summary_writer = None,
                       exploration_period = 1000,
                       minibatch_size = 32,
                       discount_factor = 0.99,
                       experience_replay_buffer = 10000,
                       target_qnet_update_frequency = 10000,
                       initial_exploration_epsilon = 1.0,
                       final_exploration_epsilon = 0.05,
                       reward_clipping = -1,
                ):

        self.state_size = state_size
        self.action_size = action_size
        self.session = session
        self.exploration_period = float(exploration_period)
        self.minibatch_size = minibatch_size
        self.discount_factor = tf.constant(discount_factor)
        self.experience_replay_buffer = experience_replay_buffer
        self.summary_writer = summary_writer
        self.reward_clipping = reward_clipping
        self.target_qnet_update_frequency = target_qnet_update_frequency
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.num_training_steps = 0

        # initialize the primary dueling DQN by creating an instance of the QNetworkDueling class
        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")

        # similarly, initialize the target dueling DQN
        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, "target_qnet")

        # next, initialize the optimizer as an RMSPropOptimizer
        self.qnet_optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.99, epsilon=0.01)

        # now, initialize the experience replay buffer by creating an instance of the ReplayMemoryFast class
        self.experience_replay = ReplayMemoryFast(self.experience_replay_buffer, self.minibatch_size)

        # set up the computation graph
        self.create_graph()

    # Next we define the copy_to_target_network function for copying weights from the
    # primary network to the target network
    @staticmethod
    def copy_to_target_network(source_network, target_network):
        target_network_update = []
        for v_source, v_target in zip(source_network.variables(), target_network.variables()):

            # this is equivalent to target = source
            update_op = v_target.assign(v_source)
            target_network_update.append(update_op)

        return tf.group(*target_network_update)

    # Now we define the create_graph function and build our computation graph
    def create_graph(self):

        # we compute the Q values and select the action that has the maximum Q value
        with tf.name_scope("pick_action"):

            # placeholder for the state
            self.state = tf.placeholder(tf.float32, (None,)+self.state_size , name="state")

            # Q values
            self.q_values = tf.identity(self.qnet(self.state) , name="q_values")

            # predicted actions
            self.predicted_actions = tf.argmax(self.q_values, dimension=1 , name="predicted_actions")

            # plot a histogram to track the maximum Q values
            tf.histogram_summary("Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1))) # save max q-values to track learning

        # Next we compute the target future reward
        with tf.name_scope("estimating_future_rewards"):
            self.next_state = tf.placeholder(tf.float32, (None,)+self.state_size , name="next_state")
            self.next_state_mask = tf.placeholder(tf.float32, (None,) , name="next_state_mask") # 0 for terminal states
            self.rewards = tf.placeholder(tf.float32, (None,) , name="rewards")

            self.next_q_values_targetqnet = tf.stop_gradient(self.target_qnet(self.next_state), name="next_q_values_targetqnet")
            self.next_q_values_qnet = tf.stop_gradient(self.qnet(self.next_state), name="next_q_values_qnet")
            self.next_selected_actions = tf.argmax(self.next_q_values_qnet, dimension=1)
            self.next_selected_actions_onehot = tf.one_hot(indices=self.next_selected_actions, depth=self.action_size)

            self.next_max_q_values = tf.stop_gradient( tf.reduce_sum( tf.mul( self.next_q_values_targetqnet, self.next_selected_actions_onehot ) , reduction_indices=[1,] ) * self.next_state_mask )
            self.target_q_values = self.rewards + self.discount_factor*self.next_max_q_values

        # Perform the optimization using the RMSProp optimizer defined earlier
        with tf.name_scope("optimization_step"):
            self.action_mask = tf.placeholder(tf.float32, (None, self.action_size) , name="action_mask")
            self.y = tf.reduce_sum( self.q_values * self.action_mask , reduction_indices=[1,])

            # clip the errors
            self.error = tf.abs(self.y - self.target_q_values)
            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
            linear_part = self.error - quadratic_part
            self.loss = tf.reduce_mean( 0.5*tf.square(quadratic_part) + linear_part )

            # optimize the gradients, clipping them by norm
            qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())
            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)
            self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)

        # Copy the primary network weights to the target network
        with tf.name_scope("target_network_update"):
            self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)

        # merge all summaries so they can be written during training
        # (assumed here, since train() below references self.summarize)
        self.summarize = tf.merge_all_summaries()

    # We define a store function for storing all of the experience in the experience replay buffer
    def store(self, state, action, reward, next_state, is_terminal):

        # reward clipping
        if self.reward_clipping > 0.0:
            reward = np.clip(reward, -self.reward_clipping, self.reward_clipping)

        self.experience_replay.store(state, action, reward, next_state, is_terminal)

    # We define an action function for selecting actions using a decaying epsilon-greedy policy
    def action(self, state, training = False):

        if self.num_training_steps > self.exploration_period:
            epsilon = self.final_exploration_epsilon
        else:
            epsilon = self.initial_exploration_epsilon - float(self.num_training_steps) * (self.initial_exploration_epsilon - self.final_exploration_epsilon) / self.exploration_period

        if not training:
            epsilon = 0.05

        # execute a random action with probability epsilon, or follow the Q-network policy with probability 1-epsilon
        if random.random() <= epsilon:
            action = random.randint(0, self.action_size-1)
        else:
            action = self.session.run(self.predicted_actions, {self.state:[state] } )[0]

        return action

    # Now we define a train function for training our network
    def train(self):

        # copy the primary network weights to the target network
        if self.num_training_steps == 0:
            print "Training starts..."
            self.qnet.copy_to(self.target_qnet)

        # sample experience from the replay memory
        minibatch = self.experience_replay.sample()
        if len(minibatch)==0:
            return

        # get the states, actions, rewards and next states from the minibatch
        batch_states = np.asarray( [d[0] for d in minibatch] )
        actions = [d[1] for d in minibatch]
        batch_actions = np.zeros( (self.minibatch_size, self.action_size) )
        for i in xrange(self.minibatch_size):
            batch_actions[i, actions[i]] = 1

        batch_rewards = np.asarray( [d[2] for d in minibatch] )
        batch_newstates = np.asarray( [d[3] for d in minibatch] )
        batch_newstates_mask = np.asarray( [not d[4] for d in minibatch] )

        # perform the training operation
        scores, _, = self.session.run([self.q_values, self.qnet_optimize],
                                      { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions} )

        # periodically update the target network weights
        if self.num_training_steps % self.target_qnet_update_frequency == 0:
            self.session.run( self.hard_copy_to_target )

            # write the logs
            print 'mean maxQ in minibatch: ',np.mean(np.max(scores,1))

            str_ = self.session.run(self.summarize, { self.state: batch_states,
                                                      self.next_state: batch_newstates,
                                                      self.next_state_mask: batch_newstates_mask,
                                                      self.rewards: batch_rewards,
                                                      self.action_mask: batch_actions})
            self.summary_writer.add_summary(str_, self.num_training_steps)

        self.num_training_steps += 1
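Note that estimating_future_rewards above follows the double DQN rule: the primary network picks the greedy next action and the target network evaluates it, with terminal next states masked out. The same computation in plain NumPy, with made-up Q values, for clarity:

import numpy as np

# Dummy Q-value estimates for two transitions and three actions (illustrative values only)
next_q_primary = np.array([[1.0, 2.0, 0.5],
                           [0.2, 0.1, 0.9]])    # from the primary (online) network
next_q_target = np.array([[0.8, 1.5, 0.7],
                          [0.3, 0.2, 1.1]])     # from the target network
rewards = np.array([1.0, 0.0])
next_state_mask = np.array([1.0, 0.0])          # 0 masks out terminal next states
discount = 0.99

# double DQN: the primary network selects the action, the target network evaluates it
best_actions = next_q_primary.argmax(axis=1)                    # [1, 2]
next_values = next_q_target[np.arange(2), best_actions]         # [1.5, 1.1]
targets = rewards + discount * next_values * next_state_mask    # [2.485, 0.0]
print(targets)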
5. Car racing
import gym
import time
import logging
import os
import sys
import numpy as np
import tensorflow as tf
Initialize all of the necessary variables:
ENV_NAME = 'Seaquest-v0'
TOTAL_FRAMES = 20000000
MAX_TRAINING_STEPS = 20*60*60/3
TESTING_GAMES = 30
MAX_TESTING_STEPS = 5*60*60/3
TRAIN_AFTER_FRAMES = 50000
epoch_size = 50000
MAX_NOOP_START = 30
LOG_DIR = 'logs'
Initialize the TensorFlow session:
logger = tf.train.SummaryWriter(LOG_DIR)

# Initialize the tensorflow session
session = tf.InteractiveSession()
outdir = 'results'

# create the wrapped environment (assumed setup; the agent and the
# training loop below expect a variable named env)
env = EnvWrapper(ENV_NAME)
Build the agent:
agent = DQN(state_size=env.observation_space.shape,
            action_size=env.action_space.n,
            session=session,
            summary_writer = logger,
            exploration_period = 1000000,
            minibatch_size = 32,
            discount_factor = 0.99,
            experience_replay_buffer = 1000000,
            target_qnet_update_frequency = 20000,
            initial_exploration_epsilon = 1.0,
            final_exploration_epsilon = 0.1,
            reward_clipping = 1.0)
Store the recordings:
session.run(tf.initialize_all_variables())
logger.add_graph(session.graph)
saver = tf.train.Saver(tf.all_variables())

# record a video every few games; the original schedule helper is not shown
# in this listing, so this is only a placeholder definition
def multiples_video_schedule(episode_id):
    return episode_id % 100 == 0

env.monitor.start(outdir+'/'+ENV_NAME, force = True, video_callable=multiples_video_schedule)

num_frames = 0
num_games = 0
current_game_frames = 0
init_no_ops = np.random.randint(MAX_NOOP_START+1)
last_time = time.time()
last_frame_count = 0.0
state = env.reset()

# render only while testing; keep training headless
# (assumed initialization, since the loop below checks test_mode)
test_mode = False
Now, start the training:
while num_frames <= TOTAL_FRAMES+1:

    if test_mode:
        env.render()

    num_frames += 1
    current_game_frames += 1

    # select an action given the current state
    action = agent.action(state, training = True)

    # perform the action on the environment, receive the reward and move to the next state
    next_state,reward,done,_ = env.step(action)

    # store this transition information in the experience replay buffer
    if current_game_frames >= init_no_ops:
        agent.store(state,action,reward,next_state,done)
    state = next_state

    # train the agent
    if num_frames>=TRAIN_AFTER_FRAMES:
        agent.train()

    if done or current_game_frames > MAX_TRAINING_STEPS:
        state = env.reset()
        current_game_frames = 0
        num_games += 1
        init_no_ops = np.random.randint(MAX_NOOP_START+1)

    # save the network's parameters after every epoch
    if num_frames % epoch_size == 0 and num_frames > TRAIN_AFTER_FRAMES:
        saver.save(session, outdir+"/"+ENV_NAME+"/model_"+str(num_frames/1000)+"k.ckpt")
        print "epoch: frames=",num_frames," games=",num_games

    # test the performance every two epochs
    if num_frames % (2*epoch_size) == 0 and num_frames > TRAIN_AFTER_FRAMES:
        total_reward = 0
        avg_steps = 0
        for i in xrange(TESTING_GAMES):
            state = env.reset()
            init_no_ops = np.random.randint(MAX_NOOP_START+1)
            frm = 0

            while frm < MAX_TESTING_STEPS:
                frm += 1
                env.render()
                action = agent.action(state, training = False)

                if current_game_frames < init_no_ops:
                    action = 0

                state,reward,done,_ = env.step(action)
                total_reward += reward

                if done:
                    break

            avg_steps += frm

        avg_reward = float(total_reward)/TESTING_GAMES

        str_ = session.run( tf.scalar_summary('test reward ('+str(epoch_size/1000)+'k)', avg_reward) )
        logger.add_summary(str_, num_frames)

        print ' --> Evaluation Average Reward: ',avg_reward,' avg steps: ',(avg_steps/TESTING_GAMES)

        state = env.reset()

env.monitor.close()
logger.close()
The complete code is available at https://github.com/PacktPublishing/Hands-On-Reinforcement-Learning-with-Python/tree/master/Chapter12