Capstone Project in Deep Reinforcement Learning: A DQN-Based Car Racing Game

Introduction

  This section walks through the implementation of a dueling DQN. It is largely the same as a regular DQN, except that the final fully connected stage is split into two streams, a value stream and an advantage stream, which are then merged by an aggregating layer to compute the Q function. We will also see how to train an agent with this dueling DQN to win a car racing game.
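Concretely, the aggregating layer combines the two streams by subtracting the mean advantage, which is exactly what the last line of the network's forward pass computes later in this section:

Q(s, a) = V(s) + \left( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \right)

where |\mathcal{A}| denotes the number of actions.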

Code Implementation

1. Environment wrapper functions

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tensorflow as tf
import gym
from gym.spaces import Box
import random
import cv2
import time
import logging
import os
import sys

Define the EnvWrapper class along with a few environment wrapper functions:

class EnvWrapper:


    # First we define the __init__ method and initialize variables

    def __init__(self, env_name, debug=False):
        

        # environment name
        self.env_name = env_name
        
        # initialize the gym environment
        self.env = gym.make(env_name)

        # get the action space
        self.action_space = self.env.action_space

        # get the observation space
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 4))  

        # initialize frame_num, which keeps a count of the frames
        self.frame_num = 0

        # monitor for recording the game screen (available in older gym versions)
        self.monitor = self.env.monitor

        # initialize frames
        self.frames = np.zeros((84, 84, 4), dtype=np.uint8)

        # a debug flag: when set to True, the most recent processed frames are displayed in a window
        self.debug = debug

        if self.debug:
            cv2.startWindowThread()
            cv2.namedWindow("Game")


    # The step function performs an action in the environment, receives the reward
    # and moves to the next state; it returns the preprocessed frame as the next
    # state, along with the reward, done flag and info dictionary

    def step(self, a):
        ob, reward, done, xx = self.env.step(a)
        return self.process_frame(ob), reward, done, xx


    # The reset helper resets the environment and returns the preprocessed
    # initial game screen
    def reset(self):
        self.frame_num = 0
        return self.process_frame(self.env.reset())


    # Another helper function for rendering the environment
    def render(self):
        return self.env.render()


    # The process_frame function preprocesses a raw game frame
    def process_frame(self, frame):

        # convert the image to grayscale (gym returns RGB frames)
        state_gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

        # resize the grayscale image to 84x110
        state_resized = cv2.resize(state_gray,(84,110))

        # crop out the 84x84 playing area
        gray_final = state_resized[16:100,:]


        if self.frame_num == 0:
            self.frames[:, :, 0] = gray_final
            self.frames[:, :, 1] = gray_final
            self.frames[:, :, 2] = gray_final
            self.frames[:, :, 3] = gray_final

        else:
            self.frames[:, :, 3] = self.frames[:, :, 2]
            self.frames[:, :, 2] = self.frames[:, :, 1]
            self.frames[:, :, 1] = self.frames[:, :, 0]
            self.frames[:, :, 0] = gray_final

        
        # increment the frame_num counter

        self.frame_num += 1

        if self.debug:
            cv2.imshow('Game', gray_final)

        return self.frames.copy()

After preprocessing, each game screen becomes an 84x84 grayscale crop of the playing area, stacked with the three previous frames.
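To see this preprocessing in isolation, here is a small standalone sketch that mirrors process_frame; it uses a randomly generated dummy RGB frame rather than a real game screen, so it runs without gym:

import numpy as np
import cv2

def preprocess(frame, frames, frame_num):
    # grayscale -> resize to 84x110 -> crop the 84x84 playing area
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 110))
    cropped = resized[16:100, :]

    if frame_num == 0:
        # first frame of an episode: fill all four channels with the same frame
        for c in range(4):
            frames[:, :, c] = cropped
    else:
        # shift the history: oldest frame ends up in channel 3, newest in channel 0
        frames[:, :, 3] = frames[:, :, 2]
        frames[:, :, 2] = frames[:, :, 1]
        frames[:, :, 1] = frames[:, :, 0]
        frames[:, :, 0] = cropped
    return frames

# a dummy 210x160 RGB frame (the usual Atari screen size)
dummy_frame = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
stack = np.zeros((84, 84, 4), dtype=np.uint8)
stack = preprocess(dummy_frame, stack, frame_num=0)
print(stack.shape)   # (84, 84, 4)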


2. Dueling network

   Now we build the dueling DQN. We first build three convolutional layers, followed by two fully connected layers, where the final fully connected stage is split into two separate streams, the value stream and the advantage stream. The two streams are then combined by an aggregating layer to compute the Q value. The dimensions of these layers are as follows (the resulting feature-map sizes are worked out right after the list):

  • Layer 1: 32 8x8 filters with stride 4 + ReLU
  • Layer 2: 64 4x4 filters with stride 2 + ReLU
  • Layer 3: 64 3x3 filters with stride 1 + ReLU
  • Layer 4a: fully connected, 512 units + ReLU
  • Layer 4b: fully connected, 512 units + ReLU
  • Layer 5a: fully connected, 1 unit (state value)
  • Layer 5b: fully connected, one unit per action (advantage values)
  • Layer 6: aggregating layer, V(s) + A(s, a)
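The flattened size 7*7*64 used in the code below follows directly from these choices: with VALID padding, an 84x84 input becomes (84 - 8)/4 + 1 = 20 after layer 1, (20 - 4)/2 + 1 = 9 after layer 2, and (9 - 3)/1 + 1 = 7 after layer 3, so the final feature map is 7x7x64 = 3136 values.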
class QNetworkDueling():
  
    # The __init__ method initializes all the layers

    def __init__(self, input_size, output_size, name):
        self.name = name
        self.input_size = input_size
        self.output_size = output_size


        with tf.variable_scope(self.name):


            # Three convolutional layers
            self.W_conv1 = self.weight_variable([8, 8, 4, 32]) 
            self.B_conv1 = self.bias_variable([32])
            self.stride1 = 4

            self.W_conv2 = self.weight_variable([4, 4, 32, 64])
            self.B_conv2 = self.bias_variable([64])
            self.stride2 = 2

            self.W_conv3 = self.weight_variable([3, 3, 64, 64])
            self.B_conv3 = self.bias_variable([64])
            self.stride3 = 1

            # fully connected layer 1
            self.W_fc4a = self.weight_variable([7*7*64, 512])
            self.B_fc4a = self.bias_variable([512])

            # fully connected layer 2
            self.W_fc4b = self.weight_variable([7*7*64, 512])
            self.B_fc4b = self.bias_variable([512])

            # value stream
            self.W_fc5a = self.weight_variable([512, 1])
            self.B_fc5a = self.bias_variable([1])

            # advantage stream
            self.W_fc5b = self.weight_variable([512, self.output_size])
            self.B_fc5b = self.bias_variable([self.output_size])


        # print number of parameters in the network
        self.print_num_of_parameters()



    # The __call__ method performs the forward pass (the convolutional operations)

    def __call__(self, input_tensor):
        if type(input_tensor) == list:
            input_tensor = tf.concat(input_tensor, 1)

        with tf.variable_scope(self.name):

            # Perform convolutional operation on three layers
            self.h_conv1 = tf.nn.relu( tf.nn.conv2d(input_tensor, self.W_conv1, strides=[1, self.stride1, self.stride1, 1], padding='VALID') + self.B_conv1 )
            self.h_conv2 = tf.nn.relu( tf.nn.conv2d(self.h_conv1, self.W_conv2, strides=[1, self.stride2, self.stride2, 1], padding='VALID') + self.B_conv2 )
            self.h_conv3 = tf.nn.relu( tf.nn.conv2d(self.h_conv2, self.W_conv3, strides=[1, self.stride3, self.stride3, 1], padding='VALID') + self.B_conv3 )

            # Flatten the convolutional output
            self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 7*7*64])


            # Feed the flattened convolutional output into the fully connected layers
            self.h_fc4a = tf.nn.relu(tf.matmul(self.h_conv3_flat, self.W_fc4a) + self.B_fc4a)
            self.h_fc4b = tf.nn.relu(tf.matmul(self.h_conv3_flat, self.W_fc4b) + self.B_fc4b)


            # Compute the value stream and the advantage stream
            self.h_fc5a_value     = tf.identity(tf.matmul(self.h_fc4a, self.W_fc5a) + self.B_fc5a)
            self.h_fc5b_advantage = tf.identity(tf.matmul(self.h_fc4b, self.W_fc5b) + self.B_fc5b)

            # Combine the value and advantage streams to get the Q values
            self.h_fc6 = self.h_fc5a_value + ( self.h_fc5b_advantage - tf.reduce_mean(self.h_fc5b_advantage, reduction_indices=[1,], keep_dims=True) )


        return self.h_fc6
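
The QNetworkDueling class above relies on a few helpers that are not shown in this listing: weight_variable, bias_variable, variables, and print_num_of_parameters. In the accompanying repository they come from a QNetwork base class. A minimal TF 1.x sketch of such a base class might look like the following (these implementations are an assumption for illustration, not the book's exact code):

import numpy as np
import tensorflow as tf

class QNetwork(object):
    # hypothetical base class providing the helpers assumed by QNetworkDueling

    def weight_variable(self, shape):
        # truncated-normal initialization, a common choice for DQN-style networks
        return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

    def bias_variable(self, shape):
        return tf.Variable(tf.constant(0.01, shape=shape))

    def variables(self):
        # all trainable variables created under this network's variable scope
        return [v for v in tf.trainable_variables() if v.name.startswith(self.name)]

    def print_num_of_parameters(self):
        total = sum(int(np.prod(v.get_shape().as_list())) for v in self.variables())
        print('Number of parameters in', self.name, ':', total)

With such a base class in place, the definition above would be written as class QNetworkDueling(QNetwork).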

3. Replay memory

   Now we build the experience replay buffer, which stores all of the agent's experience. We sample minibatches of experience from this buffer to train the network:

class ReplayMemoryFast:


    # First we define the __init__ method and initialize the buffer size
    def __init__(self, memory_size, minibatch_size):

        # maximum number of samples to store
        self.memory_size = memory_size

        # mini batch size
        self.minibatch_size = minibatch_size

        self.experience = [None]*self.memory_size  
        self.current_index = 0
        self.size = 0


    # next we define the function called store for storing the experiences
    def store(self, observation, action, reward, newobservation, is_terminal):

        # store the experience as a tuple (current state, action, reward, next state, terminal flag)
        self.experience[self.current_index] = (observation, action, reward, newobservation, is_terminal)
        self.current_index += 1

        self.size = min(self.size+1, self.memory_size)
               
        # if the index exceeds the memory size, wrap it around by subtracting the memory size
        if self.current_index >= self.memory_size:
            self.current_index -= self.memory_size



    # The sample function samples a minibatch of experience

    def sample(self):
        if self.size <  self.minibatch_size:
            return []

        # first, randomly sample some indices
        samples_index  = np.floor(np.random.random((self.minibatch_size,))*self.size)

        # select the experiences at the sampled indices
        samples = [self.experience[int(i)] for i in samples_index]

        return samples
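
As a quick standalone check of the buffer, the following sketch stores a few made-up transitions (plain integers instead of frame stacks, purely for illustration) and samples a minibatch:

memory = ReplayMemoryFast(memory_size=100, minibatch_size=4)

# store ten fake transitions: (state, action, reward, next state, terminal flag)
for t in range(10):
    memory.store(t, 0, 1.0, t + 1, False)

minibatch = memory.sample()
print(len(minibatch))   # 4, once at least minibatch_size transitions have been stored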

4. Training the network

   Now, let us look at how the network is trained:

class DQN(object):

    # First we define the class called DQN and initialize all variables in the __init__ method

    def __init__(self, state_size,
                       action_size,
                       session,
                       summary_writer = None,
                       exploration_period = 1000,
                       minibatch_size = 32,
                       discount_factor = 0.99,
                       experience_replay_buffer = 10000,
                       target_qnet_update_frequency = 10000,
                       initial_exploration_epsilon = 1.0,
                       final_exploration_epsilon = 0.05,
                       reward_clipping = -1,
                        ):


        self.state_size = state_size
        self.action_size = action_size


        self.session = session
        self.exploration_period = float(exploration_period)
        self.minibatch_size = minibatch_size
        self.discount_factor = tf.constant(discount_factor)
        self.experience_replay_buffer = experience_replay_buffer
        self.summary_writer = summary_writer
        self.reward_clipping = reward_clipping


        self.target_qnet_update_frequency = target_qnet_update_frequency
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.num_training_steps = 0


        # initialize the primary dueling DQN by creating an instance of our QNetworkDueling class
        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")

        # similarly, initialize the target dueling DQN
        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, "target_qnet")

        # next, initialize the optimizer as an RMSPropOptimizer
        self.qnet_optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.99, epsilon=0.01)

        # now, initialize the experience replay buffer by creating an instance of our ReplayMemoryFast class
        self.experience_replay = ReplayMemoryFast(self.experience_replay_buffer, self.minibatch_size)

        # Setup the computation graph
        self.create_graph()


    # The copy_to_target_network function copies the weights from the
    # primary network to the target network

    @staticmethod
    def copy_to_target_network(source_network, target_network):
        target_network_update = []
        for v_source, v_target in zip(source_network.variables(), target_network.variables()):
            # this is equivalent to target = source
            update_op = v_target.assign(v_source)
            target_network_update.append(update_op)
        return tf.group(*target_network_update)


    # The create_graph function builds the computation graph
    def create_graph(self):

        # compute the Q values and select the action that has the maximum Q value
        with tf.name_scope("pick_action"):

            # placeholder for state
            self.state = tf.placeholder(tf.float32, (None,)+self.state_size , name="state")

            # placeholder for q values
            self.q_values = tf.identity(self.qnet(self.state) , name="q_values")

            # op that selects the greedy (argmax) actions
            self.predicted_actions = tf.argmax(self.q_values, axis=1, name="predicted_actions")

            # log the mean of the max Q values as a histogram summary to track learning
            tf.summary.histogram("Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1)))



        # Next, we calculate the target future reward
        with tf.name_scope("estimating_future_rewards"):

            self.next_state = tf.placeholder(tf.float32, (None,)+self.state_size , name="next_state")
            self.next_state_mask = tf.placeholder(tf.float32, (None,) , name="next_state_mask") # 0 for terminal states
            self.rewards = tf.placeholder(tf.float32, (None,) , name="rewards")

            self.next_q_values_targetqnet = tf.stop_gradient(self.target_qnet(self.next_state), name="next_q_values_targetqnet")


            self.next_q_values_qnet = tf.stop_gradient(self.qnet(self.next_state), name="next_q_values_qnet")
            self.next_selected_actions = tf.argmax(self.next_q_values_qnet, axis=1)
            self.next_selected_actions_onehot = tf.one_hot(indices=self.next_selected_actions, depth=self.action_size)

            self.next_max_q_values = tf.stop_gradient( tf.reduce_sum( tf.multiply( self.next_q_values_targetqnet, self.next_selected_actions_onehot ) , reduction_indices=[1,] ) * self.next_state_mask )


            self.target_q_values = self.rewards + self.discount_factor*self.next_max_q_values



        # perform the optimization step using the RMSProp optimizer
        with tf.name_scope("optimization_step"):
            self.action_mask = tf.placeholder(tf.float32, (None, self.action_size) , name="action_mask")
            self.y = tf.reduce_sum( self.q_values * self.action_mask , reduction_indices=[1,])

            # clip the error
            self.error = tf.abs(self.y - self.target_q_values)
            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
            linear_part = self.error - quadratic_part
            self.loss = tf.reduce_mean( 0.5*tf.square(quadratic_part) + linear_part )
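            # Note: this is the Huber loss with delta = 1, i.e. quadratic (0.5*e^2) for
            # errors below 1 and linear (e - 0.5) beyond, which keeps the gradients bounded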

            # compute and clip the gradients
            qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())

            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)

            self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)

        # op that copies the primary network weights to the target network
        with tf.name_scope("target_network_update"):
            self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)

        # merge all summaries so they can be written out during training
        # (self.summarize is used later in the train() method)
        self.summarize = tf.summary.merge_all()



    # The store function saves an experience in the experience replay buffer
    
    def store(self, state, action, reward, next_state, is_terminal):
        # reward clipping
        if self.reward_clipping > 0.0:
            reward = np.clip(reward, -self.reward_clipping, self.reward_clipping)

        self.experience_replay.store(state, action, reward, next_state, is_terminal)


    # The action function selects an action using a decaying epsilon-greedy policy
    def action(self, state, training = False):

        if self.num_training_steps > self.exploration_period:
            epsilon = self.final_exploration_epsilon
        else:
            epsilon =  self.initial_exploration_epsilon - float(self.num_training_steps) * (self.initial_exploration_epsilon - self.final_exploration_epsilon) / self.exploration_period

        if not training:
            epsilon = 0.05

        # execute a random action with probability epsilon, or follow the QNet policy with probability 1-epsilon.
    
        if random.random() <= epsilon:
            action = random.randint(0, self.action_size-1)
        else:
            action = self.session.run(self.predicted_actions, {self.state:[state] } )[0]

        return action


    # Now we define a function called train for training our network

    def train(self):
        # Copy the primary network weights to the target network before training begins
        if self.num_training_steps == 0:
            print("Training starts...")
            self.session.run(self.hard_copy_to_target)


        # Sample a minibatch of experience from the replay memory
        minibatch = self.experience_replay.sample()
        if len(minibatch)==0:
            return


        # get the states, actions, rewards and next states from the minibatch
        batch_states = np.asarray( [d[0] for d in minibatch] )
        actions = [d[1] for d in minibatch]
        batch_actions = np.zeros( (self.minibatch_size, self.action_size) )
        for i in range(self.minibatch_size):
            batch_actions[i, actions[i]] = 1

        batch_rewards = np.asarray( [d[2] for d in minibatch] )
        batch_newstates = np.asarray( [d[3] for d in minibatch] )

        batch_newstates_mask = np.asarray( [not d[4] for d in minibatch] )


        # Perform the training operation
        scores, _, = self.session.run([self.q_values, self.qnet_optimize],
                                      { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions} )

        # periodically update the target network weights and write logs
        if self.num_training_steps % self.target_qnet_update_frequency == 0:

            self.session.run( self.hard_copy_to_target )


            # Write logs
            print('mean maxQ in minibatch: ', np.mean(np.max(scores, 1)))

            str_ = self.session.run(self.summarize, { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions})

            self.summary_writer.add_summary(str_, self.num_training_steps)


        self.num_training_steps += 1
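
To make the decaying epsilon-greedy schedule in action() concrete: with initial_exploration_epsilon = 1.0, final_exploration_epsilon = 0.05 and exploration_period = 1,000,000 (example values only), epsilon after 500,000 training steps is 1.0 - 500,000 * (1.0 - 0.05) / 1,000,000 = 0.525, and it stays at 0.05 once the exploration period has elapsed; during evaluation (training=False) a fixed epsilon of 0.05 is used.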

5. Car racing game

import gym
import time
import logging
import os
import sys
import numpy as np
import tensorflow as tf

Initialize all the necessary variables:

ENV_NAME = 'Seaquest-v0'
TOTAL_FRAMES = 20000000
MAX_TRAINING_STEPS = 20*60*60/3 
TESTING_GAMES = 30 
MAX_TESTING_STEPS = 5*60*60/3 
TRAIN_AFTER_FRAMES = 50000
epoch_size = 50000 
MAX_NOOP_START = 30
LOG_DIR = 'logs'

# set to True to render the game screen while training
test_mode = False

# video recording schedule for the monitor below; recording every
# (TESTING_GAMES + 1)-th episode is just one possible choice
def multiples_video_schedule(episode_id):
    return episode_id % (TESTING_GAMES + 1) == 0

Initialize the TensorFlow session:

logger = tf.summary.FileWriter(LOG_DIR)

# Initialize the TensorFlow session
session = tf.InteractiveSession()

outdir = 'results'

Build the agent:

# create the wrapped environment defined in step 1
env = EnvWrapper(ENV_NAME)

agent = DQN(state_size=env.observation_space.shape,
            action_size=env.action_space.n,
            session=session,
            summary_writer = logger,
            exploration_period = 1000000,
            minibatch_size = 32,
            discount_factor = 0.99,
            experience_replay_buffer = 1000000,
            target_qnet_update_frequency = 20000,
            initial_exploration_epsilon = 1.0,
            final_exploration_epsilon = 0.1,
            reward_clipping = 1.0)

Initialize the variables and start recording the game:

session.run(tf.global_variables_initializer())
logger.add_graph(session.graph)
saver = tf.train.Saver(tf.global_variables())

env.monitor.start(outdir+'/'+ENV_NAME,force = True, video_callable=multiples_video_schedule)

num_frames = 0
num_games = 0
current_game_frames = 0
init_no_ops = np.random.randint(MAX_NOOP_START+1)
last_time = time.time()
last_frame_count = 0.0
state = env.reset()

Now, begin training:

while num_frames <= TOTAL_FRAMES+1:
    if test_mode:
        env.render()

    num_frames += 1
    current_game_frames += 1

    # Select the action given the current state
    action = agent.action(state, training = True)

    # Perform the action in the environment, receive the reward and move to the next state
    next_state,reward,done,_ = env.step(action)

    # store this transition in the experience replay buffer
    if current_game_frames >= init_no_ops:
        agent.store(state,action,reward,next_state,done)
    state = next_state

    # Train the agent
    if num_frames>=TRAIN_AFTER_FRAMES:
        agent.train()

    if done or current_game_frames > MAX_TRAINING_STEPS:
        state = env.reset()
        current_game_frames = 0
        num_games += 1
        init_no_ops = np.random.randint(MAX_NOOP_START+1)


    # Save the network's parameters after every epoch
    if num_frames % epoch_size == 0  and  num_frames > TRAIN_AFTER_FRAMES:
        saver.save(session, outdir+"/"+ENV_NAME+"/model_"+str(num_frames//1000)+"k.ckpt")
        print("epoch:  frames=", num_frames, "   games=", num_games)


    # We test the performance every two epochs
    if num_frames % (2*epoch_size) == 0  and num_frames > TRAIN_AFTER_FRAMES:
        total_reward = 0
        avg_steps = 0
        for i in range(TESTING_GAMES):
            state = env.reset()
            init_no_ops = np.random.randint(MAX_NOOP_START+1)
            frm = 0
            while frm < MAX_TESTING_STEPS:
                frm += 1
                env.render()
                action = agent.action(state, training = False) 

                # perform no-ops at the start of each test game
                if frm < init_no_ops:
                    action = 0

                state,reward,done,_ = env.step(action)

                total_reward += reward
                if done:
                    break

            avg_steps += frm
        avg_reward = float(total_reward)/TESTING_GAMES

        str_ = session.run( tf.summary.scalar('test reward ('+str(epoch_size//1000)+'k)', avg_reward) )
        logger.add_summary(str_, num_frames)
        print('  --> Evaluation Average Reward: ', avg_reward, '   avg steps: ', (avg_steps/TESTING_GAMES))

        state = env.reset()

env.monitor.close()
logger.close()

The complete code for this project is available at https://github.com/PacktPublishing/Hands-On-Reinforcement-Learning-with-Python/tree/master/Chapter12
