DQN_tensorflow Source Code Walkthrough

   I have recently been studying the paper "Playing Atari with Deep Reinforcement Learning" (DeepMind's original DQN algorithm) for my research. There are many open-source implementations of it; here I use the GitHub repository [https://github.com/gliese581gg/DQN_tensorflow] as an example to walk through how deep reinforcement learning is actually trained. The code is built on TensorFlow and OpenCV. I have annotated it in detail and hope it is helpful.

The main file defines the deep_atari class, which provides the interface for training and evaluation; the params dictionary holds the network configuration (see the code below for the specific settings).

from database import *
from emulator import *
import tensorflow as tf
import numpy as np
import time
from ale_python_interface import ALEInterface
import cv2
from scipy import misc
import gc #garbage collector
import thread
import sys #used below for sys.argv and sys.stdout

gc.enable()

#hyperparameters for the network and the training loop
params = {
    'visualize' : True,
    'network_type':'nips',
    'ckpt_file':None,
    'steps_per_epoch': 50000,
    'num_epochs': 100,
    'eval_freq':50000,
    'steps_per_eval':10000,
    'copy_freq' : 10000,
    'disp_freq':10000,
    'save_interval':10000,
    'db_size': 1000000,
    'batch': 32,
    'num_act': 0,
    'input_dims' : [210, 160, 3],
    'input_dims_proc' : [84, 84, 4],
    'learning_interval': 1,
    'eps': 1.0,
    'eps_step':1000000,
    'eps_min' : 0.1,
    'eps_eval' : 0.05,
    'discount': 0.95,
    'lr': 0.0002,
    'rms_decay':0.99,
    'rms_eps':1e-6,
    'train_start':100,
    'img_scale':255.0,
    'clip_delta' : 0, #nature : 1
    'gpu_fraction' : 0.25,
    'batch_accumulator':'mean',
    'record_eval' : True,
    'only_eval' : 'n'
}
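
#Notes on a few of the keys above: 'eps' is annealed linearly from 1.0 down to 'eps_min' over
#roughly 'eps_step' steps; 'copy_freq' is how often (in steps) qnet is copied into targetnet;
#'train_start' is the number of transitions collected before learning begins; 'db_size' is the
#replay-memory capacity and 'batch' the minibatch size.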

class deep_atari:
    def __init__(self,params):
        print 'Initializing Module...'
        self.params = params

        self.gpu_config = tf.ConfigProto(gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.params['gpu_fraction']))

        self.sess = tf.Session(config=self.gpu_config)
        self.DB = database(self.params)#initialize the replay memory
        self.engine = emulator(rom_name='breakout.bin', vis=self.params['visualize'],windowname=self.params['network_type']+'_preview')
        self.params['num_act'] = len(self.engine.legal_actions)#number of legal actions for this game
        self.build_net()#build the qnet (training) and targetnet (target) networks
        self.training = True

    def build_net(self):
        print 'Building QNet and targetnet...'  

        '''qnet is the network being trained; targetnet is the network that provides the (periodically frozen) targets.
        The whole model can be thought of as a beginner learning to play from accumulated experience,
        where that experience lives in DB (the replay memory).'''
        self.qnet = DQN(self.params,'qnet')#the network being trained
        self.targetnet = DQN(self.params,'targetnet')#the target network
        self.sess.run(tf.initialize_all_variables())
        saver_dict = {'qw1':self.qnet.w1,'qb1':self.qnet.b1,
                'qw2':self.qnet.w2,'qb2':self.qnet.b2,
                'qw3':self.qnet.w3,'qb3':self.qnet.b3,
                'qw4':self.qnet.w4,'qb4':self.qnet.b4,
                'qw5':self.qnet.w5,'qb5':self.qnet.b5,
                'tw1':self.targetnet.w1,'tb1':self.targetnet.b1,
                'tw2':self.targetnet.w2,'tb2':self.targetnet.b2,
                'tw3':self.targetnet.w3,'tb3':self.targetnet.b3,
                'tw4':self.targetnet.w4,'tb4':self.targetnet.b4,
                'tw5':self.targetnet.w5,'tb5':self.targetnet.b5,
                'step':self.qnet.global_step}#weights, biases and the step counter that the Saver will checkpoint
        self.saver = tf.train.Saver(saver_dict)
        #self.saver = tf.train.Saver()
        #ops that copy qnet's weights and biases into targetnet
        self.cp_ops = [
            self.targetnet.w1.assign(self.qnet.w1),self.targetnet.b1.assign(self.qnet.b1),
            self.targetnet.w2.assign(self.qnet.w2),self.targetnet.b2.assign(self.qnet.b2),
            self.targetnet.w3.assign(self.qnet.w3),self.targetnet.b3.assign(self.qnet.b3),
            self.targetnet.w4.assign(self.qnet.w4),self.targetnet.b4.assign(self.qnet.b4),
            self.targetnet.w5.assign(self.qnet.w5),self.targetnet.b5.assign(self.qnet.b5)]

        self.sess.run(self.cp_ops)

        if self.params['ckpt_file'] is not None:#restore the state of a previous training run
            print 'loading checkpoint : ' + self.params['ckpt_file']
            self.saver.restore(self.sess,self.params['ckpt_file'])
            temp_train_cnt = self.sess.run(self.qnet.global_step)
            temp_step = temp_train_cnt * self.params['learning_interval']
            print 'Continue from'
            print '        -> Steps : ' + str(temp_step)
            print '        -> Minibatch update : ' + str(temp_train_cnt)


    def start(self):#main training and evaluation loop
        self.reset_game()#start a new game
        self.step = 0#current step count
        self.reset_statistics('all')#reset all logged statistics
        self.train_cnt = self.sess.run(self.qnet.global_step)

        #if resuming from a previous run, append to the existing log files
        if self.train_cnt > 0 :
            self.step = self.train_cnt * self.params['learning_interval']
            try:
                self.log_train = open('log_training_'+self.params['network_type']+'.csv','a')
            except:
                self.log_train = open('log_training_'+self.params['network_type']+'.csv','w')
                self.log_train.write('step,epoch,train_cnt,avg_reward,avg_q,epsilon,time\n')    

            try:
                self.log_eval = open('log_eval_'+self.params['network_type']+'.csv','a')
            except:
                self.log_eval = open('log_eval_'+self.params['network_type']+'.csv','w')
                self.log_eval.write('step,epoch,train_cnt,avg_reward,avg_q,epsilon,time\n')
        else:
                self.log_train = open('log_training_'+self.params['network_type']+'.csv','w')
                self.log_train.write('step,epoch,train_cnt,avg_reward,avg_q,epsilon,time\n')    
                self.log_eval = open('log_eval_'+self.params['network_type']+'.csv','w')
                self.log_eval.write('step,epoch,train_cnt,avg_reward,avg_q,epsilon,time\n')

        self.s = time.time()
        #print the configuration
        print self.params
        print 'Start training!'
        print 'Collecting replay memory for ' + str(self.params['train_start']) + ' steps'

        #main loop; for the first params['train_start'] steps the agent just plays (essentially at random, since eps starts at 1.0) to fill the initial replay memory
        while self.step < (self.params['steps_per_epoch'] * self.params['num_epochs'] * self.params['learning_interval'] + self.params['train_start']): 
            if self.training : 
                if self.DB.get_size() >= self.params['train_start'] : self.step += 1 ; self.steps_train += 1
            else : self.step_eval += 1

            #store the previous state, the clipped reward, the chosen action index, and the terminal flag in DB (the replay memory)
            if self.state_gray_old is not None and self.training:
                self.DB.insert(self.state_gray_old[26:110,:],self.reward_scaled,self.action_idx,self.terminal)

            #every params['copy_freq'] steps, copy the trained qnet weights into targetnet
            if self.training and self.params['copy_freq'] > 0 and self.step % self.params['copy_freq'] == 0 and self.DB.get_size() > self.params['train_start']:
                print '&&& Copying Qnet to targetnet\n'
                self.sess.run(self.cp_ops)#run the assign ops that sync targetnet with qnet

            #update the weights every params['learning_interval'] steps; with learning_interval = 1 a minibatch update happens after every action
            if self.training and self.step % self.params['learning_interval'] == 0 and self.DB.get_size() > self.params['train_start'] :

                '''sample a random minibatch of transitions from DB (the replay memory) for the network to learn from:
                the state s, the index of the action a, the next state reached after taking a, the reward, and the terminal flag'''
                bat_s,bat_a,bat_t,bat_n,bat_r = self.DB.get_batches()
                bat_a = self.get_onehot(bat_a)#convert the action indices to one-hot rows: a (batch, num_act) matrix with a single 1 per row

                #feed the sampled next states through targetnet; the maximum of its outputs, q_t, is the estimate of the maximum future reward from the next state
                if self.params['copy_freq'] > 0 :
                    feed_dict={self.targetnet.x: bat_n}
                    q_t = self.sess.run(self.targetnet.y,feed_dict=feed_dict)
                else:
                    feed_dict={self.qnet.x: bat_n}
                    q_t = self.sess.run(self.qnet.y,feed_dict=feed_dict)

                q_t = np.amax(q_t,axis=1)

                #feed the sampled transitions (the accumulated experience) and the bootstrapped targets into qnet
                feed_dict={self.qnet.x: bat_s, self.qnet.q_t: q_t, self.qnet.actions: bat_a, self.qnet.terminals:bat_t, self.qnet.rewards: bat_r}

                #run one RMSProp update on qnet and fetch the updated step counter and the loss
                _,self.train_cnt,self.cost = self.sess.run([self.qnet.rmsprop,self.qnet.global_step,self.qnet.cost],feed_dict=feed_dict)

                #accumulate the loss for the periodic log output
                self.total_cost_train += np.sqrt(self.cost)
                self.train_cnt_for_disp += 1

            if self.training :              
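                #anneal eps linearly from 1.0 towards eps_min as minibatch updates accumulate (the evaluation branch below uses a fixed eps of 0.05)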
                self.params['eps'] = max(self.params['eps_min'],1.0 - float(self.train_cnt * self.params['learning_interval'])/float(self.params['eps_step']))
            else:
                self.params['eps'] = 0.05

            #every params['save_interval'] steps, save the weights (the equivalent of a Caffe snapshot); the blocks below are bookkeeping: logging, switching to evaluation, epoch resets, and so on
            if self.DB.get_size() > self.params['train_start'] and self.step % self.params['save_interval'] == 0 and self.training:
                save_idx = self.train_cnt
                self.saver.save(self.sess,'ckpt/model_'+self.params['network_type']+'_'+str(save_idx))
                sys.stdout.write('$$$ Model saved : %s\n\n' % ('ckpt/model_'+self.params['network_type']+'_'+str(save_idx)))
                sys.stdout.flush()
            #periodic training log output
            if self.training and self.step > 0 and self.step % self.params['disp_freq']  == 0 and self.DB.get_size() > self.params['train_start'] : 
                self.write_log_train()

            #switch to evaluation: weight updates stop and the agent plays with a small fixed epsilon (actions are still chosen by qnet)
            if self.training and self.step > 0 and self.step % self.params['eval_freq'] == 0 and self.DB.get_size() > self.params['train_start'] : 

                self.reset_game()
                if self.step % self.params['steps_per_epoch'] == 0 : self.reset_statistics('all')
                else: self.reset_statistics('eval')
                self.training = False
                #TODO : add video recording             
                continue

            #during training, restart the game and reset the statistics every params['steps_per_epoch'] steps
            #this epoch boundary exists because the Bellman update uses a discount factor gamma: rewards far in the future contribute little to the current estimate of the maximum future reward, so very long runs add little; see the paper for details
            if self.training and self.step > 0 and self.step % self.params['steps_per_epoch'] == 0 and self.DB.get_size() > self.params['train_start']: 
                self.reset_game()
                self.reset_statistics('all')
                #self.training = False
                continue

            if not self.training and self.step_eval >= self.params['steps_per_eval'] :
                self.write_log_eval()
                self.reset_game()
                self.reset_statistics('eval')
                self.training = True
                continue

            #check whether the episode has ended
            if self.terminal :  
                self.reset_game()
                if self.training : 
                    self.num_epi_train += 1 
                    self.total_reward_train += self.epi_reward_train
                    self.epi_reward_train = 0
                else : 
                    self.num_epi_eval += 1 
                    self.total_reward_eval += self.epi_reward_eval
                    self.epi_reward_eval = 0
                continue
            '''choose the next action; select_action() implements an epsilon-greedy policy whose epsilon is annealed over time (loosely in the spirit of simulated annealing)'''
            self.action_idx,self.action, self.maxQ = self.select_action(self.state_proc)

            #execute the action chosen by select_action in the emulator; it returns the resulting frame, the reward, and whether the game is over (a new transition of the Markov chain)
            self.state, self.reward, self.terminal = self.engine.next(self.action)
            self.reward_scaled = self.reward // max(1,abs(self.reward))#clip the reward to -1, 0 or 1
            if self.training : self.epi_reward_train += self.reward ; self.total_Q_train += self.maxQ#accumulate reward and max Q for the statistics
            else : self.epi_reward_eval += self.reward ; self.total_Q_eval += self.maxQ 

            #preprocess the newly produced frame so it can be stored in DB (the replay memory): shift the 4-frame window, resize, convert to grayscale, crop and rescale
            self.state_gray_old = np.copy(self.state_gray)
            self.state_proc[:,:,0:3] = self.state_proc[:,:,1:4]
            self.state_resized = cv2.resize(self.state,(84,110))
            self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY)
            self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale']

            #TODO : add video recording

    def reset_game(self):
        self.state_proc = np.zeros((84,84,4)); self.action = -1; self.terminal = False; self.reward = 0
        self.state = self.engine.newGame()      
        self.state_resized = cv2.resize(self.state,(84,110))
        self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY)
        self.state_gray_old = None
        self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale']

    def reset_statistics(self,mode):
        if mode == 'all':
            self.epi_reward_train = 0
            self.epi_Q_train = 0
            self.num_epi_train = 0
            self.total_reward_train = 0
            self.total_Q_train = 0
            self.total_cost_train = 0
            self.steps_train = 0
            self.train_cnt_for_disp = 0
        self.step_eval = 0
        self.epi_reward_eval = 0
        self.epi_Q_eval = 0     
        self.num_epi_eval = 0       
        self.total_reward_eval = 0
        self.total_Q_eval = 0


    def write_log_train(self):
        sys.stdout.write('### Training (Step : %d , Minibatch update : %d , Epoch %d)\n' % (self.step,self.train_cnt,self.step//self.params['steps_per_epoch'] ))

        sys.stdout.write('    Num.Episodes : %d , Avg.reward : %.3f , Avg.Q : %.3f, Avg.loss : %.3f\n' % (self.num_epi_train,float(self.total_reward_train)/max(1,self.num_epi_train),float(self.total_Q_train)/max(1,self.steps_train),self.total_cost_train/max(1,self.train_cnt_for_disp)))
        sys.stdout.write('    Epsilon : %.3f , Elapsed time : %.1f\n\n' % (self.params['eps'],time.time()-self.s))
        sys.stdout.flush()
        self.log_train.write(str(self.step) + ',' + str(self.step//self.params['steps_per_epoch']) + ',' + str(self.train_cnt) + ',')
        self.log_train.write(str(float(self.total_reward_train)/max(1,self.num_epi_train)) +','+ str(float(self.total_Q_train)/max(1,self.steps_train)) +',')
        self.log_train.write(str(self.params['eps']) +','+ str(time.time()-self.s) + '\n')
        self.log_train.flush()      

    def write_log_eval(self):
        sys.stdout.write('@@@ Evaluation (Step : %d , Minibatch update : %d , Epoch %d)\n' % (self.step,self.train_cnt,self.step//self.params['steps_per_epoch'] ))
        sys.stdout.write('    Num.Episodes : %d , Avg.reward : %.3f , Avg.Q : %.3f\n' % (self.num_epi_eval,float(self.total_reward_eval)/max(1,self.num_epi_eval),float(self.total_Q_eval)/max(1,self.params['steps_per_eval'])))
        sys.stdout.write('    Epsilon : %.3f , Elapsed time : %.1f\n\n' % (self.params['eps'],time.time()-self.s))
        sys.stdout.flush()
        self.log_eval.write(str(self.step) + ',' + str(self.step//self.params['steps_per_epoch']) + ',' + str(self.train_cnt) + ',')
        self.log_eval.write(str(float(self.total_reward_eval)/max(1,self.num_epi_eval)) +','+ str(float(self.total_Q_eval)/max(1,self.params['steps_per_eval'])) +',')
        self.log_eval.write(str(self.params['eps']) +','+ str(time.time()-self.s) + '\n')
        self.log_eval.flush()

    def select_action(self,st):
        if np.random.rand() > self.params['eps']:#draw a random number; if it exceeds eps, let the network choose the action greedily (note that qnet, not targetnet, is used here)
            #greedy with random tie-breaking
            Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0] 
            a_winner = np.argwhere(Q_pred == np.amax(Q_pred))
            if len(a_winner) > 1:
                act_idx = a_winner[np.random.randint(0, len(a_winner))][0]
                return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred)
            else:
                act_idx = a_winner[0][0]
                return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred)
        #otherwise take a random action
        else:
            #random
            act_idx = np.random.randint(0,len(self.engine.legal_actions))
            Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0]
            return act_idx,self.engine.legal_actions[act_idx], Q_pred[act_idx]

    def get_onehot(self,actions):
        actions_onehot = np.zeros((self.params['batch'], self.params['num_act']))

        for i in range(self.params['batch']):
            actions_onehot[i,actions[i]] = 1
        return actions_onehot


if __name__ == "__main__":
    dict_items = params.items()
    for i in range(1,len(sys.argv),2):
        if sys.argv[i] == '-weight' :params['ckpt_file'] = sys.argv[i+1]
        elif sys.argv[i] == '-network_type' :params['network_type'] = sys.argv[i+1]
        elif sys.argv[i] == '-visualize' :
            if sys.argv[i+1] == 'y' : params['visualize'] = True
            elif sys.argv[i+1] == 'n' : params['visualize'] = False
            else:
                print 'Invalid visualization argument!!! Available arguments are'
                print '        y or n'
                raise ValueError()
        elif sys.argv[i] == '-gpu_fraction' : params['gpu_fraction'] = float(sys.argv[i+1])
        elif sys.argv[i] == '-db_size' : params['db_size'] = int(sys.argv[i+1])
        elif sys.argv[i] == '-only_eval' : params['only_eval'] = sys.argv[i+1]
        else : 
            print 'Invalid arguments!!! Available arguments are'
            print '        -weight (filename)'
            print '        -network_type (nips or nature)'
            print '        -visualize (y or n)'
            print '        -gpu_fraction (0.1~0.9)'
            print '        -db_size (integer)'
            raise ValueError()
    if params['network_type'] == 'nips':
        from DQN_nips import *
    elif params['network_type'] == 'nature':
        from DQN_nature import *
        params['steps_per_epoch']= 200000
        params['eval_freq'] = 100000
        params['steps_per_eval'] = 10000
        params['copy_freq'] = 10000
        params['disp_freq'] = 20000
        params['save_interval'] = 20000
        params['learning_interval'] = 1
        params['discount'] = 0.99
        params['lr'] = 0.00025
        params['rms_decay'] = 0.95
        params['rms_eps']=0.01
        params['clip_delta'] = 1.0
        params['train_start']=50000
        params['batch_accumulator'] = 'sum'
        params['eps_step'] = 1000000
        params['num_epochs'] = 250
        params['batch'] = 32
    else :
        print 'Invalid network type! Available network types are'
        print '        nips or nature'
        raise ValueError()

    if params['only_eval'] == 'y' : only_eval = True
    elif params['only_eval'] == 'n' : only_eval = False
    else :
        print 'Invalid only_eval option! Available options are'
        print '        y or n'
        raise ValueError()

    if only_eval:
        params['eval_freq'] = 1
        params['train_start'] = 100

    da = deep_atari(params)
    da.start()
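
The frame preprocessing that start() and reset_game() perform inline can be summarized as a small standalone function. The following is my own sketch for illustration (preprocess_frame and push_frame are not names from the repository): resize the raw 210x160 RGB frame to 84x110, convert it to grayscale, crop rows 26:110 to get an 84x84 image, scale it to [0,1], and keep a sliding window of the last four such frames as the network input.

import cv2
import numpy as np

def preprocess_frame(rgb_frame, img_scale=255.0):
    #84x110 resize, grayscale, crop rows 26:110 -> 84x84, scale to [0,1]
    resized = cv2.resize(rgb_frame, (84, 110))        #cv2.resize takes (width, height)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)  #shape (110, 84)
    return gray[26:110, :] / img_scale                #shape (84, 84)

def push_frame(state_proc, rgb_frame):
    #shift the 4-frame window and append the newest preprocessed frame, as in start()
    state_proc[:, :, 0:3] = state_proc[:, :, 1:4]
    state_proc[:, :, 3] = preprocess_frame(rgb_frame)
    return state_proc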

The database class implements the replay memory described in the paper.

import numpy as np
import gc
import time
import cv2

class database:
    def __init__(self, params):
        self.size = params['db_size']
        self.img_scale = params['img_scale']
        self.states = np.zeros([self.size,84,84],dtype='uint8') #image dimensions
        self.actions = np.zeros(self.size,dtype='float32')
        self.terminals = np.zeros(self.size,dtype='float32')
        self.rewards = np.zeros(self.size,dtype='float32')
        self.bat_size = params['batch']
        self.bat_s = np.zeros([self.bat_size,84,84,4])
        self.bat_a = np.zeros([self.bat_size])
        self.bat_t = np.zeros([self.bat_size])
        self.bat_n = np.zeros([self.bat_size,84,84,4])
        self.bat_r = np.zeros([self.bat_size])

        self.counter = 0 #keep track of next empty state
        self.flag = False
        return

    def get_batches(self):#sample a random minibatch from the replay memory
        for i in range(self.bat_size):#draw batch (32) transitions from the replay memory
            idx = 0
            while idx < 3 or (idx > self.counter-2 and idx < self.counter+3):
                idx = np.random.randint(3,self.get_size()-1)#get_size() is the number of frames currently stored; resample until the index has at least 3 frames of history and does not straddle the write pointer
            #gather the values belonging to this index and return them
            self.bat_s[i] = np.transpose(self.states[idx-3:idx+1,:,:],(1,2,0))/self.img_scale
            self.bat_n[i] = np.transpose(self.states[idx-2:idx+2,:,:],(1,2,0))/self.img_scale
            self.bat_a[i] = self.actions[idx]
            self.bat_t[i] = self.terminals[idx]
            self.bat_r[i] = self.rewards[idx]
        #self.bat_s[0] = np.transpose(self.states[10:14,:,:],(1,2,0))/self.img_scale
        #self.bat_n[0] = np.transpose(self.states[11:15,:,:],(1,2,0))/self.img_scale
        #self.bat_a[0] = self.actions[13]
        #self.bat_t[0] = self.terminals[13]
        #self.bat_r[0] = self.rewards[13]

        return self.bat_s,self.bat_a,self.bat_t,self.bat_n,self.bat_r

    def insert(self, prevstate_proc,reward,action,terminal):#store one transition at the current write position
        self.states[self.counter] = prevstate_proc
        self.rewards[self.counter] = reward
        self.actions[self.counter] = action
        self.terminals[self.counter] = terminal
        #update counter
        self.counter += 1
        if self.counter >= self.size:
            self.flag = True
            self.counter = 0
        return

    def get_size(self):#return the number of transitions currently stored in the replay memory
        if self.flag == False:
            return self.counter
        else:
            return self.size
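
For reference, here is a minimal usage sketch of this class (my own illustration, not part of the repository); it fills the buffer with random frames and then samples one minibatch. get_batches() reconstructs each 4-frame state from four consecutively stored frames, which is why the sampled index must have at least 3 frames of history and must not straddle the write pointer.

import numpy as np

db_params = {'db_size': 1000, 'img_scale': 255.0, 'batch': 32}
db = database(db_params)

for t in range(200):
    frame = np.random.randint(0, 256, (84, 84)).astype('uint8')  #a fake preprocessed 84x84 frame
    db.insert(frame, 0, np.random.randint(4), False)             #(prevstate, reward, action, terminal)

bat_s, bat_a, bat_t, bat_n, bat_r = db.get_batches()             #bat_s and bat_n have shape (32, 84, 84, 4)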

The DQN network is the core of the code: it defines the network architecture, the Bellman target, and the loss function (the listing below is the 'nature' variant).

import numpy as np
import tensorflow as tf
import cv2

class DQN:
    def __init__(self,params,name):
        # placeholders for the elements of a transition; the input is [batch,84,84,4] (params['batch'] = 32)
        self.network_type = 'nature'
        self.params = params
        self.network_name = name
        self.x = tf.placeholder('float32',[None,84,84,4],name=self.network_name + '_x')
        self.q_t = tf.placeholder('float32',[None],name=self.network_name + '_q_t')
        self.actions = tf.placeholder("float32", [None, params['num_act']],name=self.network_name + '_actions')
        self.rewards = tf.placeholder("float32", [None],name=self.network_name + '_rewards')
        self.terminals = tf.placeholder("float32", [None],name=self.network_name + '_terminals')

        #conv1: [32,84,84,4] --> [32,20,20,32] (with VALID padding the output size is (input - kernel)/stride + 1, here (84-8)/4+1 = 20)
        layer_name = 'conv1' ; size = 8 ; channels = 4 ; filters = 32 ; stride = 4
        self.w1 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
        self.b1 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
        self.c1 = tf.nn.conv2d(self.x, self.w1, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
        self.o1 = tf.nn.relu(tf.add(self.c1,self.b1),name=self.network_name + '_'+layer_name+'_activations')
        #self.n1 = tf.nn.lrn(self.o1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)

        #conv2: [32,20,20,32] --> [32,9,9,64] ((20-4)/2+1 = 9)
        layer_name = 'conv2' ; size = 4 ; channels = 32 ; filters = 64 ; stride = 2
        self.w2 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
        self.b2 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
        self.c2 = tf.nn.conv2d(self.o1, self.w2, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
        self.o2 = tf.nn.relu(tf.add(self.c2,self.b2),name=self.network_name + '_'+layer_name+'_activations')
        #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)

        #conv3: [32,9,9,64] --> [32,7,7,64] ((9-3)/1+1 = 7)
        layer_name = 'conv3' ; size = 3 ; channels = 64 ; filters = 64 ; stride = 1
        self.w3 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
        self.b3 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
        self.c3 = tf.nn.conv2d(self.o2, self.w3, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
        self.o3 = tf.nn.relu(tf.add(self.c3,self.b3),name=self.network_name + '_'+layer_name+'_activations')
        #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)

        #flatten the conv output into a vector
        o3_shape = self.o3.get_shape().as_list()        

        #fc4: [32,7*7*64] --> [32,512]
        layer_name = 'fc4' ; hiddens = 512 ; dim = o3_shape[1]*o3_shape[2]*o3_shape[3]
        self.o3_flat = tf.reshape(self.o3, [-1,dim],name=self.network_name + '_'+layer_name+'_input_flat')
        self.w4 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
        self.b4 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases')
        self.ip4 = tf.add(tf.matmul(self.o3_flat,self.w4),self.b4,name=self.network_name + '_'+layer_name+'_ips')
        self.o4 = tf.nn.relu(self.ip4,name=self.network_name + '_'+layer_name+'_activations')

        #fc5: [32,512] --> [32,num_act]
        layer_name = 'fc5' ; hiddens = params['num_act'] ; dim = 512
        self.w5 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
        self.b5 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases')
        ''' A word on one of the core ideas of deep Q-learning: when playing, we want to take the action that maximizes the
        expected future reward. Since we cannot literally sum rewards backwards from the future, the Bellman equation is used:
        assume the maximum future reward of the next state is already known and compute it with the target network; that value
        then serves as the label (the expected value) for training qnet. The idea is similar to recursion in algorithms or
        induction in mathematics. The deep network itself takes the current state (a stack of 4 consecutive frames) as input
        and outputs, for each action, an estimate of the maximum future reward; the largest of these outputs is the network's
        prediction for that state. The interesting part is that both the prediction and its target come out of a neural network
        (two of them). Training then proceeds exactly as for an ordinary convolutional network: pick a loss function,
        backpropagate the error, and update the weights.'''
        self.y = tf.add(tf.matmul(self.o4,self.w5),self.b5,name=self.network_name + '_'+layer_name+'_outputs')

        #Q,Cost,Optimizer
        self.discount = tf.constant(self.params['discount'])#the discount factor gamma in the Bellman equation

        #Bellman target: the discounted maximum future reward (the bootstrap term is zeroed out for terminal transitions)
        self.yj = tf.add(self.rewards, tf.mul(1.0-self.terminals, tf.mul(self.discount, self.q_t)))
        self.Qxa = tf.mul(self.y,self.actions)
        self.Q_pred = tf.reduce_max(self.Qxa, reduction_indices=1)
        #self.yjr = tf.reshape(self.yj,(-1,1))
        #self.yjtile = tf.concat(1,[self.yjr,self.yjr,self.yjr,self.yjr])
        #self.yjax = tf.mul(self.yjtile,self.actions)

        #half = tf.constant(0.5)
        self.diff = tf.sub(self.yj, self.Q_pred)
        if self.params['clip_delta'] > 0 :
            self.quadratic_part = tf.minimum(tf.abs(self.diff), tf.constant(self.params['clip_delta']))#Huber-style clipping: quadratic up to clip_delta, linear beyond it
            self.linear_part = tf.sub(tf.abs(self.diff),self.quadratic_part)
            self.diff_square = 0.5 * tf.pow(self.quadratic_part,2) + self.params['clip_delta']*self.linear_part


        else:
            self.diff_square = tf.mul(tf.constant(0.5),tf.pow(self.diff, 2))

        if self.params['batch_accumulator'] == 'sum':
            self.cost = tf.reduce_sum(self.diff_square)
        else:
            self.cost = tf.reduce_mean(self.diff_square)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.rmsprop = tf.train.RMSPropOptimizer(self.params['lr'],self.params['rms_decay'],0.0,self.params['rms_eps']).minimize(self.cost,global_step=self.global_step)
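
To make the loss graph above concrete, here is a small NumPy restatement of the target and loss it computes (my own sketch for illustration; the function names are not from the repository):

import numpy as np

def bellman_target(rewards, terminals, q_t, discount=0.95):
    #yj = r + (1 - terminal) * gamma * max_a' Q_target(s', a'), matching self.yj above
    return rewards + (1.0 - terminals) * discount * q_t

def dqn_loss(yj, q_pred, clip_delta=0.0, accumulator='mean'):
    #squared error by default; Huber-style clipping when clip_delta > 0, matching self.diff_square and self.cost above
    diff = np.abs(yj - q_pred)
    if clip_delta > 0:
        quad = np.minimum(diff, clip_delta)
        lin = diff - quad
        per_sample = 0.5 * quad ** 2 + clip_delta * lin
    else:
        per_sample = 0.5 * diff ** 2
    return per_sample.sum() if accumulator == 'sum' else per_sample.mean()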

The emulator class wraps the Atari interface (ALE): it exposes the current frame (image) and the reward, and provides functions to reset and start new games.

import numpy as np
import copy
import sys
from ale_python_interface import ALEInterface
import cv2
import time
#import scipy.misc

class emulator:
    def __init__(self, rom_name, vis,windowname='preview'):
        self.ale = ALEInterface()
        self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode");
        self.ale.setInt("random_seed",123)
        self.ale.setInt("frame_skip",4)
        self.ale.loadROM('roms/' + rom_name )
        self.legal_actions = self.ale.getMinimalActionSet()
        self.action_map = dict()
        self.windowname = windowname
        for i in range(len(self.legal_actions)):
            self.action_map[self.legal_actions[i]] = i

        # print(self.legal_actions)
        self.screen_width,self.screen_height = self.ale.getScreenDims()
        print("width/height: " +str(self.screen_width) + "/" + str(self.screen_height))
        self.vis = vis
        if vis: 
            cv2.startWindowThread()
            cv2.namedWindow(self.windowname)

    def get_image(self):#grab the current RGB screen as a numpy array
        numpy_surface = np.zeros(self.screen_height*self.screen_width*3, dtype=np.uint8)
        self.ale.getScreenRGB(numpy_surface)
        image = np.reshape(numpy_surface, (self.screen_height, self.screen_width, 3))
        return image

    def newGame(self):
        self.ale.reset_game()#start a new game
        return self.get_image()#return the initial frame

    def next(self, action_indx):
        reward = self.ale.act(action_indx)  
        nextstate = self.get_image()
        # scipy.misc.imsave('test.png',nextstate)
        if self.vis:
            cv2.imshow(self.windowname,nextstate)
        return nextstate, reward, self.ale.game_over()



if __name__ == "__main__":
    engine = emulator('breakout.bin',True)
    engine.next(0)
    time.sleep(5)
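
A slightly fuller usage sketch (my own; it assumes roms/breakout.bin is present, as the class above requires): play a short stretch of the game with random actions and accumulate the reward.

import numpy as np

eng = emulator(rom_name='breakout.bin', vis=False)
frame = eng.newGame()                                   #210x160x3 RGB array
total_reward = 0
for _ in range(100):
    a = eng.legal_actions[np.random.randint(len(eng.legal_actions))]
    frame, reward, terminal = eng.next(a)               #step the emulator by one (frame-skipped) action
    total_reward += reward
    if terminal:
        frame = eng.newGame()
print 'total reward:', total_reward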

Paper: https://arxiv.org/pdf/1312.5602.pdf
Code: https://github.com/gliese581gg/DQN_tensorflow

I will post my notes on the paper later.

If anything here is inaccurate or incomplete, corrections and feedback are welcome.
