dqn_bird.py
# Experience replay buffer class
# Stores experience data and samples training data from it
from __future__ import print_function
import tensorflow as tf
import numpy as np
import cv2 # image processing library
import sys # system library
sys.path.append("game/")
import game.wrapped_flappy_bird as game # game module
import random
# Hyperparameters of the algorithm
GAME = 'flappy bird' # game name
ACTIONS = 2 # two actions: "flap" and "do nothing"
GAMMA = 0.99 # discount factor
OBSERVE = 10000 # observation steps before training; epsilon stays fixed so the buffer collects a variety of situations
EXPLORE = 3.0e6 # exploration period: epsilon decays linearly from its initial to its final value over 3 million steps
FINAL_EPSILON = 1.0e-4 # final exploration rate
INITIAL_EPSILON = 0.1 # initial exploration rate
REPLAY_MEMORY = 50000 # replay buffer capacity: up to 50000 transitions available for sampling
BATCH = 32 # mini-batch size: 32 transitions are sampled from the buffer per training step
FRAME_PER_ACTION = 1 # frame skipping
# Define the experience replay class
class Experience_Buffer():
def __init__(self,buffer_size = REPLAY_MEMORY):
# an empty buffer
self.buffer = []
# maximum buffer capacity
self.buffer_size = buffer_size
# add one experience entry to the buffer
def add_experience(self,experience):
# check whether the buffer is already full;
# if it is, drop the oldest entries to make room for the newest experience
if len(self.buffer)+len(experience) >= self.buffer_size:
self.buffer[0:len(self.buffer)+len(experience)-self.buffer_size] = []
self.buffer.extend(experience)
# sample training data
def sample(self,samples_num):
# randomly sample a mini-batch
# and unpack it into the format needed for training
sample_data = random.sample(self.buffer,samples_num)
train_s = [d[0] for d in sample_data]
train_a = [d[1] for d in sample_data]
train_r = [d[2] for d in sample_data]
train_s_ = [d[3] for d in sample_data]
train_terminal = [d[4] for d in sample_data]
return train_s,train_a,train_r,train_s_,train_terminal
# Deep Q-learning class
# Implements the DQN algorithm used to train the bird
class Deep_Q_N():
# Constructor
# Builds the TensorFlow graph: defines the input layer, calls the member function that builds the
# action-value (eval) network and the target network, defines the target-network update, the loss function and the optimizer,
# then initializes the graph variables and creates the Saver
def __init__(self,lr=1.0e-6,model_file=None):
self.gamma = GAMMA
self.tau = 0.01
# TensorFlow session
self.sess = tf.Session()
self.learning_rate = lr
# 1. Input layer
self.obs = tf.placeholder(tf.float32,shape = [None,80,80,4])
self.obs_ = tf.placeholder(tf.float32,shape= [None,80,80,4])
self.action = tf.placeholder(tf.float32,shape=[None,ACTIONS])
self.action_ = tf.placeholder(tf.float32,shape=[None,ACTIONS])
# 2.1 Build the deep Q (eval) network
self.Q = self.build_q_net(self.obs,scope='eval',trainable=True)
# 2.2 Build the target Q network
self.Q_ = self.build_q_net(self.obs_,scope='target',trainable=False)
# 2.3 Collect the two sets of network parameters
self.qe_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='eval')# eval-network variables
self.qt_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='target')
# 2.4 Soft update of the target-network parameters from the eval network
self.update_oldq_op = [oldq.assign((1-self.tau)*oldq+self.tau*p) for p,oldq in zip(self.qe_params,self.qt_params)]
# 3. Loss function
# TD target
self.Q_target = tf.placeholder(tf.float32,[None])
readout_q = tf.reduce_sum(tf.multiply(self.Q,self.action),reduction_indices=1)
self.q_loss = tf.losses.mean_squared_error(labels=self.Q_target,predictions = readout_q)
# 4. Optimizer
self.q_train_op = tf.train.AdamOptimizer(lr).minimize(self.q_loss,var_list=self.qe_params)
# 5. Initialize the variables in the graph
self.sess.run(tf.global_variables_initializer())
# model saving and restoring
self.saver = tf.train.Saver()
if model_file is not None:
self.restore_model(model_file)
# save_model: saves the model parameters
def save_model(self,model_path,global_step):
self.saver.save(self.sess,model_path,global_step=global_step)
# restore_model: restores the model parameters
def restore_model(self,model_path):
self.saver.restore(self.sess,model_path)
# build_q_net: builds a deep Q network
# Inputs: the observation tensor, the variable scope name scope, and the trainable flag
# Called from the constructor. Because the eval action-value network and the target network keep two separate
# sets of parameters, the two sets are distinguished by different variable scopes
def build_q_net(self,obs,scope,trainable):
# The network consists of 3 convolutional layers, 1 pooling layer and 2 fully connected layers
# Conv layer 1: 8*8*4*32 kernel, stride 4, followed by a 2*2 max-pooling layer with stride 2
# Conv layer 2: 4*4*32*64 kernel, stride 2, followed by another convolutional layer
# Conv layer 3: 3*3*64*64 kernel, stride 1; its output is flattened into a 1600-dimensional vector, followed by two fully connected layers
# FC layer 1: 1600*512, ReLU activation
# FC layer 2: 512*2, no activation, i.e. linear output
with tf.variable_scope(scope):
# conv layer 1: 8*8*4*32 kernel, stride 4
h_conv1 = tf.layers.conv2d(inputs=obs, filters=32, kernel_size=[8, 8], strides=4, padding="same",
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), \
bias_initializer=tf.constant_initializer(0.01), trainable=trainable)
# followed by a 2*2 max-pooling layer with stride 2
h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2],strides=2, padding="SAME")
# conv layer 2: 4*4*32*64 kernel, stride 2
h_conv2 = tf.layers.conv2d(inputs=h_pool1, filters=64, kernel_size=[4, 4], strides=2, padding="same",
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), \
bias_initializer=tf.constant_initializer(0.01), trainable=trainable)
# conv layer 3: 3*3*64*64 kernel, stride 1
h_conv3 = tf.layers.conv2d(inputs=h_conv2, filters=64, kernel_size=[3, 3], strides=1, padding="same",
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), \
bias_initializer=tf.constant_initializer(0.01), trainable=trainable)
# flatten the output of conv layer 3 into a 1600-dimensional vector
h_conv3_flat = tf.reshape(h_conv3,[-1,1600])
# FC layer 1: 1600*512, ReLU activation
h_fc1 = tf.layers.dense(inputs=h_conv3_flat,
units=512,
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0,stddev=0.1),
bias_initializer=tf.constant_initializer(0.1),
trainable=trainable)
# FC layer 2 (readout layer): 512*2, no activation, i.e. linear output
qout = tf.layers.dense(inputs=h_fc1,
units=ACTIONS,
kernel_initializer=tf.random_normal_initializer(0,stddev=0.1),
bias_initializer=tf.constant_initializer(0.1),
trainable=trainable)
return qout
# epsilon_greedy: exploration-exploitation policy used to sample actions
# Input: the current state and the exploration rate; output: the action used to interact with the environment
# Unlike tabular Q-learning, the greedy action here is determined by the neural network
def epsilon_greedy(self,s_t,epsilon):
a_t = np.zeros([ACTIONS])
amax = np.argmax(self.sess.run(self.Q,{self.obs:[s_t]})[0])
# exploit with probability 1 - epsilon
if np.random.uniform()<1-epsilon:
# greedy action
a_t[amax] = 1
else:
a_t[random.randrange(ACTIONS)] = 1
return a_t
# train_Network: training loop
# Trains the agent within the Q-learning framework, using the network as the action-value function:
# it interacts with the environment, stores transitions in the replay buffer, and samples from the buffer to train the network
def train_Network(self,experience_buffer):
# open the game state to communicate with the emulator
game_state = game.GameState()
# get the first state and preprocess the image
do_nothing = np.zeros([ACTIONS])
do_nothing[0] = 1
# interact with the game once
x_t,r_0,terminal = game_state.frame_step(do_nothing)
x_t = cv2.cvtColor(cv2.resize(x_t,(80,80)),cv2.COLOR_BGR2GRAY)
ret,x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
s_t = np.stack((x_t,x_t,x_t,x_t),axis=2)
# start training
epsilon = INITIAL_EPSILON
t = 0
while "flappy bird"!="angry bird":
a_t = self.epsilon_greedy(s_t,epsilon=epsilon)
# anneal epsilon
if epsilon > FINAL_EPSILON and t > OBSERVE:
epsilon -= (INITIAL_EPSILON-FINAL_EPSILON)/EXPLORE
# execute the action and interact with the game environment once
x_t1_colored,r_t,terminal = game_state.frame_step(a_t)
x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored,(80,80)),cv2.COLOR_BGR2GRAY)
ret,x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
x_t1 = np.reshape(x_t1,(80,80,1))
s_t1 =np.append(x_t1,s_t[:,:,:3],axis=2)
# store the transition in the replay buffer
experience = np.reshape(np.array([s_t,a_t,r_t,s_t1,terminal]),[1,5])
print("experience", r_t, terminal)
experience_buffer.add_experience(experience)
# train only after the observation phase
if t>OBSERVE:
# sample a mini-batch
train_s,train_a,train_r,train_s_,train_terminal = experience_buffer.sample(BATCH)
target_q = []
read_target_Q = self.sess.run(self.Q_,{self.obs_:train_s_})
for i in range(len(train_r)):
if train_terminal[i]:
target_q.append(train_r[i])
else:
target_q.append(train_r[i]+GAMMA*np.max(read_target_Q[i]))
# one training step
self.sess.run(self.q_train_op,feed_dict={self.obs:train_s,self.action:train_a,self.Q_target:target_q})
# soft-update the target network
self.sess.run(self.update_oldq_op)
# advance one step
s_t = s_t1
t+=1
# save the model every 10000 iterations
if t % 10000 == 0:
self.save_model('saved_networks/',global_step=t)
if t <= OBSERVE:
print("OBSERVE",t)
else:
if t % 1 == 0:
print("train,steps",t,"/epsilon",epsilon,"/action_index",a_t,"/reward",r_t)
# Main entry point
# Trains the DQN:
# instantiate a replay buffer, create a deep Q network object, and call its training function
if __name__=="__main__":
buffer = Experience_Buffer()
brain = Deep_Q_N()
brain.train_Network(buffer)
wrapped_flappy_bird.py
import numpy as np
import sys
import random
import pygame
import flappy_bird_utils
import pygame.surfarray as surfarray
from pygame.locals import *
from itertools import cycle
FPS = 30
SCREENWIDTH = 288
SCREENHEIGHT = 512
pygame.init()
FPSCLOCK = pygame.time.Clock()
SCREEN = pygame.display.set_mode((SCREENWIDTH, SCREENHEIGHT))
pygame.display.set_caption('Flappy Bird')
IMAGES, SOUNDS, HITMASKS = flappy_bird_utils.load()
PIPEGAPSIZE = 100 # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79
PLAYER_WIDTH = IMAGES['player'][0].get_width()
PLAYER_HEIGHT = IMAGES['player'][0].get_height()
PIPE_WIDTH = IMAGES['pipe'][0].get_width()
PIPE_HEIGHT = IMAGES['pipe'][0].get_height()
BACKGROUND_WIDTH = IMAGES['background'].get_width()
PLAYER_INDEX_GEN = cycle([0, 1, 2, 1])
class GameState:
def __init__(self):
self.score = self.playerIndex = self.loopIter = 0
self.playerx = int(SCREENWIDTH * 0.2)
self.playery = int((SCREENHEIGHT - PLAYER_HEIGHT) / 2)
self.basex = 0
self.baseShift = IMAGES['base'].get_width() - BACKGROUND_WIDTH
newPipe1 = getRandomPipe()
newPipe2 = getRandomPipe()
self.upperPipes = [
{'x': SCREENWIDTH, 'y': newPipe1[0]['y']},
{'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[0]['y']},
]
self.lowerPipes = [
{'x': SCREENWIDTH, 'y': newPipe1[1]['y']},
{'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[1]['y']},
]
# player velocity, max velocity, downward acceleration, acceleration on flap
self.pipeVelX = -4
self.playerVelY = 0 # player's velocity along Y, default same as playerFlapped
self.playerMaxVelY = 10 # max vel along Y, max descend speed
self.playerMinVelY = -8 # min vel along Y, max ascend speed
self.playerAccY = 1 # player's downward acceleration
self.playerFlapAcc = -9 # player's speed on flapping
self.playerFlapped = False # True when player flaps
def frame_step(self, input_actions):
pygame.event.pump()
reward = 0.1
terminal = False
if sum(input_actions) != 1:
raise ValueError('Multiple input actions!')
# input_actions[0] == 1: do nothing
# input_actions[1] == 1: flap the bird
if input_actions[1] == 1:
if self.playery > -2 * PLAYER_HEIGHT:
self.playerVelY = self.playerFlapAcc
self.playerFlapped = True
#SOUNDS['wing'].play()
# check for score
playerMidPos = self.playerx + PLAYER_WIDTH / 2
for pipe in self.upperPipes:
pipeMidPos = pipe['x'] + PIPE_WIDTH / 2
if pipeMidPos <= playerMidPos < pipeMidPos + 4:
self.score += 1
#SOUNDS['point'].play()
reward = 1
# playerIndex basex change
if (self.loopIter + 1) % 3 == 0:
self.playerIndex = next(PLAYER_INDEX_GEN)
self.loopIter = (self.loopIter + 1) % 30
self.basex = -((-self.basex + 100) % self.baseShift)
# player's movement
if self.playerVelY < self.playerMaxVelY and not self.playerFlapped:
self.playerVelY += self.playerAccY
if self.playerFlapped:
self.playerFlapped = False
self.playery += min(self.playerVelY, BASEY - self.playery - PLAYER_HEIGHT)
if self.playery < 0:
self.playery = 0
# move pipes to left
for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
uPipe['x'] += self.pipeVelX
lPipe['x'] += self.pipeVelX
# add new pipe when first pipe is about to touch left of screen
if 0 < self.upperPipes[0]['x'] < 5:
newPipe = getRandomPipe()
self.upperPipes.append(newPipe[0])
self.lowerPipes.append(newPipe[1])
# remove first pipe if it's out of the screen
if self.upperPipes[0]['x'] < -PIPE_WIDTH:
self.upperPipes.pop(0)
self.lowerPipes.pop(0)
# check if crash here
isCrash= checkCrash({'x': self.playerx, 'y': self.playery,
'index': self.playerIndex},
self.upperPipes, self.lowerPipes)
if isCrash:
#SOUNDS['hit'].play()
#SOUNDS['die'].play()
terminal = True
# re-initialize the game
self.__init__()
reward = -3
# draw sprites
SCREEN.blit(IMAGES['background'], (0,0))
for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
SCREEN.blit(IMAGES['pipe'][0], (uPipe['x'], uPipe['y']))
SCREEN.blit(IMAGES['pipe'][1], (lPipe['x'], lPipe['y']))
SCREEN.blit(IMAGES['base'], (self.basex, BASEY))
# print score so player overlaps the score
# showScore(self.score)
SCREEN.blit(IMAGES['player'][self.playerIndex],
(self.playerx, self.playery))
image_data = pygame.surfarray.array3d(pygame.display.get_surface())
pygame.display.update()
FPSCLOCK.tick(FPS)
#print self.upperPipes[0]['y'] + PIPE_HEIGHT - int(BASEY * 0.2)
return image_data, reward, terminal
def getRandomPipe():
"""returns a randomly generated pipe"""
# y of gap between upper and lower pipe
gapYs = [20, 30, 40, 50, 60, 70, 80, 90]
index = random.randint(0, len(gapYs)-1)
gapY = gapYs[index]
gapY += int(BASEY * 0.2)
pipeX = SCREENWIDTH + 10
return [
{'x': pipeX, 'y': gapY - PIPE_HEIGHT}, # upper pipe
{'x': pipeX, 'y': gapY + PIPEGAPSIZE}, # lower pipe
]
def showScore(score):
"""displays score in center of screen"""
scoreDigits = [int(x) for x in list(str(score))]
totalWidth = 0 # total width of all numbers to be printed
for digit in scoreDigits:
totalWidth += IMAGES['numbers'][digit].get_width()
Xoffset = (SCREENWIDTH - totalWidth) / 2
for digit in scoreDigits:
SCREEN.blit(IMAGES['numbers'][digit], (Xoffset, SCREENHEIGHT * 0.1))
Xoffset += IMAGES['numbers'][digit].get_width()
def checkCrash(player, upperPipes, lowerPipes):
"""returns True if player collides with base or pipes."""
pi = player['index']
player['w'] = IMAGES['player'][0].get_width()
player['h'] = IMAGES['player'][0].get_height()
# if player crashes into ground
if player['y'] + player['h'] >= BASEY - 1:
return True
else:
playerRect = pygame.Rect(player['x'], player['y'],
player['w'], player['h'])
for uPipe, lPipe in zip(upperPipes, lowerPipes):
# upper and lower pipe rects
uPipeRect = pygame.Rect(uPipe['x'], uPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
lPipeRect = pygame.Rect(lPipe['x'], lPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
# player and upper/lower pipe hitmasks
pHitMask = HITMASKS['player'][pi]
uHitmask = HITMASKS['pipe'][0]
lHitmask = HITMASKS['pipe'][1]
# if bird collided with upipe or lpipe
uCollide = pixelCollision(playerRect, uPipeRect, pHitMask, uHitmask)
lCollide = pixelCollision(playerRect, lPipeRect, pHitMask, lHitmask)
if uCollide or lCollide:
return True
return False
def pixelCollision(rect1, rect2, hitmask1, hitmask2):
"""Checks if two objects collide and not just their rects"""
rect = rect1.clip(rect2)
if rect.width == 0 or rect.height == 0:
return False
x1, y1 = rect.x - rect1.x, rect.y - rect1.y
x2, y2 = rect.x - rect2.x, rect.y - rect2.y
for x in range(rect.width):
for y in range(rect.height):
if hitmask1[x1+x][y1+y] and hitmask2[x2+x][y2+y]:
return True
return False
flappy_bird_utils.py
import pygame
import sys
def load():
# path of player with different states
PLAYER_PATH = (
'assets/sprites/redbird-upflap.png',
'assets/sprites/redbird-midflap.png',
'assets/sprites/redbird-downflap.png'
)
# path of background
BACKGROUND_PATH = 'assets/sprites/background-black.png'
# path of pipe
PIPE_PATH = 'assets/sprites/pipe-green.png'
IMAGES, SOUNDS, HITMASKS = {}, {}, {}
# numbers sprites for score display
IMAGES['numbers'] = (
pygame.image.load('assets/sprites/0.png').convert_alpha(),
pygame.image.load('assets/sprites/1.png').convert_alpha(),
pygame.image.load('assets/sprites/2.png').convert_alpha(),
pygame.image.load('assets/sprites/3.png').convert_alpha(),
pygame.image.load('assets/sprites/4.png').convert_alpha(),
pygame.image.load('assets/sprites/5.png').convert_alpha(),
pygame.image.load('assets/sprites/6.png').convert_alpha(),
pygame.image.load('assets/sprites/7.png').convert_alpha(),
pygame.image.load('assets/sprites/8.png').convert_alpha(),
pygame.image.load('assets/sprites/9.png').convert_alpha()
)
# base (ground) sprite
IMAGES['base'] = pygame.image.load('assets/sprites/base.png').convert_alpha()
# sounds
if 'win' in sys.platform:
soundExt = '.wav'
else:
soundExt = '.ogg'
SOUNDS['die'] = pygame.mixer.Sound('assets/audio/die' + soundExt)
SOUNDS['hit'] = pygame.mixer.Sound('assets/audio/hit' + soundExt)
SOUNDS['point'] = pygame.mixer.Sound('assets/audio/point' + soundExt)
SOUNDS['swoosh'] = pygame.mixer.Sound('assets/audio/swoosh' + soundExt)
SOUNDS['wing'] = pygame.mixer.Sound('assets/audio/wing' + soundExt)
# background sprite
IMAGES['background'] = pygame.image.load(BACKGROUND_PATH).convert()
# player sprites (red bird)
IMAGES['player'] = (
pygame.image.load(PLAYER_PATH[0]).convert_alpha(),
pygame.image.load(PLAYER_PATH[1]).convert_alpha(),
pygame.image.load(PLAYER_PATH[2]).convert_alpha(),
)
# pipe sprites (green pipe)
IMAGES['pipe'] = (
pygame.transform.rotate(
pygame.image.load(PIPE_PATH).convert_alpha(), 180),
pygame.image.load(PIPE_PATH).convert_alpha(),
)
# hitmask for pipes
HITMASKS['pipe'] = (
getHitmask(IMAGES['pipe'][0]),
getHitmask(IMAGES['pipe'][1]),
)
# hitmask for player
HITMASKS['player'] = (
getHitmask(IMAGES['player'][0]),
getHitmask(IMAGES['player'][1]),
getHitmask(IMAGES['player'][2]),
)
return IMAGES, SOUNDS, HITMASKS
def getHitmask(image):
"""returns a hitmask using an image's alpha."""
mask = []
for x in range(image.get_width()):
mask.append([])
for y in range(image.get_height()):
mask[x].append(bool(image.get_at((x,y))[3]))
return mask
assets package
Link: https://pan.baidu.com/s/1ctJF5_X6MCOBWX96WkaO7w
Extraction code: jg08
line 65
self.sess = tf.Session()
A Session is the TensorFlow object through which the graph is executed: calling session.run() evaluates the operations (or the part of the graph) you request and returns the results.
line 68
self.obs = tf.placeholder(tf.float32,shape = [None,80,80,4])
tf.placeholder(
dtype,
shape=None,
name=None
)
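A placeholder only receives a concrete value through feed_dict when the graph is run; shape=[None,80,80,4] leaves the batch dimension open. A minimal sketch (not part of the book's code):
import numpy as np
import tensorflow as tf
obs = tf.placeholder(tf.float32, shape=[None, 80, 80, 4])  # batch size left unspecified
mean_obs = tf.reduce_mean(obs)
with tf.Session() as sess:
    batch = np.zeros((32, 80, 80, 4), dtype=np.float32)    # dummy batch of 32 stacked frames
    print(sess.run(mean_obs, feed_dict={obs: batch}))       # 0.0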
line 73
self.Q = self.build_q_net(self.obs,scope='eval',trainable=True)
self.Q_ = self.build_q_net(self.obs_,scope='target',trainable=False)
line 77
self.qe_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='eval')# eval-network variables
which is equivalent to
self.qe_params = [var for var in tf.global_variables() if var.name.startswith('eval')]
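These two variable lists are what the soft target update self.update_oldq_op operates on. A stripped-down sketch of the same pattern, using made-up one-variable networks:
import tensorflow as tf
tau = 0.01
with tf.variable_scope('eval'):
    w_eval = tf.get_variable('w', initializer=1.0)
with tf.variable_scope('target'):
    w_target = tf.get_variable('w', initializer=0.0, trainable=False)
eval_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval')
target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target')
# move each target variable a small step toward its eval counterpart
soft_update = [t.assign((1 - tau) * t + tau * e) for e, t in zip(eval_vars, target_vars)]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(soft_update)
    print(sess.run(target_vars[0]))  # 0.01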
line 84
readout_q = tf.reduce_sum(tf.multiply(self.Q,self.action),reduction_indices=1)
tf.reduce_sum(
input_tensor,
axis=None,
keepdims=None,
name=None,
reduction_indices=None,
keep_dims=None)
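Because self.action is a one-hot vector, multiplying it element-wise with the Q output and summing over axis 1 extracts the Q value of the action that was actually taken. The same arithmetic in plain numpy (illustrative only):
import numpy as np
q = np.array([[1.5, -0.3],   # Q values for [do nothing, flap] in two sampled states
              [0.2,  0.9]])
a = np.array([[1., 0.],      # one-hot actions taken in those states
              [0., 1.]])
readout_q = np.sum(q * a, axis=1)
print(readout_q)             # [1.5 0.9]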
line 85
self.q_loss = tf.losses.mean_squared_error(labels=self.Q_target,predictions = readout_q)
tf.losses.mean_squared_error(
labels,
predictions,
weights=1.0,
scope=None,
loss_collection=tf.GraphKeys.LOSSES,
reduction=Reduction.SUM_BY_NONZERO_WEIGHTS
)
line 87
self.q_train_op = tf.train.AdamOptimizer(lr).minimize(self.q_loss,var_list=self.qe_params)
tf.train.AdamOptimizer.__init__(
learning_rate=lr,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
use_locking=False,
name='Adam'
)
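Passing var_list=self.qe_params restricts the optimizer to the eval-network variables, so Adam never touches the target network. A toy illustration with made-up variables:
import tensorflow as tf
a = tf.get_variable('a', initializer=0.0)
b = tf.get_variable('b', initializer=0.0)
loss = tf.square(a - 1.0) + tf.square(b - 1.0)
train_op = tf.train.AdamOptimizer(0.1).minimize(loss, var_list=[a])  # only 'a' is updated
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)
    print(sess.run([a, b]))  # a has moved toward 1.0, b stays at 0.0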
line 89
self.sess.run(tf.global_variables_initializer())
sess.run(tf.global_variables_initializer())
initializes all the variables in the graph.
line 91
self.saver = tf.train.Saver()
tf.train.Saver() only needs to be created before entering the session (i.e. before with tf.Session() as sess:); it does not have to be created inside it. The model is then saved inside the session with saver.save(sess, check_dir + 'model.ckpt').
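A minimal save-and-restore sketch along these lines (the checkpoint path is a made-up example, not from the book's code):
import tensorflow as tf
w = tf.get_variable('w', initializer=1.0)
assign_op = w.assign(5.0)
saver = tf.train.Saver()                               # created before the session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op)
    saver.save(sess, './model.ckpt', global_step=100)  # writes ./model.ckpt-100.*
with tf.Session() as sess:
    saver.restore(sess, './model.ckpt-100')            # no re-initialization needed
    print(sess.run(w))                                 # 5.0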
line 97
self.saver.save(self.sess,model_path,global_step=global_step)
save(
sess,
save_path,
global_step=None,
latest_filename=None,
meta_graph_suffix='meta',
write_meta_graph=True,
write_state=True,
strip_default_attrs=False)
line 101
self.saver.restore(self.sess,model_path)
restore(
sess,
save_path)
line 113
with tf.variable_scope(scope):
Prototype:
def variable_scope(name_or_scope,
default_name=None,
values=None,
initializer=None,
regularizer=None,
caching_device=None,
partitioner=None,
custom_getter=None,
reuse=None,
dtype=None,
use_resource=None)
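variable_scope prefixes the names of all variables created inside it, which is what lets get_collection separate the 'eval' and 'target' parameter sets later. A quick check (illustrative only):
import tensorflow as tf
with tf.variable_scope('eval'):
    out = tf.layers.dense(tf.zeros([1, 4]), units=2)  # creates eval/dense/kernel and eval/dense/bias
for v in tf.global_variables():
    print(v.name)  # eval/dense/kernel:0, eval/dense/bias:0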
line 115
h_conv1 = tf.layers.conv2d(inputs=obs,
filters=32,
kernel_size=[8, 8],
strides=4,
padding="same",
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(mean=0,stddev=0.1),
bias_initializer=tf.constant_initializer(0.1),
trainable=trainable)
Prototype:
conv2d(inputs, filters, kernel_size,
strides=(1, 1),
padding='valid',
data_format='channels_last',
dilation_rate=(1, 1),
activation=None,
use_bias=True,
kernel_initializer=None,
bias_initializer=tf.zeros_initializer(),
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
trainable=True,
name=None,
reuse=None)
where
kernel_initializer=tf.random_normal_initializer(mean=0,stddev=0.1),
line 120
h_pool1 = tf.layers.max_pooling2d(h_conv1,pool_size=[2, 2],strides=2,padding="same")
tf.layers.max_pooling2d(
inputs,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None
)
line 127
h_conv3_flat = tf.reshape(h_conv3,[-1,1600])
tf.reshape(tensor,shape,name=None)
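The 1600 comes from tracking the spatial size through the layers with "same" padding: the 80x80 input becomes 20x20 after the stride-4 convolution, 10x10 after the 2x2 pooling with stride 2, 5x5 after the stride-2 convolution, and stays 5x5 with 64 channels after the stride-1 convolution, so 5*5*64 = 1600. A quick shape check under the same layer settings as above:
import tensorflow as tf
x = tf.placeholder(tf.float32, [None, 80, 80, 4])
h1 = tf.layers.conv2d(x, 32, [8, 8], strides=4, padding="same")      # (?, 20, 20, 32)
p1 = tf.layers.max_pooling2d(h1, [2, 2], strides=2, padding="same")  # (?, 10, 10, 32)
h2 = tf.layers.conv2d(p1, 64, [4, 4], strides=2, padding="same")     # (?, 5, 5, 64)
h3 = tf.layers.conv2d(h2, 64, [3, 3], strides=1, padding="same")     # (?, 5, 5, 64)
print(h3.shape)  # (?, 5, 5, 64), and 5 * 5 * 64 = 1600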
line 134
h_fc1 = tf.layers.dense(inputs=h_conv3_flat,
units=512,
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0,stddev=0.1),
bias_initializer=tf.constant_initializer(0.1),
trainable=trainable)
line 160
a_t[random.randrange(ACTIONS)] = 1
random.randrange(start, stop[, step]); with a single argument it returns a random integer in the range [0, stop).
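Inside epsilon_greedy, random.randrange(ACTIONS) picks a uniformly random action index with probability epsilon. A self-contained sketch of the same selection rule, using a dummy Q vector in place of the network output:
import random
import numpy as np
ACTIONS = 2
epsilon = 0.1
q_values = np.array([0.3, 1.2])          # stand-in for the network's Q output
a_t = np.zeros([ACTIONS])
if np.random.uniform() < 1 - epsilon:
    a_t[np.argmax(q_values)] = 1         # exploit: greedy action
else:
    a_t[random.randrange(ACTIONS)] = 1   # explore: uniform random action
print(a_t)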
line 174
x_t = cv2.cvtColor(cv2.resize(x_t,(80,80)),
cv2.COLOR_BGR2GRAY)
image_hsv = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
where
cv2.resize(src,dsize,dst=None,fx=None,fy=None,interpolation=None)
line 175
ret,x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
cv2.threshold(src,thresh,maxval,type[,dst])
type | pixels below the threshold | pixels above the threshold |
---|---|---|
0 (THRESH_BINARY) | set to 0 | set to maxval |
1 (THRESH_BINARY_INV) | set to maxval | set to 0 |
2 (THRESH_TRUNC) | kept as is | set to the threshold |
3 (THRESH_TOZERO) | set to 0 | kept as is |
4 (THRESH_TOZERO_INV) | kept as is | set to 0 |
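The training loop uses type 0 (cv2.THRESH_BINARY) with a threshold of 1, so every pixel brighter than 1 becomes 255 and everything else becomes 0. A small check (illustrative only):
import cv2
import numpy as np
img = np.array([[0, 1, 2], [10, 0, 200]], dtype=np.uint8)
ret, binary = cv2.threshold(img, 1, 255, cv2.THRESH_BINARY)
print(binary)  # [[  0   0 255] [255   0 255]]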
line 176
s_t = np.stack((x_t,x_t,x_t,x_t),axis=2)
numpy.stack(arrays, axis)
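Stacking four copies of the first preprocessed frame along axis=2 produces the (80, 80, 4) state the network expects. A shape check (illustrative only):
import numpy as np
x_t = np.zeros((80, 80))
s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
print(s_t.shape)  # (80, 80, 4)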
line 190
s_t1 =np.append(x_t1,s_t[:,:,:3],axis=2)
np.append(arr, values, axis=None)
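np.append with axis=2 slides the frame window: the newest frame goes in front and the oldest of the four stacked frames is dropped. A quick check of the shape and channel order (illustrative only):
import numpy as np
s_t = np.stack([np.full((80, 80), i) for i in range(4)], axis=2)  # frames 0, 1, 2, 3
x_t1 = np.full((80, 80, 1), 9)                                    # newest frame
s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)                     # prepend frame 9, drop frame 3
print(s_t1.shape, s_t1[0, 0])  # (80, 80, 4) [9 0 1 2]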
line 207
self.sess.run(self.q_train_op,feed_dict={self.obs:train_s,self.action:train_a,self.Q_target:target_q})
tmp = self.sess.run(self.out,
feed_dict={self.input: [x[i*self.img_size:(i+1)*self.img_size,
j*self.img_size:(j+1)*self.img_size]]})[0]