The rules that determine the best action to take in an environment are called a policy, and a network that learns such a policy is called a policy network.
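Before building the full Pong agent, here is a minimal sketch (plain NumPy, with a made-up 4-dimensional state and 3 actions) of what a stochastic policy is: a mapping from a state to a probability distribution over actions, from which the next action is sampled.

# Minimal policy sketch (toy state and weights, for illustration only)
import numpy as np

def softmax(z):
    z = z - np.max(z)        # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

state = np.random.randn(4)   # a toy state vector
W = np.random.randn(4, 3)    # a single-layer "policy network"
action_probs = softmax(state @ W)               # distribution over 3 actions
action = np.random.choice(3, p=action_probs)    # sample the next action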
import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt

# The TF1-style graph code below needs graph (non-eager) mode,
# so eager execution is disabled once, before any variables or ops are created.
tf.compat.v1.disable_eager_execution()
# Pong environment
env = gym.make("Pong-v0")
observation = env.reset()
for i in range(22):
    # the ball is served after roughly 20 frames
    if i > 20:
        plt.imshow(observation)
        plt.show()
    # step the environment to get the next observation
    observation, _, _, _ = env.step(1)
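The loop above uses action 1 to serve the ball. As an optional sanity check (assuming an Atari build of Gym that exposes get_action_meanings on the unwrapped environment), the observation shape and the meaning of each numeric action can be inspected:

# Optional environment sanity check (assumes an Atari Gym build)
print(env.observation_space.shape)          # (210, 160, 3) for Pong-v0
print(env.unwrapped.get_action_meanings())  # e.g. ['NOOP', 'FIRE', 'RIGHT', 'LEFT', ...]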
# Function to preprocess the input frames
def preprocess_frame(frame):
    # crop away the top of the image and part of the background
    frame = frame[35:195, 10:150]
    # downsample by a factor of 2 and keep a single color channel (effectively grayscale)
    frame = frame[::2, ::2, 0]
    # set the background pixels to 0
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    # set the paddles and the ball to 1
    frame[frame != 0] = 1
    return frame.astype(float).ravel()
obs_preprocessed = preprocess_frame(observation).reshape(80, 70)
plt.imshow(obs_preprocessed, cmap='gray')
plt.show()
observation_next, _, _, _ = env.step(1)
diff = preprocess_frame(observation_next) - preprocess_frame(observation)
plt.imshow(diff.reshape(80, 70), cmap='gray')
plt.show()
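A quick check of the preprocessing output (assuming the raw Pong-v0 frame shape of (210, 160, 3)): the crop keeps 160 x 140 pixels, downsampling by 2 gives 80 x 70, and flattening yields 5600 input values.

# Shape check for the preprocessing above
processed = preprocess_frame(observation)
print(observation.shape)     # (210, 160, 3)
print(processed.shape)       # (5600,) i.e. 80 * 70
print(np.unique(processed))  # typically only 0.0 and 1.0 after binarization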
input_dim = 80 * 70
hidden_L1 = 400
hidden_L2 = 200
actions = [1, 2, 3]
n_actions = len(actions)
model = {}
with tf.compat.v1.variable_scope('L1', reuse=False):
    init_W1 = tf.compat.v1.truncated_normal_initializer(mean=0, stddev=1. / np.sqrt(input_dim), dtype=tf.float32)
    model['W1'] = tf.compat.v1.get_variable('W1', [input_dim, hidden_L1], initializer=init_W1)
with tf.compat.v1.variable_scope('L2', reuse=False):
    init_W2 = tf.compat.v1.truncated_normal_initializer(mean=0, stddev=1. / np.sqrt(hidden_L1), dtype=tf.float32)
    model['W2'] = tf.compat.v1.get_variable('W2', [hidden_L1, n_actions], initializer=init_W2)
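Note that hidden_L2 is declared but not used: the network consists of only two weight matrices, W1 (5600 x 400) and W2 (400 x 3). A quick count of the trainable parameters:

# Rough parameter count for the two weight matrices defined above
n_params = input_dim * hidden_L1 + hidden_L1 * n_actions
print(n_params)  # 5600*400 + 400*3 = 2,241,200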
# Policy function: forward pass through the policy network
def policy_forward(x):
    x = tf.matmul(x, model['W1'])
    x = tf.nn.relu(x)
    x = tf.matmul(x, model['W2'])
    p = tf.nn.softmax(x)
    return p
# Discounted-reward function
def discounted_rewards(reward, gamma):
    discounted_function = lambda a, v: a * gamma + v
    # accumulate from the last time step backwards, then restore the original order
    reward_reverse = tf.scan(discounted_function, tf.reverse(reward, [0]))
    discounted_reward = tf.reverse(reward_reverse, [0])
    return discounted_reward
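To make the tf.scan version easier to follow, here is the same computation as a plain-NumPy sketch: each reward is added to a running total that decays by gamma at every step.

# NumPy sketch of the discounted-reward computation (illustration only)
def discounted_rewards_np(rewards, gamma):
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = running * gamma + rewards[t]
        discounted[t] = running
    return discounted

print(discounted_rewards_np(np.array([0.0, 0.0, 1.0]), 0.99))
# [0.9801  0.99  1.  ]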
learning_rate = 0.001
gamma = 0.99
batch_size = 10
# Define the placeholders; the backward update is set up separately
episode_x = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, input_dim])
episode_y = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, n_actions])
episode_reward = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 1])
episode_discounted_reward = discounted_rewards(episode_reward, gamma)
episode_mean, episode_variance = tf.nn.moments(episode_discounted_reward, [0], shift=None)
# Normalize the discounted rewards
episode_discounted_reward -= episode_mean
episode_discounted_reward /= tf.sqrt(episode_variance + 1e-6)
# Set up the optimizer
tf_aprob = policy_forward(episode_x)
loss = tf.nn.l2_loss(episode_y - tf_aprob)
optimizer = tf.compat.v1.train.AdadeltaOptimizer(learning_rate)
gradients = optimizer.compute_gradients(loss, var_list=tf.compat.v1.trainable_variables(), grad_loss=episode_discounted_reward)
train_op = optimizer.apply_gradients(gradients)
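Passing episode_discounted_reward as grad_loss seeds the backward pass, so each time step's gradient is effectively scaled by its normalized discounted reward: steps that led to above-average returns are reinforced, steps that led to below-average returns are suppressed. A roughly equivalent, more explicit formulation is sketched below; it is not part of the code above, and per_step_loss, weighted_loss and train_op_alt are made-up names.

# Sketch of an explicitly reward-weighted loss (hypothetical alternative)
per_step_loss = tf.reduce_sum(tf.square(episode_y - tf_aprob), axis=1, keepdims=True)  # shape [T, 1]
weighted_loss = tf.reduce_sum(episode_discounted_reward * per_step_loss)
# train_op_alt = optimizer.minimize(weighted_loss)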
# Initialize the graph and the session
sess = tf.compat.v1.InteractiveSession()
tf.compat.v1.global_variables_initializer().run()
# Saver for storing the trained model
saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
save_path = 'checkpoints/pong_rl.ckt'
obs_prev = None
xs, ys, rs = [], [], []
reward_sum = 0
episode_number = 0
reward_window = None
reward_best = -22
history = []
observation = env.reset()
while True:
    # toggle rendering of the game screen
    if True:
        env.render()
    # preprocess the observation and feed the network the difference
    # between the current and the previous frame
    obs_cur = preprocess_frame(observation)
    obs_diff = obs_cur - obs_prev if obs_prev is not None else np.zeros(input_dim)
    obs_prev = obs_cur
    # sample an action from the policy
    feed = {episode_x: np.reshape(obs_diff, (1, -1))}
    aprob = sess.run(tf_aprob, feed)
    aprob = aprob[0, :]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob)
    label[action] = 1
    # step the environment and get the next observation, reward and status
    observation, reward, done, info = env.step(action + 1)
    reward_sum += reward
    # record the game history
    xs.append(obs_diff)
    ys.append(label)
    rs.append(reward)
    if done:
        history.append(reward_sum)
        reward_window = -21 if reward_window is None else np.mean(history[-100:])
        # update the weights with the stored values - update the policy
        feed = {episode_x: np.vstack(xs), episode_y: np.vstack(ys), episode_reward: np.vstack(rs)}
        _ = sess.run(train_op, feed)
        print('episode {:2d}: reward: {:2.0f}'.format(episode_number, reward_sum))
        xs, ys, rs = [], [], []
        episode_number += 1
        observation = env.reset()
        reward_sum = 0
        # save the best model every 10 episodes
        if (episode_number % 10 == 0) and (reward_window > reward_best):
            saver.save(sess, save_path, global_step=episode_number)
            reward_best = reward_window
            print('save best model {:2d}: {:2.5f} (reward window)'.format(episode_number, reward_window))
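After interrupting the training loop, the saved policy can be restored and run greedily. A minimal evaluation sketch (assuming at least one checkpoint was written into the checkpoints/ directory by the Saver above):

# Evaluation sketch: restore the latest checkpoint and play one game greedily
ckpt = tf.train.latest_checkpoint('checkpoints')
if ckpt is not None:
    saver.restore(sess, ckpt)
    observation = env.reset()
    obs_prev = None
    done = False
    while not done:
        env.render()
        obs_cur = preprocess_frame(observation)
        obs_diff = obs_cur - obs_prev if obs_prev is not None else np.zeros(input_dim)
        obs_prev = obs_cur
        aprob = sess.run(tf_aprob, {episode_x: np.reshape(obs_diff, (1, -1))})[0, :]
        action = np.argmax(aprob)  # act greedily at evaluation time
        observation, reward, done, info = env.step(action + 1)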