[Reinforcement Learning] Implementing Policy Gradients in the Pong Environment

Problem description:

The rules that determine the best action to take in a given environment are called a policy; a network that learns such a policy is called a policy network.
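
For reference, the policy-gradient (REINFORCE) estimator that the code below approximates can be written as follows (standard textbook form, not taken from the book):

    \nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{t=1}^{N} \nabla_\theta \log \pi_\theta(a_t \mid s_t) \, R_t

where \pi_\theta is the action distribution produced by the policy network and R_t is the (normalized) discounted return from step t. In the code, this return-weighting roughly corresponds to the grad_loss=episode_discounted_reward argument passed to the optimizer.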

Code:

import os

import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt

# Run the TF1-style graph code below in compatibility (graph) mode.
# These calls must come before any graph variables or placeholders are created.
tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

# Pong environment
env = gym.make("Pong-v0")
observation = env.reset()
for i in range(22):
    # the ball is served after roughly 20 frames
    if i > 20:
        plt.imshow(observation)
        plt.show()
    # take a step (action 1 = FIRE) to get the next observation
    observation, _, _, _ = env.step(1)
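
Before preprocessing, it helps to confirm what the raw observation looks like and what the discrete actions mean; the sketch below assumes the standard Gym Atari API (env.unwrapped.get_action_meanings()) and explains why the training code later restricts itself to actions 1, 2 and 3 (serve / move the paddle up / move it down):

# Quick inspection of the Pong-v0 interface (sketch; standard Gym Atari API assumed)
print(observation.shape)                    # (210, 160, 3) RGB frame
print(env.action_space)                     # Discrete(6)
print(env.unwrapped.get_action_meanings())  # typically ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']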


# Preprocess a raw input frame
def preprocess_frame(frame):
    # Crop away the top of the image (scoreboard) and part of the background
    frame = frame[35:195, 10:150]
    # Keep a single colour channel and downsample by a factor of 2
    frame = frame[::2, ::2, 0]
    # Set the background colours to 0
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    # Set the paddles and the ball to 1
    frame[frame != 0] = 1
    return frame.astype(np.float32).ravel()

obs_preprocessed = preprocess_frame(observation).reshape(80, 70)
plt.imshow(obs_preprocessed, cmap='gray')
plt.show()



# Difference between two consecutive preprocessed frames
observation_next, _, _, _ = env.step(1)
diff = preprocess_frame(observation_next) - preprocess_frame(observation)
plt.imshow(diff.reshape(80, 70), cmap='gray')
plt.show()
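
The network is fed the difference of two consecutive preprocessed frames rather than a single frame, so it can see motion (the direction of the ball and the paddles). As a quick sanity check of the preprocessing output, something like the following can be used (my addition, not part of the original listing):

# Sanity check: each preprocessed frame flattens to 80*70 = 5600 binary values
frame_vec = preprocess_frame(observation)
assert frame_vec.shape == (80 * 70,)
assert set(np.unique(frame_vec)) <= {0.0, 1.0}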

input_dim = 80 * 70
hidden_L1 = 400
hidden_L2 = 200
actions = [1, 2, 3]
n_actions = len(actions)

# Weight matrices of the policy network
model = {}
with tf.compat.v1.variable_scope('L1', reuse=False):
    init_W1 = tf.compat.v1.truncated_normal_initializer(mean=0, stddev=1. / np.sqrt(input_dim), dtype=tf.float32)
    model['W1'] = tf.compat.v1.get_variable('W1', [input_dim, hidden_L1], initializer=init_W1)

with tf.compat.v1.variable_scope('L2', reuse=False):
    init_W2 = tf.compat.v1.truncated_normal_initializer(mean=0, stddev=1. / np.sqrt(hidden_L1), dtype=tf.float32)
    model['W2'] = tf.compat.v1.get_variable('W2', [hidden_L1, n_actions], initializer=init_W2)
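
The policy network is therefore a simple two-layer affine model: 5600 input pixels, 400 hidden units and 3 output logits, one per allowed action (note that hidden_L2 is defined but not used by this model). A quick check of the parameter shapes (sketch only):

# Parameter shapes of the two-layer policy network
print(model['W1'].shape)  # (5600, 400)
print(model['W2'].shape)  # (400, 3)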

# Policy function: two-layer forward pass ending in a softmax over the actions
def policy_forward(x):
    x = tf.matmul(x, model['W1'])
    x = tf.nn.relu(x)
    x = tf.matmul(x, model['W2'])
    p = tf.nn.softmax(x)
    return p

# Discounted reward function
def discounted_rewards(reward, gamma):
    discounted_function = lambda a, v: a * gamma + v
    reward_reverse = tf.scan(discounted_function, tf.reverse(reward, [0]))
    discounted_reward = tf.reverse(reward_reverse, [0])
    return discounted_reward
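
The tf.scan over the reversed reward sequence implements the usual recursion R_t = r_t + gamma * R_{t+1}. A plain NumPy version of the same computation, shown only for clarity (assuming a 1-D reward array), would be:

# NumPy reference for the discounted-return recursion (for clarity only)
def discounted_rewards_np(rewards, gamma):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(discounted_rewards_np([0.0, 0.0, 1.0], 0.99))  # approximately [0.9801, 0.99, 1.0]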

learning_rate = 0.001
gamma = 0.99
batch_size = 10


# Define the placeholders; the backward (policy) update is wired up separately below
episode_x = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, input_dim])
episode_y = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, n_actions])
episode_reward = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 1])

episode_discounted_reward = discounted_rewards(episode_reward, gamma)
episode_mean, episode_variance = tf.nn.moments(episode_discounted_reward, [0], shift=None)

# Normalize the discounted rewards
episode_discounted_reward -= episode_mean
episode_discounted_reward /= tf.sqrt(episode_variance + 1e-6)

# Optimizer setup
tf_aprob = policy_forward(episode_x)
loss = tf.nn.l2_loss(episode_y - tf_aprob)
optimizer = tf.compat.v1.train.AdadeltaOptimizer(learning_rate)
gradients = optimizer.compute_gradients(loss, var_list=tf.compat.v1.trainable_variables(),
                                        grad_loss=episode_discounted_reward)
train_op = optimizer.apply_gradients(gradients)
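
Passing the normalized discounted return through grad_loss seeds the backward pass, so the gradient of the L2 loss is effectively scaled by how good the corresponding returns were; this is how the return-weighting of the policy gradient enters the update. An alternative, more explicit formulation (my own sketch, not the book's code) would weight a per-step loss directly and minimize it:

# Alternative sketch (assumption, not from the book): weight a per-step loss by the return
per_step_loss = tf.reduce_sum(tf.square(episode_y - tf_aprob), axis=1, keepdims=True)
weighted_loss = tf.reduce_mean(per_step_loss * episode_discounted_reward)
train_op_alt = tf.compat.v1.train.AdadeltaOptimizer(learning_rate).minimize(weighted_loss)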

# Initialize the graph and start a session
sess = tf.compat.v1.InteractiveSession()
tf.compat.v1.global_variables_initializer().run()

# Checkpointing setup for the trained model (create the directory if it does not exist)
if not os.path.exists('checkpoints'):
    os.makedirs('checkpoints')
saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
save_path = 'checkpoints/pong_rl.ckt'
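
If training is interrupted, the most recent checkpoint can be restored into the same session before continuing; a minimal sketch (assuming checkpoints were written by the saver above into the 'checkpoints' directory):

# Optional: resume from the most recent checkpoint, if one exists (sketch)
latest_ckpt = tf.compat.v1.train.latest_checkpoint('checkpoints')
if latest_ckpt is not None:
    saver.restore(sess, latest_ckpt)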



# Training state
obs_prev = None
xs, ys, rs = [], [], []
reward_sum = 0
episode_number = 0
reward_window = None
reward_best = -22
history = []

observation = env.reset()
while True:
    env.render()  # comment this out to train without rendering

    # Preprocess the observation and feed the frame difference to the network
    obs_cur = preprocess_frame(observation)
    obs_diff = obs_cur - obs_prev if obs_prev is not None else np.zeros(input_dim)
    obs_prev = obs_cur

    # Sample one action from the policy
    feed = {episode_x: np.reshape(obs_diff, (1, -1))}
    aprob = sess.run(tf_aprob, feed)
    aprob = aprob[0, :]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob)
    label[action] = 1

    # Send the action to the environment and get the next observation, reward and done flag
    observation, reward, done, info = env.step(action + 1)
    if done:
        observation = env.reset()
    reward_sum += reward

    # Record the game history
    xs.append(obs_diff)
    ys.append(label)
    rs.append(reward)

    if done:
        history.append(reward_sum)
        reward_window = -21 if reward_window is None else np.mean(history[-100:])

        # Update the weights with the stored episode history, i.e. update the policy
        feed = {episode_x: np.vstack(xs), episode_y: np.vstack(ys), episode_reward: np.vstack(rs)}
        _ = sess.run(train_op, feed)
        print('episode {:2d}: reward {:2.0f}'.format(episode_number, reward_sum))
        xs, ys, rs = [], [], []
        episode_number += 1
        observation = env.reset()
        obs_prev = None  # start the next episode with an empty frame difference
        reward_sum = 0
        # Every 10 episodes, save the model if the reward window has improved
        if (episode_number % 10 == 0) and (reward_window > reward_best):
            saver.save(sess, save_path, global_step=episode_number)
            reward_best = reward_window
            print('save best model {:2d}: {:2.5f} (reward window)'.format(episode_number, reward_window))
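
Because the loop above never terminates on its own, evaluating the trained policy has to be done separately. A minimal greedy-rollout sketch (my own addition, reusing the session, graph and preprocessing defined above):

# Greedy evaluation sketch (not part of the original listing)
def play_one_episode(env, sess):
    obs, prev, total = env.reset(), None, 0.0
    while True:
        cur = preprocess_frame(obs)
        diff = cur - prev if prev is not None else np.zeros(input_dim)
        prev = cur
        probs = sess.run(tf_aprob, {episode_x: diff.reshape(1, -1)})[0]
        obs, r, done, _ = env.step(int(np.argmax(probs)) + 1)  # greedy action
        total += r
        if done:
            return total

# print(play_one_episode(gym.make("Pong-v0"), sess))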

Screenshot:

(Figure: screenshot from the original post; the image is not reproduced here.)

Reference:

《Python深度学习实战:75个有关神经网络建模、强化学习与迁移》
