This project is built with the PaddlePaddle framework; for details, see the project 玩转Atari-Pong游戏 (Playing the Atari Pong game).
You control the right paddle, you compete against the left paddle controlled by the computer. You each try to keep deflecting the ball away from your goal and into your opponent’s goal.
Game demo:
As the animated GIF shows, the untrained right paddle is no match for the left paddle at all, so our goal is to train the right paddle until it can beat the left one.
States, actions, and rewards of the Pong environment: the observation space is Box(210, 160, 3), i.e. a 3-channel color image, and the action space is Discrete(6), i.e. 6 discrete actions. An online write-up (Pong 环境介绍) points out that only 3 of the 6 actions are actually useful; it is worth reading for a deeper understanding.
Training results:
Before running main.ipynb, please run help.ipynb first to build the dependency environment!
At present the AI Studio platform does not ship with the Atari environments, so we have to install them ourselves. To avoid reinstalling every time, the installation steps are written into help.ipynb; run our help.ipynb once for a persistent install. The main installation commands are listed in help.ipynb.
Note: the 4th and 5th install commands may not succeed on the first try, so just run them a few more times; the 6th command only needs to be run once per project.
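Purely as an assumed sketch of the persistent-install pattern on AI Studio (the package names and paths below are illustrative guesses, not the author's actual command list from help.ipynb):
# Assumed sketch of a persistent install on AI Studio (not the author's exact commands):
# packages go into a directory that survives environment resets and is later added to sys.path.
!mkdir -p /home/aistudio/external-libraries
!pip install gym -t /home/aistudio/external-libraries
!pip install atari-py -t /home/aistudio/external-libraries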
Note that you must first add the self-installed Atari environment to the Python path, i.e.
sys.path.append('/home/aistudio/external-libraries')
import sys
sys.path.append('/home/aistudio/external-libraries')
import gym
import numpy as np
import time
import matplotlib.pyplot as plt
import paddle
import os
from collections import deque, Counter
from visualdl import LogWriter
import copy
from matplotlib import animation
from PIL import Image
Check that the environment loads successfully, and inspect the observation space and action space.
env = gym.make('Pong-v4')
print(env.observation_space)
print(env.action_space)
Box(210, 160, 3)
Discrete(6)
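As mentioned above, only a few of Pong's six actions actually differ in effect. To see what each action index stands for, you can query the underlying ALE environment (a quick check, assuming the classic gym Atari wrapper):
# Inspect what the six discrete actions mean (via the underlying ALE environment)
print(env.unwrapped.get_action_meanings())
# typically: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']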
Here we first define the state preprocessing function preprocess, which works as follows:
def preprocess(image):
    """ Preprocess a 210x160x3 uint8 frame into a 6400-dim (80x80) 1-D float vector """
    image = image[35:195]        # crop the playing area
    image = image[::2, ::2, 0]   # downsample by a factor of 2 and keep one channel
    image[image == 144] = 0      # erase background (background type 1)
    image[image == 109] = 0      # erase background (background type 2)
    image[image != 0] = 1        # binarize: everything except the background becomes 1
    return image.astype(np.float32).ravel()  # flatten to shape (6400,)
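A one-line check (illustrative, reusing the env created above) that the preprocessing really yields a flat 6400-dim vector:
# the raw 210x160x3 frame becomes a flat (6400,) vector
print(preprocess(env.reset()).shape)  # (6400,)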
def show_image(status):
    status1 = status[35:195]        # crop the valid playing area
    status2 = status1[::2, ::2, 0]  # downsample
    # Inspect which pixel values make up the frame
    def see_color(status):
        allcolor = []
        for i in range(80):
            allcolor.extend(status[i])
        dict_color = Counter(allcolor)
        print("Pixel value counts:", dict_color)
    see_color(status2)
    # After inspecting the pixel values, erase the background
    def togray(image_in):
        image = image_in.copy()
        image[image == 144] = 0  # erase background (background type 1)
        image[image == 109] = 0  # erase background (background type 2)
        image[image != 0] = 1    # binarize: everything except the background becomes 1
        return image
    status3 = togray(status2)
    # Visualize the intermediate images of the pipeline
    def show_status(list_status):
        fig = plt.figure(figsize=(8, 32), dpi=200)
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.3, hspace=0)
        for i in range(len(list_status)):
            plt.subplot(1, len(list_status), i + 1)
            plt.imshow(list_status[i], cmap=plt.cm.binary)
        plt.show()
    show_status([status, status1, status2, status3])

status = env.reset()  # raw frame
show_image(status)
Pixel value counts: Counter({109: 6382, 101: 16, 53: 2})
![Intermediate preprocessing images: raw, cropped, downsampled, binarized](main_files/main_13_2.png)
# Step the environment with random actions so that the ball and paddles appear on screen
for i in range(200):
    action = env.action_space.sample()
    status, reward, done, info = env.step(action)
show_image(status)
Pixel value counts: Counter({144: 6366, 213: 16, 92: 16, 236: 2})
![Frame after 200 random actions: raw, cropped, downsampled, binarized](main_files/main_15_1.png)
class Model(paddle.nn.Layer):
    """ A simple fully connected policy network.
    Args:
        obs_dim (int): dimension of the observation space.
        act_dim (int): dimension of the action space.
    """
    def __init__(self, obs_dim, act_dim):
        super(Model, self).__init__()
        hid1_size = 256
        hid2_size = 64
        self.fc1 = paddle.nn.Linear(obs_dim, hid1_size)
        self.fc2 = paddle.nn.Linear(hid1_size, hid2_size)
        self.fc3 = paddle.nn.Linear(hid2_size, act_dim)

    def forward(self, obs):
        h1 = paddle.nn.functional.relu(self.fc1(obs))
        h2 = paddle.nn.functional.relu(self.fc2(h1))
        prob = paddle.nn.functional.softmax(self.fc3(h2), axis=-1)
        return prob
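As a quick sanity check of the network above (a toy forward pass, not part of the training code), you can feed it a dummy batch and confirm that it outputs a probability distribution over the six actions:
# Toy forward pass: one dummy observation of 80*80 = 6400 zeros
test_model = Model(obs_dim=6400, act_dim=6)
dummy_obs = paddle.zeros([1, 6400], dtype='float32')
probs = test_model(dummy_obs)
print(probs.shape)          # [1, 6]
print(probs.sum().numpy())  # ~1.0, since softmax outputs a distribution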
REINFORCE is one of the classic reinforcement-learning algorithms; you can refer to our earlier project 【强化学习】REINFORCE算法. Here we only define two functions: predict and learn.
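For reference (notation added here, not from the original write-up), the update implemented in learn below is the standard REINFORCE objective, where $G_t$ is the normalized discounted return computed later by calc_reward_to_go:
$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\!\left[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\, G_t\right], \qquad \text{loss} = -\frac{1}{T}\sum_{t=0}^{T-1} \log \pi_\theta(a_t \mid s_t)\, G_t$$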
# Policy gradient algorithm (REINFORCE)
class PolicyGradient():
    def __init__(self, model, lr):
        self.model = model
        self.optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=self.model.parameters())

    def predict(self, obs):
        prob = self.model(obs)
        return prob

    def learn(self, obs, action, reward):
        prob = self.model(obs)
        # log probability of the taken actions under the current policy
        log_prob = paddle.distribution.Categorical(prob).log_prob(action)
        # REINFORCE surrogate loss: maximize log_prob * return, i.e. minimize its negative
        loss = paddle.mean(-1 * log_prob * reward)
        self.optimizer.clear_grad()
        loss.backward()
        self.optimizer.step()
        return loss
class Agent():
    def __init__(self, algorithm):
        self.alg = algorithm
        if os.path.exists("./savemodel"):
            print("Loading parameters from file....")
            try:
                self.load()
                print("Finished loading parameters from file....")
            except:
                print("Failed to load parameters, training from scratch....")

    def sample(self, obs):
        """ Sample an action (with exploration) given observation obs """
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        prob = prob.numpy()
        act = np.random.choice(len(prob), 1, p=prob)[0]  # sample an action according to the action probabilities
        return act

    def predict(self, obs):
        """ Choose the best action given observation obs """
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        act = prob.argmax().numpy()[0]  # pick the action with the highest probability
        return act

    def learn(self, obs, act, reward):
        """ Update the model parameters once with a batch of training data """
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        loss = self.alg.learn(obs, act, reward)
        return loss.numpy()[0]

    def save(self):
        paddle.save(self.alg.model.state_dict(), './savemodel/PG-Pong_net.pdparams')
        paddle.save(self.alg.optimizer.state_dict(), "./savemodel/opt.pdopt")

    def load(self):
        # load the network parameters
        model_state_dict = paddle.load('./savemodel/PG-Pong_net.pdparams')
        self.alg.model.set_state_dict(model_state_dict)
        # # load the optimizer parameters
        # optimizer_state_dict = paddle.load("./savemodel/opt.pdopt")
        # self.alg.optimizer.set_state_dict(optimizer_state_dict)
# Run one training episode
def run_train_episode(agent, env):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
        obs_list.append(obs)
        action = agent.sample(obs)
        action_list.append(action)
        obs, reward, done, info = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list
# Evaluate the agent: run 5 episodes and average the total reward
def run_evaluate_episodes(agent, env, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
            action = agent.predict(obs)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
Apply reward discounting; the discount factor gamma defaults to 0.99.
def calc_reward_to_go(reward_list, gamma=0.99):
"""calculate discounted reward"""
reward_arr = np.array(reward_list)
for i in range(len(reward_arr) - 2, -1, -1):
# G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
reward_arr[i] += gamma * reward_arr[i + 1]
# normalize episode rewards
reward_arr -= np.mean(reward_arr)
reward_arr /= np.std(reward_arr)
return reward_arr
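A tiny worked example (a hypothetical 3-step episode, not taken from the notebook) of what calc_reward_to_go produces; note the rewards are floats, as env.step() returns them:
# Hypothetical 3-step episode where only the last step is rewarded
rewards = [0.0, 0.0, 1.0]
# Discounted returns before normalization: G_2 = 1.0, G_1 = 0.99, G_0 = 0.9801.
# The function then standardizes them (zero mean, unit std), so earlier steps
# come out negative and the rewarded final step positive:
print(calc_reward_to_go(rewards, gamma=0.99))  # roughly [-1.22, -0.004, 1.23]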
For ease of demonstration we only continue training for 100 episodes; readers can increase this number to get better results.
def main():
    env = gym.make('Pong-v4')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    print('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent following the PARL-style model/algorithm/agent structure
    LEARNING_RATE = 5e-4
    model = Model(obs_dim=obs_dim, act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg)

    twriter = LogWriter('./logs/PG_Pong')
    last_test_total_reward = 0  # best evaluation reward seen so far
    for i in range(100):  # default 3000
        obs_list, action_list, reward_list = run_train_episode(agent, env)
        twriter.add_scalar('reward', sum(reward_list), i)
        if i % 50 == 0:
            print("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)

        if (i + 1) % 100 == 0:
            # set render=True to watch the evaluation
            total_reward = run_evaluate_episodes(agent, env, render=False)
            print('Test reward: {}'.format(total_reward))
            # save the parameters only when the evaluation reward improves
            if last_test_total_reward < total_reward:
                last_test_total_reward = total_reward
                agent.save()

# Run the whole program
main()
obs_dim 6400, act_dim 6
W1022 22:01:06.998914 174 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W1022 22:01:07.003042 174 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.
Loading parameters from file....
Finished loading parameters from file....
Episode 0, Reward Sum 14.0.
Episode 50, Reward Sum 8.0.
Test reward: 12.0
def save_frames_as_gif(frames, filename):
#Mess with this to change frame size
plt.figure(figsize=(frames[0].shape[1]/100, frames[0].shape[0]/100), dpi=300)
patch = plt.imshow(frames[0])
plt.axis('off')
def animate(i):
patch.set_data(frames[i])
anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
anim.save(filename, writer='pillow', fps=60)
model = Model(6400, 6)
model_state_dict = paddle.load("./savemodel/PG-Pong_net.pdparams")
model.set_state_dict(model_state_dict)

env = gym.make('Pong-v4')
state = env.reset()
frames = []
done = 0
i = 0
reward_list = []
while not done:
    frames.append(env.render(mode="rgb_array"))
    obs = preprocess(state)
    obs = paddle.to_tensor(obs, dtype='float32')
    prob = model(obs)
    action = prob.argmax().numpy()[0]
    next_state, reward, done, _ = env.step(action)
    if reward != 0:
        reward_list.append(reward)
        print(i, " ", reward, done)
    state = next_state
    i += 1

reward_counter = Counter(reward_list)
print(reward_counter)
print("Your score:", reward_counter[1.0], "Opponent's score:", reward_counter[-1.0])
if reward_counter[1.0] > reward_counter[-1.0]:
    print("Congratulations, you won!!!")
else:
    print("So close! Train the agent network a bit more and come back to challenge again QWQ")
save_frames_as_gif(frames, filename="Pong-v4_trained.gif")
env.close()
199 1.0 False
732 1.0 False
937 1.0 False
1547 1.0 False
1676 1.0 False
1877 1.0 False
2165 1.0 False
2451 1.0 False
2575 1.0 False
2705 1.0 False
2995 1.0 False
3125 1.0 False
3331 1.0 False
3454 1.0 False
3584 1.0 False
3793 1.0 False
4885 1.0 False
5096 1.0 False
5698 1.0 False
5992 1.0 False
6202 1.0 True
Counter({1.0: 21})
Your score: 21 Opponent's score: 0
Congratulations, you won!!!
![A frame from the trained agent's episode](main_files/main_37_1.png)
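If you want to preview the saved GIF directly inside the notebook instead of downloading it, a small optional snippet (assuming IPython, which Jupyter/AI Studio notebooks provide) does the trick:
# Optional: preview the generated GIF inline in the notebook.
# Aliased to avoid clashing with PIL's Image imported earlier.
from IPython.display import Image as IPyImage
IPyImage(filename="Pong-v4_trained.gif")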
This project draws on PaddlePaddle's PARL; everyone is encouraged to give it a star or two.
![](https://ai-studio-static-online.cdn.bcebos.com/92d792700be949219afc12e2d76920190d929c42685e4d29917d3b34fd86fec7)
After 5000+ episodes of training, our agent has learned to win the game with a rapid "jitter" strategy, but most of the time it still cannot completely crush the opponent; when I have time I will keep training it or switch to a more efficient algorithm. This is also my first Atari project. Previously I only experimented with the classic control environments, and the change of environment raised the difficulty and lengthened the training time, but it also taught me a lot, which is great... Advice from more experienced readers is very welcome; 小黑 still has a long way to go, hehe!
My previous reinforcement-learning projects:
Everyone is welcome to come exchange ideas and learn together!!!