The agent must decide between two actions, moving the cart left or right, so that the pole attached to it stays upright.
The advantage actor-critic algorithm works as follows. We start with a policy $\pi$: an initial actor interacts with the environment and collects data. In the plain policy gradient method, once the data has been collected we use it to update the policy directly. In the actor-critic method, however, we do not use the data to update the policy directly. We first use the data to estimate the value function, which can be done with temporal-difference (TD) or Monte Carlo methods. Then, based on the value function, we update $\pi$ with the following formula.
$$
\nabla \bar{R}_\theta \approx \frac{1}{N} \sum_{n=1}^{N} \sum_{t=1}^{T_n}\left(r_t^n+V_\pi\left(s_{t+1}^n\right)-V_\pi\left(s_t^n\right)\right) \nabla \log p_\theta\left(a_t^n \mid s_t^n\right)
$$
With the new $\pi$, the actor interacts with the environment again, collects new data, and estimates the value function once more. The new value function is then used to update the policy, that is, to update the actor. This is how the whole advantage actor-critic algorithm operates.
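The term in parentheses in the formula above is a one-sample estimate of the advantage: the critic's value $V_\pi(s_t^n)$ serves as a baseline, and the TD error measures how much better the sampled action turned out than expected,

$$
A_\pi\left(s_t^n, a_t^n\right)=Q_\pi\left(s_t^n, a_t^n\right)-V_\pi\left(s_t^n\right) \approx r_t^n+V_\pi\left(s_{t+1}^n\right)-V_\pi\left(s_t^n\right),
$$

so only a single value network has to be learned. The implementations below additionally apply a discount factor $\gamma$ to the bootstrapped term.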
For a more detailed introduction to the advantage actor-critic algorithm, see my earlier post: 【EasyRL学习笔记】第九章 Actor-Critic 演员-评论员算法
Before studying the advantage actor-critic algorithm, it helps to already be familiar with the following (the value targets used by the last two are recalled just after this list):
- Deep Q-networks (DQN)
- Temporal-difference methods
- Monte Carlo methods
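As a reminder of how the last two estimate a value function: a Monte Carlo update regresses $V_\pi(s_t)$ towards the full return $G_t$, while a temporal-difference update bootstraps from the next state,

$$
V_\pi(s_t) \leftarrow V_\pi(s_t)+\alpha\left(G_t-V_\pi(s_t)\right), \qquad V_\pi(s_t) \leftarrow V_\pi(s_t)+\alpha\left(r_t+\gamma V_\pi(s_{t+1})-V_\pi(s_t)\right).
$$

The update() method in the code below mixes the two: it bootstraps once from $V(s_{T+1})$ at the end of a rollout and then accumulates discounted rewards backwards through the episode.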
Prepare an RL_Utils.py file; its contents can be obtained from another of my posts: 【RL工具类】强化学习常用函数工具类(Python代码)
This step is important, because the RL_Utils.py file is imported by the scripts below.
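The scripts below only use a handful of names from that file: all_seed, save_args, save_results and plot_rewards, plus the numpy / torch / os / Path re-exports that come in through the wildcard import. If you just want to run the code without fetching the original file, a minimal stand-in such as the following sketch is enough; save it as RL_Utils.py next to the script and adjust the import line accordingly. The function bodies here are my own assumptions that merely mirror the interfaces used below, not the original implementation.
# --- minimal RL_Utils.py stand-in (assumed interfaces; the real file is in the post linked above) ---
import os
import json
import random
from pathlib import Path

import numpy as np
import torch
import matplotlib.pyplot as plt


def all_seed(env, seed=1):
    """Seed the environment and the common RNGs for reproducibility."""
    env.seed(seed)  # old gym API (<0.26), matching the env.reset()/env.step() usage below
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)


def save_args(arg_dict, path):
    """Dump the hyperparameter dictionary to a JSON file."""
    Path(path).mkdir(parents=True, exist_ok=True)
    with open(f"{path}/args.json", "w", encoding="utf-8") as f:
        json.dump(arg_dict, f, ensure_ascii=False, indent=2)


def save_results(res_dic, tag, path):
    """Save the per-episode rewards as a .npy file."""
    Path(path).mkdir(parents=True, exist_ok=True)
    np.save(f"{path}/{tag}_rewards.npy", np.array(res_dic['rewards']))


def plot_rewards(rewards, arg_dict, path, tag):
    """Plot the reward curve; save and/or show it depending on the flags."""
    plt.figure()
    plt.plot(rewards, label=f"{tag} rewards")
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.legend()
    if arg_dict.get('save_fig', False):
        Path(path).mkdir(parents=True, exist_ok=True)
        plt.savefig(f"{path}/{tag}_rewards_curve.png")
    if arg_dict.get('show_fig', False):
        plt.show()
    plt.close()
With that file in place (or the original one from the linked post), the full two-network training script is as follows.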
import argparse
import datetime
import time
from collections import deque
import torch.nn.functional as F
import gym
from torch import nn
# Change this to the path of your own RL_Utils.py file
from Python.ReinforcementLearning.EasyRL.RL_Utils import *
# Experience buffer object (cleared after every update, so it only ever holds the current rollout)
class PGReplay:
def __init__(self):
self.buffer = deque()
def push(self, transitions):
self.buffer.append(transitions)
def sample(self):
batch = list(self.buffer)
return zip(*batch)
def clear(self):
self.buffer.clear()
def __len__(self):
return len(self.buffer)
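# Usage pattern in the code below: one transition is pushed per environment step,
# sample() returns the stored columns (values, log-probs, rewards) via zip(*batch),
# and clear() is called after every update, so this acts as an on-policy rollout
# buffer rather than a DQN-style replay memory.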
# Actor: for discrete actions, output a probability distribution over the actions (softmax); for continuous actions, output the action directly (sigmoid)
class ActorSoftmax(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim=256):
super(ActorSoftmax, self).__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, state):
dist = F.relu(self.fc1(state))
dist = F.softmax(self.fc2(dist), dim=1)
return dist
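# For CartPole-v0 this network maps a (1, 4) state batch to a (1, 2) vector of
# action probabilities; dim=1 in the softmax is the action dimension, which is
# why states are unsqueezed to a batch of size 1 before the forward pass.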
# Critic: outputs V_{\pi}(s)
class Critic(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim=256):
super(Critic, self).__init__()
assert output_dim == 1
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, state):
value = F.relu(self.fc1(state))
value = self.fc2(value)
return value
# A2C agent
class A2C:
def __init__(self, models, memory, arg_dict):
self.n_actions = arg_dict['n_actions']
self.gamma = arg_dict['gamma']
self.device = torch.device(arg_dict['device'])
self.memory = memory
self.actor = models['Actor'].to(self.device)
self.critic = models['Critic'].to(self.device)
self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=arg_dict['actor_lr'])
self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=arg_dict['critic_lr'])
def sample_action(self, state):
        # unsqueeze(): add a batch dimension
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        # Pass the current state to the actor, which returns a probability distribution over the actions
dist = self.actor(state)
        # Pass the current state to the critic, which returns the value of the current state
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
        # squeeze(): remove the batch dimension
        # Sample an action according to the probability distribution
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action, value, dist
    # Sampling (rather than always taking the most probable action) keeps the policy stochastic, so the agent's behaviour cannot be trivially predicted
def predict_action(self, state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action, value, dist
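    # predict_action also samples from the distribution; a common deterministic
    # alternative for evaluation (an option, not what is used here) would be:
    #   action = int(torch.argmax(dist, dim=1).item())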
def update(self, next_state, entropy):
value_pool, log_prob_pool, reward_pool = self.memory.sample()
next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
next_value = self.critic(next_state)
returns = np.zeros_like(reward_pool)
for t in reversed(range(len(reward_pool))):
next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
returns[t] = next_value
returns = torch.tensor(returns, device=self.device)
value_pool = torch.tensor(value_pool, device=self.device)
advantages = returns - value_pool
log_prob_pool = torch.stack(log_prob_pool)
actor_loss = (-log_prob_pool * advantages).mean()
critic_loss = 0.5 * advantages.pow(2).mean()
tot_loss = actor_loss + critic_loss + 0.001 * entropy
self.actor_optim.zero_grad()
self.critic_optim.zero_grad()
tot_loss.backward()
self.actor_optim.step()
self.critic_optim.step()
self.memory.clear()
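    # Note on the losses above: value_pool and returns are rebuilt with torch.tensor()
    # from detached numpy numbers, so `advantages` carries no gradient; critic_loss
    # therefore does not propagate back into the critic network, and the entropy term
    # (a plain float) only shifts the reported loss value. A variant that actually
    # trains the critic (my own sketch, not the original code) would keep the critic
    # output attached: store `self.critic(state).squeeze()` in the buffer, build
    # `value_pool = torch.stack(value_pool)` inside update(), compute
    # `critic_loss = 0.5 * (returns.detach() - value_pool).pow(2).mean()`,
    # and use `advantages.detach()` in the actor loss.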
def save_model(self, path):
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
torch.save(self.critic.state_dict(), f"{path}/critic_checkpoint.pt")
def load_model(self, path):
self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
self.critic.load_state_dict(torch.load(f"{path}/critic_checkpoint.pt"))
# Training function
def train(arg_dict, env, agent):
    # Start timing
startTime = time.time()
print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
print("开始训练智能体......")
rewards = []
steps = []
for i_ep in range(arg_dict['train_eps']):
ep_reward = 0
ep_step = 0
ep_entropy = 0
state = env.reset()
        # Collect up to ep_max_steps transitions per episode before updating the model
for _ in range(arg_dict['ep_max_steps']):
            # Render
if arg_dict['train_render']:
env.render()
            # Sample an action (exploration)
action, value, dist = agent.sample_action(state)
            # Take the action and get the next state and reward (one transition)
next_state, reward, done, _ = env.step(action)
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
            # Store the transition
agent.memory.push((value, log_prob, reward))
            # Move to the next state
state = next_state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
        # Update the agent's parameters
agent.update(next_state, ep_entropy)
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep + 1}/{arg_dict["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print('训练结束 , 用时: ' + str(time.time() - startTime) + " s")
    # Close the environment
env.close()
return {'episodes': range(len(rewards)), 'rewards': rewards}
# Test function
def test(arg_dict, env, agent):
startTime = time.time()
print("开始测试智能体......")
print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
rewards = []
steps = []
for i_ep in range(arg_dict['test_eps']):
ep_reward = 0
ep_step = 0
state = env.reset()
for _ in range(arg_dict['ep_max_steps']):
            # Render
if arg_dict['test_render']:
env.render()
            # Predict an action
action, _, _ = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep + 1}/{arg_dict['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("测试结束 , 用时: " + str(time.time() - startTime) + " s")
env.close()
return {'episodes': range(len(rewards)), 'rewards': rewards}
# Create the environment and the agent
def create_env_agent(arg_dict):
    # Create the environment
env = gym.make(arg_dict['env_name'])
    # Set the random seeds
all_seed(env, seed=arg_dict["seed"])
    # Get the number of states / state dimension
try:
n_states = env.observation_space.n
except AttributeError:
n_states = env.observation_space.shape[0]
    # Get the number of actions
n_actions = env.action_space.n
print(f"状态数: {n_states}, 动作数: {n_actions}")
    # Add the state and action dimensions to the parameter dictionary
arg_dict.update({"n_states": n_states, "n_actions": n_actions})
    # Instantiate the agent
models = {
'Actor': ActorSoftmax(arg_dict['n_states'], arg_dict['n_actions'], hidden_dim=arg_dict['actor_hidden_dim']),
'Critic': Critic(arg_dict['n_states'], 1, hidden_dim=arg_dict['critic_hidden_dim'])}
    # Experience buffer
memory = PGReplay()
agent = A2C(models, memory, arg_dict)
    # Return the environment and the agent
return env, agent
if __name__ == '__main__':
    # Prevent the error: OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
    # Get the current path
curr_path = os.path.dirname(os.path.abspath(__file__))
    # Get the current time
curr_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    # Parameter settings
parser = argparse.ArgumentParser(description="hyper parameters")
parser.add_argument('--algo_name', default='A2C', type=str, help="name of algorithm")
parser.add_argument('--env_name', default='CartPole-v0', type=str, help="name of environment")
parser.add_argument('--train_eps', default=1600, type=int, help="episodes of training")
parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
parser.add_argument('--ep_max_steps', default=100000, type=int,
help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma', default=0.99, type=float, help="discounted factor")
parser.add_argument('--actor_lr', default=3e-4, type=float, help="learning rate of actor")
parser.add_argument('--critic_lr', default=1e-3, type=float, help="learning rate of critic")
parser.add_argument('--actor_hidden_dim', default=256, type=int, help="hidden of actor net")
parser.add_argument('--critic_hidden_dim', default=256, type=int, help="hidden of critic net")
parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
parser.add_argument('--seed', default=520, type=int, help="seed")
parser.add_argument('--show_fig', default=False, type=bool, help="if show figure or not")
parser.add_argument('--save_fig', default=True, type=bool, help="if save figure or not")
parser.add_argument('--train_render', default=False, type=bool,
help="Whether to render the environment during training")
parser.add_argument('--test_render', default=True, type=bool,
help="Whether to render the environment during testing")
args = parser.parse_args()
default_args = {'result_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
    # Convert the parsed arguments into a dictionary (dict)
arg_dict = {**vars(args), **default_args}
print("算法参数字典:", arg_dict)
    # Create the environment and the agent
env, agent = create_env_agent(arg_dict)
    # Pass in the parameters, environment, and agent, then start training
res_dic = train(arg_dict, env, agent)
print("算法返回结果字典:", res_dic)
    # Save the relevant information
agent.save_model(path=arg_dict['model_path'])
save_args(arg_dict, path=arg_dict['result_path'])
save_results(res_dic, tag='train', path=arg_dict['result_path'])
plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="train")
# =================================================================================================
    # Create a fresh environment and agent for testing
print("=" * 300)
env, agent = create_env_agent(arg_dict)
    # Load the saved agent
agent.load_model(path=arg_dict['model_path'])
res_dic = test(arg_dict, env, agent)
save_results(res_dic, tag='test', path=arg_dict['result_path'])
plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="test")
Sample run output
Since some of the output is quite long, only part of it is shown below.
状态数: 4, 动作数: 2
环境名: CartPole-v0, 算法名: A2C, Device: cpu
开始训练智能体......
Episode: 10/1600, Reward: 25.00, Steps:25
Episode: 20/1600, Reward: 12.00, Steps:12
Episode: 30/1600, Reward: 20.00, Steps:20
Episode: 40/1600, Reward: 14.00, Steps:14
Episode: 50/1600, Reward: 24.00, Steps:24
Episode: 60/1600, Reward: 37.00, Steps:37
Episode: 70/1600, Reward: 40.00, Steps:40
Episode: 80/1600, Reward: 13.00, Steps:13
Episode: 90/1600, Reward: 23.00, Steps:23
Episode: 100/1600, Reward: 14.00, Steps:14
Episode: 110/1600, Reward: 25.00, Steps:25
Episode: 120/1600, Reward: 25.00, Steps:25
Episode: 130/1600, Reward: 22.00, Steps:22
Episode: 140/1600, Reward: 20.00, Steps:20
Episode: 150/1600, Reward: 94.00, Steps:94
Episode: 160/1600, Reward: 19.00, Steps:19
Episode: 170/1600, Reward: 25.00, Steps:25
Episode: 180/1600, Reward: 11.00, Steps:11
Episode: 190/1600, Reward: 36.00, Steps:36
Episode: 200/1600, Reward: 33.00, Steps:33
Episode: 210/1600, Reward: 20.00, Steps:20
Episode: 220/1600, Reward: 17.00, Steps:17
Episode: 230/1600, Reward: 12.00, Steps:12
Episode: 240/1600, Reward: 15.00, Steps:15
Episode: 250/1600, Reward: 31.00, Steps:31
Episode: 260/1600, Reward: 12.00, Steps:12
Episode: 270/1600, Reward: 27.00, Steps:27
Episode: 280/1600, Reward: 40.00, Steps:40
Episode: 290/1600, Reward: 20.00, Steps:20
Episode: 300/1600, Reward: 60.00, Steps:60
Episode: 310/1600, Reward: 38.00, Steps:38
Episode: 320/1600, Reward: 10.00, Steps:10
Episode: 330/1600, Reward: 23.00, Steps:23
Episode: 340/1600, Reward: 34.00, Steps:34
Episode: 350/1600, Reward: 55.00, Steps:55
Episode: 360/1600, Reward: 24.00, Steps:24
Episode: 370/1600, Reward: 45.00, Steps:45
Episode: 380/1600, Reward: 24.00, Steps:24
Episode: 390/1600, Reward: 32.00, Steps:32
Episode: 400/1600, Reward: 92.00, Steps:92
Episode: 410/1600, Reward: 53.00, Steps:53
Episode: 420/1600, Reward: 40.00, Steps:40
Episode: 430/1600, Reward: 77.00, Steps:77
Episode: 440/1600, Reward: 44.00, Steps:44
Episode: 450/1600, Reward: 32.00, Steps:32
Episode: 460/1600, Reward: 51.00, Steps:51
Episode: 470/1600, Reward: 91.00, Steps:91
Episode: 480/1600, Reward: 51.00, Steps:51
Episode: 490/1600, Reward: 66.00, Steps:66
Episode: 500/1600, Reward: 27.00, Steps:27
Episode: 510/1600, Reward: 66.00, Steps:66
Episode: 520/1600, Reward: 37.00, Steps:37
Episode: 530/1600, Reward: 29.00, Steps:29
Episode: 540/1600, Reward: 38.00, Steps:38
Episode: 550/1600, Reward: 82.00, Steps:82
Episode: 560/1600, Reward: 33.00, Steps:33
Episode: 570/1600, Reward: 79.00, Steps:79
Episode: 580/1600, Reward: 78.00, Steps:78
Episode: 590/1600, Reward: 26.00, Steps:26
Episode: 600/1600, Reward: 80.00, Steps:80
Episode: 610/1600, Reward: 85.00, Steps:85
Episode: 620/1600, Reward: 92.00, Steps:92
Episode: 630/1600, Reward: 35.00, Steps:35
Episode: 640/1600, Reward: 88.00, Steps:88
Episode: 650/1600, Reward: 157.00, Steps:157
Episode: 660/1600, Reward: 35.00, Steps:35
Episode: 670/1600, Reward: 60.00, Steps:60
Episode: 680/1600, Reward: 42.00, Steps:42
Episode: 690/1600, Reward: 55.00, Steps:55
Episode: 700/1600, Reward: 51.00, Steps:51
Episode: 710/1600, Reward: 65.00, Steps:65
Episode: 720/1600, Reward: 61.00, Steps:61
Episode: 730/1600, Reward: 125.00, Steps:125
Episode: 740/1600, Reward: 162.00, Steps:162
Episode: 750/1600, Reward: 19.00, Steps:19
Episode: 760/1600, Reward: 120.00, Steps:120
Episode: 770/1600, Reward: 34.00, Steps:34
Episode: 780/1600, Reward: 115.00, Steps:115
Episode: 790/1600, Reward: 66.00, Steps:66
Episode: 800/1600, Reward: 114.00, Steps:114
Episode: 810/1600, Reward: 130.00, Steps:130
Episode: 820/1600, Reward: 71.00, Steps:71
Episode: 830/1600, Reward: 52.00, Steps:52
Episode: 840/1600, Reward: 128.00, Steps:128
Episode: 850/1600, Reward: 24.00, Steps:24
Episode: 860/1600, Reward: 101.00, Steps:101
Episode: 870/1600, Reward: 39.00, Steps:39
Episode: 880/1600, Reward: 33.00, Steps:33
Episode: 890/1600, Reward: 111.00, Steps:111
Episode: 900/1600, Reward: 159.00, Steps:159
Episode: 910/1600, Reward: 131.00, Steps:131
Episode: 920/1600, Reward: 73.00, Steps:73
Episode: 930/1600, Reward: 54.00, Steps:54
Episode: 940/1600, Reward: 178.00, Steps:178
Episode: 950/1600, Reward: 200.00, Steps:200
Episode: 960/1600, Reward: 82.00, Steps:82
Episode: 970/1600, Reward: 63.00, Steps:63
Episode: 980/1600, Reward: 113.00, Steps:113
Episode: 990/1600, Reward: 68.00, Steps:68
Episode: 1000/1600, Reward: 151.00, Steps:151
Episode: 1010/1600, Reward: 160.00, Steps:160
Episode: 1020/1600, Reward: 135.00, Steps:135
Episode: 1030/1600, Reward: 135.00, Steps:135
Episode: 1040/1600, Reward: 200.00, Steps:200
Episode: 1050/1600, Reward: 200.00, Steps:200
Episode: 1060/1600, Reward: 141.00, Steps:141
Episode: 1070/1600, Reward: 101.00, Steps:101
Episode: 1080/1600, Reward: 200.00, Steps:200
Episode: 1090/1600, Reward: 191.00, Steps:191
Episode: 1100/1600, Reward: 200.00, Steps:200
Episode: 1110/1600, Reward: 89.00, Steps:89
Episode: 1120/1600, Reward: 198.00, Steps:198
Episode: 1130/1600, Reward: 162.00, Steps:162
Episode: 1140/1600, Reward: 175.00, Steps:175
Episode: 1150/1600, Reward: 149.00, Steps:149
Episode: 1160/1600, Reward: 110.00, Steps:110
Episode: 1170/1600, Reward: 200.00, Steps:200
Episode: 1180/1600, Reward: 129.00, Steps:129
Episode: 1190/1600, Reward: 161.00, Steps:161
Episode: 1200/1600, Reward: 137.00, Steps:137
Episode: 1210/1600, Reward: 200.00, Steps:200
Episode: 1220/1600, Reward: 200.00, Steps:200
Episode: 1230/1600, Reward: 200.00, Steps:200
Episode: 1240/1600, Reward: 190.00, Steps:190
Episode: 1250/1600, Reward: 166.00, Steps:166
Episode: 1260/1600, Reward: 163.00, Steps:163
Episode: 1270/1600, Reward: 127.00, Steps:127
Episode: 1280/1600, Reward: 137.00, Steps:137
Episode: 1290/1600, Reward: 60.00, Steps:60
Episode: 1300/1600, Reward: 156.00, Steps:156
Episode: 1310/1600, Reward: 97.00, Steps:97
Episode: 1320/1600, Reward: 115.00, Steps:115
Episode: 1330/1600, Reward: 200.00, Steps:200
Episode: 1340/1600, Reward: 200.00, Steps:200
Episode: 1350/1600, Reward: 200.00, Steps:200
Episode: 1360/1600, Reward: 200.00, Steps:200
Episode: 1370/1600, Reward: 200.00, Steps:200
Episode: 1380/1600, Reward: 200.00, Steps:200
Episode: 1390/1600, Reward: 200.00, Steps:200
Episode: 1400/1600, Reward: 154.00, Steps:154
Episode: 1410/1600, Reward: 174.00, Steps:174
Episode: 1420/1600, Reward: 114.00, Steps:114
Episode: 1430/1600, Reward: 157.00, Steps:157
Episode: 1440/1600, Reward: 191.00, Steps:191
Episode: 1450/1600, Reward: 65.00, Steps:65
Episode: 1460/1600, Reward: 200.00, Steps:200
Episode: 1470/1600, Reward: 200.00, Steps:200
Episode: 1480/1600, Reward: 155.00, Steps:155
Episode: 1490/1600, Reward: 107.00, Steps:107
Episode: 1500/1600, Reward: 27.00, Steps:27
Episode: 1510/1600, Reward: 200.00, Steps:200
Episode: 1520/1600, Reward: 200.00, Steps:200
Episode: 1530/1600, Reward: 132.00, Steps:132
Episode: 1540/1600, Reward: 142.00, Steps:142
Episode: 1550/1600, Reward: 99.00, Steps:99
Episode: 1560/1600, Reward: 171.00, Steps:171
Episode: 1570/1600, Reward: 172.00, Steps:172
Episode: 1580/1600, Reward: 147.00, Steps:147
Episode: 1590/1600, Reward: 182.00, Steps:182
Episode: 1600/1600, Reward: 200.00, Steps:200
训练结束 , 用时: 81.30708861351013 s
============================================================================================================================================================================================================================================================================================================
状态数: 4, 动作数: 2
开始测试智能体......
环境名: CartPole-v0, 算法名: A2C, Device: cpu
Episode: 1/20, Steps:161, Reward: 161.00
Episode: 2/20, Steps:150, Reward: 150.00
Episode: 3/20, Steps:93, Reward: 93.00
Episode: 4/20, Steps:169, Reward: 169.00
Episode: 5/20, Steps:200, Reward: 200.00
Episode: 6/20, Steps:168, Reward: 168.00
Episode: 7/20, Steps:25, Reward: 25.00
Episode: 8/20, Steps:171, Reward: 171.00
Episode: 9/20, Steps:200, Reward: 200.00
Episode: 10/20, Steps:200, Reward: 200.00
Episode: 11/20, Steps:188, Reward: 188.00
Episode: 12/20, Steps:200, Reward: 200.00
Episode: 13/20, Steps:87, Reward: 87.00
Episode: 14/20, Steps:200, Reward: 200.00
Episode: 15/20, Steps:200, Reward: 200.00
Episode: 16/20, Steps:200, Reward: 200.00
Episode: 17/20, Steps:200, Reward: 200.00
Episode: 18/20, Steps:200, Reward: 200.00
Episode: 19/20, Steps:198, Reward: 198.00
Episode: 20/20, Steps:200, Reward: 200.00
测试结束 , 用时: 28.915676593780518 s
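The script above uses two separate networks with two optimizers. The second version below packs the actor and the critic into a single ActorCriticSoftMax module trained with one optimizer and a single learning rate (--lr); apart from the number of training episodes (2000 instead of 1600) and a torch.no_grad() wrapper in predict_action, the surrounding training, testing, and environment-creation code is essentially unchanged.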
import argparse
import datetime
import time
from collections import deque
import torch.nn.functional as F
import gym
from torch import nn
# Change this to the path of your own RL_Utils.py file
from Python.ReinforcementLearning.EasyRL.RL_Utils import *
# Experience buffer object (cleared after every update, so it only ever holds the current rollout)
class PGReplay:
def __init__(self):
self.buffer = deque()
def push(self, transitions):
self.buffer.append(transitions)
def sample(self):
batch = list(self.buffer)
return zip(*batch)
def clear(self):
self.buffer.clear()
def __len__(self):
return len(self.buffer)
# Actor: for discrete actions, output a probability distribution over the actions (softmax); for continuous actions, output the action directly (sigmoid)
# Critic: outputs V_{\pi}(s)
# Actor and critic combined in a single module (note: the two branches below still use separate layers; they share only the module and one optimizer)
class ActorCriticSoftMax(nn.Module):
def __init__(self, input_dim, output_dim, actor_hidden_dim=256, critic_hidden_dim=256):
super(ActorCriticSoftMax, self).__init__()
self.critic_fc1 = nn.Linear(input_dim, critic_hidden_dim)
self.critic_fc2 = nn.Linear(critic_hidden_dim, 1)
self.actor_fc1 = nn.Linear(input_dim, actor_hidden_dim)
self.actor_fc2 = nn.Linear(actor_hidden_dim, output_dim)
def forward(self, state):
value = F.relu(self.critic_fc1(state))
value = self.critic_fc2(value)
policy_dist = F.relu(self.actor_fc1(state))
policy_dist = F.softmax(self.actor_fc2(policy_dist), dim=1)
return value, policy_dist
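# Although this module bundles the actor and the critic together, the two branches
# above do not share any layers; only the optimizer is shared. A truly weight-sharing
# variant (my own sketch, not the code used here) would route both heads through a
# common trunk, e.g.:
#   self.shared_fc = nn.Linear(input_dim, hidden_dim)
#   ...
#   h = F.relu(self.shared_fc(state))
#   return self.critic_head(h), F.softmax(self.actor_head(h), dim=1)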
# A2C agent
class A2C:
def __init__(self, models, memory, cfg):
self.n_actions = cfg['n_actions']
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.memory = memory
self.ac_net = models['ActorCritic'].to(self.device)
self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
def sample_action(self, state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
value, dist = self.ac_net(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action, value, dist
def predict_action(self, state):
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
value, dist = self.ac_net(state)
value = value.numpy().squeeze(0)[0] # shape(value) = (1,)
action = np.random.choice(self.n_actions, p=dist.numpy().squeeze(0)) # shape(p=(n_actions,1)
return action, value, dist
def update(self, next_state, entropy):
value_pool, log_prob_pool, reward_pool = self.memory.sample()
next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
next_value, _ = self.ac_net(next_state)
returns = np.zeros_like(reward_pool)
for t in reversed(range(len(reward_pool))):
next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
returns[t] = next_value
returns = torch.tensor(returns, device=self.device)
value_pool = torch.tensor(value_pool, device=self.device)
advantages = returns - value_pool
log_prob_pool = torch.stack(log_prob_pool)
actor_loss = (-log_prob_pool * advantages).mean()
critic_loss = 0.5 * advantages.pow(2).mean()
ac_loss = actor_loss + critic_loss + 0.001 * entropy
self.ac_optimizer.zero_grad()
ac_loss.backward()
self.ac_optimizer.step()
self.memory.clear()
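    # As in the two-network version above, value_pool and returns are detached here
    # as well, so critic_loss sends no gradient into the critic branch of ac_net;
    # only the actor branch (via log_prob_pool) is trained by this loss.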
def save_model(self, path):
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.ac_net.state_dict(), f"{path}/a2c_checkpoint.pt")
def load_model(self, path):
self.ac_net.load_state_dict(torch.load(f"{path}/a2c_checkpoint.pt"))
# Training function
def train(arg_dict, env, agent):
    # Start timing
startTime = time.time()
print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
print("开始训练智能体......")
rewards = []
steps = []
for i_ep in range(arg_dict['train_eps']):
ep_reward = 0
ep_step = 0
ep_entropy = 0
state = env.reset()
        # Collect up to ep_max_steps transitions per episode before updating the model
for _ in range(arg_dict['ep_max_steps']):
            # Render
if arg_dict['train_render']:
env.render()
            # Sample an action (exploration)
action, value, dist = agent.sample_action(state)
            # Take the action and get the next state and reward (one transition)
next_state, reward, done, _ = env.step(action)
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
            # Store the transition
agent.memory.push((value, log_prob, reward))
            # Move to the next state
state = next_state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
        # Update the agent's parameters
agent.update(next_state, ep_entropy)
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep + 1}/{arg_dict["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print('训练结束 , 用时: ' + str(time.time() - startTime) + " s")
    # Close the environment
env.close()
return {'episodes': range(len(rewards)), 'rewards': rewards}
# Test function
def test(arg_dict, env, agent):
startTime = time.time()
print("开始测试智能体......")
print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
rewards = []
steps = []
for i_ep in range(arg_dict['test_eps']):
ep_reward = 0
ep_step = 0
state = env.reset()
for _ in range(arg_dict['ep_max_steps']):
            # Render
if arg_dict['test_render']:
env.render()
            # Predict an action
action, _, _ = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep + 1}/{arg_dict['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("测试结束 , 用时: " + str(time.time() - startTime) + " s")
env.close()
return {'episodes': range(len(rewards)), 'rewards': rewards}
# Create the environment and the agent
def create_env_agent(arg_dict):
    # Create the environment
env = gym.make(arg_dict['env_name'])
    # Set the random seeds
all_seed(env, seed=arg_dict["seed"])
    # Get the number of states / state dimension
try:
n_states = env.observation_space.n
except AttributeError:
n_states = env.observation_space.shape[0]
    # Get the number of actions
n_actions = env.action_space.n
print(f"状态数: {n_states}, 动作数: {n_actions}")
    # Add the state and action dimensions to the parameter dictionary
arg_dict.update({"n_states": n_states, "n_actions": n_actions})
    # Instantiate the agent
models = {
'ActorCritic': ActorCriticSoftMax(arg_dict['n_states'], arg_dict['n_actions'],
actor_hidden_dim=arg_dict['actor_hidden_dim'],
critic_hidden_dim=arg_dict['critic_hidden_dim'])}
    # Experience buffer
memory = PGReplay()
agent = A2C(models, memory, arg_dict)
    # Return the environment and the agent
return env, agent
if __name__ == '__main__':
    # Prevent the error: OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
    # Get the current path
curr_path = os.path.dirname(os.path.abspath(__file__))
    # Get the current time
curr_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    # Parameter settings
parser = argparse.ArgumentParser(description="hyper parameters")
parser.add_argument('--algo_name', default='A2C', type=str, help="name of algorithm")
parser.add_argument('--env_name', default='CartPole-v0', type=str, help="name of environment")
parser.add_argument('--train_eps', default=2000, type=int, help="episodes of training")
parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
parser.add_argument('--ep_max_steps', default=100000, type=int,
help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma', default=0.99, type=float, help="discounted factor")
parser.add_argument('--lr', default=3e-4, type=float, help="learning rate")
parser.add_argument('--actor_hidden_dim', default=256, type=int)
parser.add_argument('--critic_hidden_dim', default=256, type=int)
parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
parser.add_argument('--seed', default=520, type=int, help="seed")
parser.add_argument('--show_fig', default=False, type=bool, help="if show figure or not")
parser.add_argument('--save_fig', default=True, type=bool, help="if save figure or not")
parser.add_argument('--train_render', default=False, type=bool,
help="Whether to render the environment during training")
parser.add_argument('--test_render', default=True, type=bool,
help="Whether to render the environment during testing")
args = parser.parse_args()
default_args = {'result_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
    # Convert the parsed arguments into a dictionary (dict)
arg_dict = {**vars(args), **default_args}
print("算法参数字典:", arg_dict)
    # Create the environment and the agent
env, agent = create_env_agent(arg_dict)
    # Pass in the parameters, environment, and agent, then start training
res_dic = train(arg_dict, env, agent)
print("算法返回结果字典:", res_dic)
    # Save the relevant information
agent.save_model(path=arg_dict['model_path'])
save_args(arg_dict, path=arg_dict['result_path'])
save_results(res_dic, tag='train', path=arg_dict['result_path'])
plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="train")
# =================================================================================================
    # Create a fresh environment and agent for testing
print("=" * 300)
env, agent = create_env_agent(arg_dict)
    # Load the saved agent
agent.load_model(path=arg_dict['model_path'])
res_dic = test(arg_dict, env, agent)
save_results(res_dic, tag='test', path=arg_dict['result_path'])
plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="test")
Sample run output
Since some of the output is quite long, only part of it is shown below.
状态数: 4, 动作数: 2
环境名: CartPole-v0, 算法名: A2C, Device: cpu
开始训练智能体......
Episode: 10/2000, Reward: 12.00, Steps:12
Episode: 20/2000, Reward: 21.00, Steps:21
Episode: 30/2000, Reward: 13.00, Steps:13
Episode: 40/2000, Reward: 14.00, Steps:14
Episode: 50/2000, Reward: 19.00, Steps:19
Episode: 60/2000, Reward: 22.00, Steps:22
Episode: 70/2000, Reward: 50.00, Steps:50
Episode: 80/2000, Reward: 19.00, Steps:19
Episode: 90/2000, Reward: 18.00, Steps:18
Episode: 100/2000, Reward: 28.00, Steps:28
Episode: 110/2000, Reward: 20.00, Steps:20
Episode: 120/2000, Reward: 28.00, Steps:28
Episode: 130/2000, Reward: 76.00, Steps:76
Episode: 140/2000, Reward: 22.00, Steps:22
Episode: 150/2000, Reward: 70.00, Steps:70
Episode: 160/2000, Reward: 20.00, Steps:20
Episode: 170/2000, Reward: 85.00, Steps:85
Episode: 180/2000, Reward: 17.00, Steps:17
Episode: 190/2000, Reward: 49.00, Steps:49
Episode: 200/2000, Reward: 21.00, Steps:21
Episode: 210/2000, Reward: 65.00, Steps:65
Episode: 220/2000, Reward: 54.00, Steps:54
Episode: 230/2000, Reward: 85.00, Steps:85
Episode: 240/2000, Reward: 48.00, Steps:48
Episode: 250/2000, Reward: 22.00, Steps:22
Episode: 260/2000, Reward: 34.00, Steps:34
Episode: 270/2000, Reward: 22.00, Steps:22
Episode: 280/2000, Reward: 29.00, Steps:29
Episode: 290/2000, Reward: 77.00, Steps:77
Episode: 300/2000, Reward: 30.00, Steps:30
Episode: 310/2000, Reward: 115.00, Steps:115
Episode: 320/2000, Reward: 62.00, Steps:62
Episode: 330/2000, Reward: 45.00, Steps:45
Episode: 340/2000, Reward: 102.00, Steps:102
Episode: 350/2000, Reward: 93.00, Steps:93
Episode: 360/2000, Reward: 27.00, Steps:27
Episode: 370/2000, Reward: 31.00, Steps:31
Episode: 380/2000, Reward: 27.00, Steps:27
Episode: 390/2000, Reward: 30.00, Steps:30
Episode: 400/2000, Reward: 30.00, Steps:30
Episode: 410/2000, Reward: 61.00, Steps:61
Episode: 420/2000, Reward: 61.00, Steps:61
Episode: 430/2000, Reward: 56.00, Steps:56
Episode: 440/2000, Reward: 120.00, Steps:120
Episode: 450/2000, Reward: 87.00, Steps:87
Episode: 460/2000, Reward: 66.00, Steps:66
Episode: 470/2000, Reward: 30.00, Steps:30
Episode: 480/2000, Reward: 65.00, Steps:65
Episode: 490/2000, Reward: 72.00, Steps:72
Episode: 500/2000, Reward: 64.00, Steps:64
Episode: 510/2000, Reward: 93.00, Steps:93
Episode: 520/2000, Reward: 159.00, Steps:159
Episode: 530/2000, Reward: 21.00, Steps:21
Episode: 540/2000, Reward: 31.00, Steps:31
Episode: 550/2000, Reward: 126.00, Steps:126
Episode: 560/2000, Reward: 176.00, Steps:176
Episode: 570/2000, Reward: 116.00, Steps:116
Episode: 580/2000, Reward: 131.00, Steps:131
Episode: 590/2000, Reward: 156.00, Steps:156
Episode: 600/2000, Reward: 158.00, Steps:158
Episode: 610/2000, Reward: 125.00, Steps:125
Episode: 620/2000, Reward: 39.00, Steps:39
Episode: 630/2000, Reward: 52.00, Steps:52
Episode: 640/2000, Reward: 67.00, Steps:67
Episode: 650/2000, Reward: 110.00, Steps:110
Episode: 660/2000, Reward: 95.00, Steps:95
Episode: 670/2000, Reward: 33.00, Steps:33
Episode: 680/2000, Reward: 188.00, Steps:188
Episode: 690/2000, Reward: 29.00, Steps:29
Episode: 700/2000, Reward: 58.00, Steps:58
Episode: 710/2000, Reward: 60.00, Steps:60
Episode: 720/2000, Reward: 131.00, Steps:131
Episode: 730/2000, Reward: 132.00, Steps:132
Episode: 740/2000, Reward: 169.00, Steps:169
Episode: 750/2000, Reward: 189.00, Steps:189
Episode: 760/2000, Reward: 109.00, Steps:109
Episode: 770/2000, Reward: 70.00, Steps:70
Episode: 780/2000, Reward: 200.00, Steps:200
Episode: 790/2000, Reward: 157.00, Steps:157
Episode: 800/2000, Reward: 178.00, Steps:178
Episode: 810/2000, Reward: 181.00, Steps:181
Episode: 820/2000, Reward: 112.00, Steps:112
Episode: 830/2000, Reward: 28.00, Steps:28
Episode: 840/2000, Reward: 184.00, Steps:184
Episode: 850/2000, Reward: 80.00, Steps:80
Episode: 860/2000, Reward: 25.00, Steps:25
Episode: 870/2000, Reward: 148.00, Steps:148
Episode: 880/2000, Reward: 111.00, Steps:111
Episode: 890/2000, Reward: 121.00, Steps:121
Episode: 900/2000, Reward: 130.00, Steps:130
Episode: 910/2000, Reward: 190.00, Steps:190
Episode: 920/2000, Reward: 124.00, Steps:124
Episode: 930/2000, Reward: 140.00, Steps:140
Episode: 940/2000, Reward: 200.00, Steps:200
Episode: 950/2000, Reward: 86.00, Steps:86
Episode: 960/2000, Reward: 82.00, Steps:82
Episode: 970/2000, Reward: 186.00, Steps:186
Episode: 980/2000, Reward: 66.00, Steps:66
Episode: 990/2000, Reward: 200.00, Steps:200
Episode: 1000/2000, Reward: 193.00, Steps:193
Episode: 1010/2000, Reward: 200.00, Steps:200
Episode: 1020/2000, Reward: 157.00, Steps:157
Episode: 1030/2000, Reward: 150.00, Steps:150
Episode: 1040/2000, Reward: 200.00, Steps:200
Episode: 1050/2000, Reward: 200.00, Steps:200
Episode: 1060/2000, Reward: 115.00, Steps:115
Episode: 1070/2000, Reward: 108.00, Steps:108
Episode: 1080/2000, Reward: 189.00, Steps:189
Episode: 1090/2000, Reward: 126.00, Steps:126
Episode: 1100/2000, Reward: 200.00, Steps:200
Episode: 1110/2000, Reward: 200.00, Steps:200
Episode: 1120/2000, Reward: 200.00, Steps:200
Episode: 1130/2000, Reward: 200.00, Steps:200
Episode: 1140/2000, Reward: 200.00, Steps:200
Episode: 1150/2000, Reward: 200.00, Steps:200
Episode: 1160/2000, Reward: 131.00, Steps:131
Episode: 1170/2000, Reward: 191.00, Steps:191
Episode: 1180/2000, Reward: 200.00, Steps:200
Episode: 1190/2000, Reward: 200.00, Steps:200
Episode: 1200/2000, Reward: 171.00, Steps:171
Episode: 1210/2000, Reward: 200.00, Steps:200
Episode: 1220/2000, Reward: 180.00, Steps:180
Episode: 1230/2000, Reward: 127.00, Steps:127
Episode: 1240/2000, Reward: 94.00, Steps:94
Episode: 1250/2000, Reward: 113.00, Steps:113
Episode: 1260/2000, Reward: 150.00, Steps:150
Episode: 1270/2000, Reward: 200.00, Steps:200
Episode: 1280/2000, Reward: 148.00, Steps:148
Episode: 1290/2000, Reward: 111.00, Steps:111
Episode: 1300/2000, Reward: 200.00, Steps:200
Episode: 1310/2000, Reward: 77.00, Steps:77
Episode: 1320/2000, Reward: 158.00, Steps:158
Episode: 1330/2000, Reward: 200.00, Steps:200
Episode: 1340/2000, Reward: 180.00, Steps:180
Episode: 1350/2000, Reward: 142.00, Steps:142
Episode: 1360/2000, Reward: 142.00, Steps:142
Episode: 1370/2000, Reward: 147.00, Steps:147
Episode: 1380/2000, Reward: 196.00, Steps:196
Episode: 1390/2000, Reward: 200.00, Steps:200
Episode: 1400/2000, Reward: 163.00, Steps:163
Episode: 1410/2000, Reward: 159.00, Steps:159
Episode: 1420/2000, Reward: 170.00, Steps:170
Episode: 1430/2000, Reward: 200.00, Steps:200
Episode: 1440/2000, Reward: 200.00, Steps:200
Episode: 1450/2000, Reward: 200.00, Steps:200
Episode: 1460/2000, Reward: 200.00, Steps:200
Episode: 1470/2000, Reward: 200.00, Steps:200
Episode: 1480/2000, Reward: 200.00, Steps:200
Episode: 1490/2000, Reward: 200.00, Steps:200
Episode: 1500/2000, Reward: 200.00, Steps:200
Episode: 1510/2000, Reward: 200.00, Steps:200
Episode: 1520/2000, Reward: 75.00, Steps:75
Episode: 1530/2000, Reward: 200.00, Steps:200
Episode: 1540/2000, Reward: 200.00, Steps:200
Episode: 1550/2000, Reward: 200.00, Steps:200
Episode: 1560/2000, Reward: 189.00, Steps:189
Episode: 1570/2000, Reward: 194.00, Steps:194
Episode: 1580/2000, Reward: 200.00, Steps:200
Episode: 1590/2000, Reward: 164.00, Steps:164
Episode: 1600/2000, Reward: 200.00, Steps:200
Episode: 1610/2000, Reward: 200.00, Steps:200
Episode: 1620/2000, Reward: 161.00, Steps:161
Episode: 1630/2000, Reward: 200.00, Steps:200
Episode: 1640/2000, Reward: 135.00, Steps:135
Episode: 1650/2000, Reward: 159.00, Steps:159
Episode: 1660/2000, Reward: 115.00, Steps:115
Episode: 1670/2000, Reward: 197.00, Steps:197
Episode: 1680/2000, Reward: 200.00, Steps:200
Episode: 1690/2000, Reward: 200.00, Steps:200
Episode: 1700/2000, Reward: 157.00, Steps:157
Episode: 1710/2000, Reward: 190.00, Steps:190
Episode: 1720/2000, Reward: 127.00, Steps:127
Episode: 1730/2000, Reward: 64.00, Steps:64
Episode: 1740/2000, Reward: 178.00, Steps:178
Episode: 1750/2000, Reward: 130.00, Steps:130
Episode: 1760/2000, Reward: 142.00, Steps:142
Episode: 1770/2000, Reward: 108.00, Steps:108
Episode: 1780/2000, Reward: 99.00, Steps:99
Episode: 1790/2000, Reward: 130.00, Steps:130
Episode: 1800/2000, Reward: 147.00, Steps:147
Episode: 1810/2000, Reward: 200.00, Steps:200
Episode: 1820/2000, Reward: 60.00, Steps:60
Episode: 1830/2000, Reward: 200.00, Steps:200
Episode: 1840/2000, Reward: 93.00, Steps:93
Episode: 1850/2000, Reward: 163.00, Steps:163
Episode: 1860/2000, Reward: 189.00, Steps:189
Episode: 1870/2000, Reward: 200.00, Steps:200
Episode: 1880/2000, Reward: 200.00, Steps:200
Episode: 1890/2000, Reward: 200.00, Steps:200
Episode: 1900/2000, Reward: 200.00, Steps:200
Episode: 1910/2000, Reward: 200.00, Steps:200
Episode: 1920/2000, Reward: 200.00, Steps:200
Episode: 1930/2000, Reward: 200.00, Steps:200
Episode: 1940/2000, Reward: 102.00, Steps:102
Episode: 1950/2000, Reward: 106.00, Steps:106
Episode: 1960/2000, Reward: 200.00, Steps:200
Episode: 1970/2000, Reward: 200.00, Steps:200
Episode: 1980/2000, Reward: 200.00, Steps:200
Episode: 1990/2000, Reward: 200.00, Steps:200
Episode: 2000/2000, Reward: 200.00, Steps:200
训练结束 , 用时: 129.54206490516663 s
状态数: 4, 动作数: 2
开始测试智能体......
环境名: CartPole-v0, 算法名: A2C, Device: cpu
Episode: 1/20, Steps:130, Reward: 130.00
Episode: 2/20, Steps:200, Reward: 200.00
Episode: 3/20, Steps:200, Reward: 200.00
Episode: 4/20, Steps:200, Reward: 200.00
Episode: 5/20, Steps:200, Reward: 200.00
Episode: 6/20, Steps:200, Reward: 200.00
Episode: 7/20, Steps:87, Reward: 87.00
Episode: 8/20, Steps:200, Reward: 200.00
Episode: 9/20, Steps:68, Reward: 68.00
Episode: 10/20, Steps:200, Reward: 200.00
Episode: 11/20, Steps:62, Reward: 62.00
Episode: 12/20, Steps:200, Reward: 200.00
Episode: 13/20, Steps:200, Reward: 200.00
Episode: 14/20, Steps:200, Reward: 200.00
Episode: 15/20, Steps:200, Reward: 200.00
Episode: 16/20, Steps:200, Reward: 200.00
Episode: 17/20, Steps:200, Reward: 200.00
Episode: 18/20, Steps:200, Reward: 200.00
Episode: 19/20, Steps:200, Reward: 200.00
Episode: 20/20, Steps:200, Reward: 200.00
测试结束 , 用时: 27.40801215171814 s
If you find the rendering too time-consuming, you can turn it off in the settings.
Conversely, if you want to watch the training process, you can enable rendering for training as well.
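One caveat when toggling these options from the command line: the flags are declared with type=bool, and argparse converts any non-empty string (including "False") to True, so --test_render False will not actually disable rendering. Editing the default values in the script always works; alternatively, a small helper along these lines (my own workaround, not part of the original script) can be passed as the argument type:
def str2bool(v):
    # Interpret the usual textual spellings of booleans for argparse.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected')

# e.g. parser.add_argument('--test_render', default=True, type=str2bool)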