An earlier post, 强化学习经典算法笔记(七):策略梯度算法Policy Gradient (Classic RL Algorithm Notes (7): Policy Gradient), covered the Policy Gradient algorithm for continuous action spaces. I recently needed a discrete-action version, so I wrote a PG implementation for CartPole-v1. Compared with the original code, the main changes are in the select_action and update_parameters functions: in the continuous version the action is a two-dimensional continuous vector, so the action log-probability and the action entropy are also two-dimensional vectors, whereas in the CartPole environment the action is a single discrete choice, which requires some adjustments.
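To make the difference concrete, here is a minimal sketch (not taken from either version of the code; the tensor shapes are illustrative) contrasting how an action, its log-probability, and its entropy are obtained in the continuous and discrete cases:

import torch
from torch.distributions import Normal, Categorical

# Continuous case: the policy outputs a mean and std per action dimension;
# log_prob and entropy are vectors with one entry per action dimension.
mu, sigma = torch.zeros(2), torch.ones(2)          # illustrative 2-D action
cont_dist = Normal(mu, sigma)
a_cont = cont_dist.sample()                        # shape [2]
logp_cont = cont_dist.log_prob(a_cont)             # shape [2]
ent_cont = cont_dist.entropy()                     # shape [2]

# Discrete case (CartPole): the policy outputs a softmax over a_dim actions;
# log_prob and entropy are scalars per state.
probs = torch.softmax(torch.zeros(1, 2), dim=-1)   # illustrative [1, a_dim] probabilities
disc_dist = Categorical(probs=probs)
a_disc = disc_dist.sample()                        # shape [1]
logp_disc = disc_dist.log_prob(a_disc)             # shape [1]
ent_disc = disc_dist.entropy()                     # shape [1]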
Also, be warned that vanilla PG is genuinely unstable (its learning curve rivals the stock market), and its performance depends heavily on hyperparameter tuning.
import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils
from torch.distributions import Categorical
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='CartPole-v1')
parser.add_argument('--gamma', type=float, default=0.98, metavar='G',
                    help='discount factor for reward (default: 0.98)')
parser.add_argument('--seed', type=int, default=1234, metavar='N',           # random seed
                    help='random seed (default: 1234)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',      # maximum number of steps per episode
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=1000, metavar='N',   # number of training episodes
                    help='number of episodes (default: 1000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',     # number of hidden units in the policy network
                    help='number of hidden units (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
parser.add_argument('--display', action='store_true',
                    help='display or not')
args = parser.parse_args()
env_name = args.env_name                      # environment name
env = gym.make(env_name)                      # create the environment
env = env.unwrapped
s_dim = env.observation_space.shape[0]        # state dimension
a_dim = env.action_space.n                    # number of discrete actions

if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)                           # Gym, NumPy and PyTorch all need their seeds set
torch.manual_seed(args.seed)
np.random.seed(args.seed)
class Policy(nn.Module):                              # policy parameterized by a neural network
    def __init__(self, hidden_size, s_dim, a_dim):
        super(Policy, self).__init__()
        self.linear1 = nn.Linear(s_dim, hidden_size)  # hidden layer
        self.linear2 = nn.Linear(hidden_size, a_dim)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        p = F.softmax(self.linear2(x), dim=-1)        # probabilities over the discrete actions
        return p
class REINFORCE:
    def __init__(self, hidden_size, s_dim, a_dim):
        self.model = Policy(hidden_size, s_dim, a_dim)                  # create the policy network
        # self.model = self.model.cuda()                                # GPU version
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)   # optimizer
        self.model.train()
        self.pi = Variable(torch.FloatTensor([math.pi]))  # .cuda()     # pi constant (only needed by the continuous version)

    def select_action(self, state):
        # mu, sigma_sq = self.model(Variable(state).cuda())
        prob = self.model(Variable(state))         # action probabilities, shape [1, a_dim]
        dist = Categorical(probs=prob)
        action = dist.sample()                     # sample a discrete action
        log_prob = prob[0, action.item()].log()    # log-probability of the sampled action
        # log_prob = prob.log()
        entropy = dist.entropy()                   # policy entropy, shape [1]
        return action, log_prob, entropy
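    # Note: log_prob above is equivalent to dist.log_prob(action), which is the more
    # idiomatic way to get the log-probability from a Categorical distribution.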
    def update_parameters(self, rewards, log_probs, entropies, gamma):   # REINFORCE update
        R = torch.zeros(1)                                    # discounted return, accumulated backwards
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]                        # discounted return from step i
            # loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
            loss = loss - (log_probs[i] * Variable(R)) - 0.005 * entropies[i][0]
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm_(self.model.parameters(), 2)     # gradient clipping: max L2 norm of the gradient = 2
        self.optimizer.step()
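# The loop in update_parameters implements the REINFORCE objective with a small entropy bonus:
#     loss = -(1/T) * sum_t [ log pi(a_t | s_t) * R_t + 0.005 * H(pi(. | s_t)) ]
# where R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... is the discounted return from step t.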
agent = REINFORCE(args.hidden_size, s_dim, a_dim)

dir = 'ckpt_' + env_name                      # checkpoint directory
if not os.path.exists(dir):
    os.mkdir(dir)

log_reward = []                               # per-episode return
log_smooth = []                               # exponentially smoothed return

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    for t in range(args.num_steps):           # an episode lasts at most num_steps steps
        action, log_prob, entropy = agent.select_action(state)
        action = action.cpu()
        next_state, reward, done, _ = env.step(action.numpy()[0])
        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])
        if done:
            break
    # episode finished, update the policy
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode % args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-' + str(i_episode) + '.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))
    log_reward.append(np.sum(rewards))
    if i_episode == 0:
        log_smooth.append(log_reward[-1])
    else:
        log_smooth.append(log_smooth[-1] * 0.99 + 0.01 * np.sum(rewards))

    plt.clf()                                 # clear the figure so the curves are redrawn rather than stacked
    plt.plot(log_reward)
    plt.plot(log_smooth)
    plt.pause(1e-5)

env.close()
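As a usage example, here is a minimal sketch (not part of the original post) of how one of the saved checkpoints could be loaded and evaluated with a greedy policy. It assumes the same Policy class and the old gym reset/step API used above, and the checkpoint filename is just one of those written during training:

# Hypothetical evaluation snippet; reuses Policy, args, s_dim, a_dim, dir and env_name from above.
eval_env = gym.make(env_name)                 # fresh env, with the default time limit
policy = Policy(args.hidden_size, s_dim, a_dim)
policy.load_state_dict(torch.load(os.path.join(dir, 'reinforce-0.pkl')))   # pick any saved checkpoint
policy.eval()

state = torch.Tensor([eval_env.reset()])
total_reward, done = 0.0, False
while not done:
    with torch.no_grad():
        prob = policy(state)
    action = prob.argmax(dim=-1).item()       # act greedily instead of sampling
    next_state, reward, done, _ = eval_env.step(action)
    total_reward += reward
    state = torch.Tensor([next_state])
print('evaluation return:', total_reward)
eval_env.close()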