在第七篇笔记(https://blog.csdn.net/hhy_csdn/article/details/106435472?spm=1001.2014.3001.5501)中,实现了Vanilla Policy Gradient算法建模action的高斯分布的情况,用以实现连续动作空间任务的控制。
但是高斯分布定义的action有时不能适应全部情况,例如action space有明确取值区间,高斯采样超出区间,就必须clip,毫无疑问会给policy gradient的估计产生影响。
采用Beta分布可以解决这个问题。Beta分布是定义在 $[0,1]$ 区间上的连续分布,受 $a>0$、$b>0$ 两个参数的控制。有了 $[0,1]$ 区间上的连续分布,我们就能通过线性变换,得到任意封闭区间上的连续分布了。
下面是实验代码。
import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
from configuration import config
from CartPoleContinuous import CartPoleContinuousEnv
import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
def _str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``0``.

    ``type=bool`` would turn every non-empty string (including ``"False"``)
    into ``True``, so boolean options need an explicit parser.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


# ---- Command-line options -------------------------------------------------
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
# Any name other than 'CartPoleContinuous' is handed to gym.make().
parser.add_argument('--env_name', type=str, default='CartPoleContinuous')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
# NOTE(review): --exploration_end and --render are parsed but not read
# anywhere in the visible part of this script.
parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                    help='number of episodes with noise (default: 100)')
# Random seed; the default comes from the project configuration.
parser.add_argument('--seed', type=int, default=config.seed, metavar='N',
                    help='random seed (default: %(default)s)')
# Maximum number of time steps in one episode.
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',
                    help='max episode length (default: 1000)')
# Number of training episodes.
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',
                    help='number of episodes (default: 2000)')
# Number of units in the hidden layer of the policy network.
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='number of hidden units (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
# Accepts "--display True/False" as before, and also a bare "--display".
parser.add_argument('--display', type=_str2bool, nargs='?', const=True,
                    default=False, help='display or not')
args = parser.parse_args()

# ---- Environment ----------------------------------------------------------
env_name = args.env_name
if env_name == 'CartPoleContinuous':
    env = CartPoleContinuousEnv()
else:
    env = gym.make(env_name)

if args.display:
    # Wrap the environment in gym's Monitor, writing under ./result/.
    env = wrappers.Monitor(env, './result/policygradient/{}-experiment'.format(env_name), force=True)

# Gym, PyTorch and NumPy each keep their own RNG, so all three are seeded.
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
class Policy(nn.Module):
    """Policy network that outputs the two parameters of a Beta distribution.

    A shared ReLU hidden layer feeds two linear heads. Each head is passed
    through softplus so that both outputs are positive, giving one
    (a, b) pair per action dimension.

    Args:
        hidden_size: number of units in the hidden layer.
        num_inputs: size of the state vector.
        action_space: object whose ``shape[0]`` is the action dimension.
    """

    def __init__(self, hidden_size, num_inputs, action_space):
        super().__init__()
        self.action_space = action_space
        action_dim = action_space.shape[0]
        # Shared trunk followed by one head per Beta parameter.
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, action_dim)
        self.linear2_ = nn.Linear(hidden_size, action_dim)

    def forward(self, inputs):
        """Return the positive Beta parameters ``(a, b)`` for ``inputs``."""
        hidden = F.relu(self.linear1(inputs))
        a, b = (F.softplus(head(hidden)) for head in (self.linear2, self.linear2_))
        return a, b
class REINFORCE:
    """Vanilla policy-gradient (REINFORCE) agent with a Beta-distributed policy.

    Args:
        hidden_size: hidden-layer width passed to ``Policy``.
        num_inputs: size of the state vector passed to ``Policy``.
        action_space: action space passed to ``Policy``.
        lr: Adam learning rate.
        entropy_coef: weight of the entropy bonus in the loss.
        max_grad_norm: maximum L2 norm used for gradient clipping.
    """

    def __init__(self, hidden_size, num_inputs, action_space,
                 lr=1e-2, entropy_coef=0.001, max_grad_norm=10):
        self.action_space = action_space
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.model.train()

    def select_action(self, state):
        """Sample one action for ``state``.

        Returns:
            A tuple ``(action, log_prob, entropy)``: ``action`` is a Python
            float in [-1, 1]; ``log_prob`` and ``entropy`` are tensors that
            stay attached to the policy network's graph.
        """
        a, b = self.model(state)
        beta = torch.distributions.Beta(a, b)
        sample = beta.sample()
        # Linearly map the Beta support [0, 1] onto [-1, 1]. ``.item()``
        # requires a single-element sample, i.e. a one-dimensional action.
        action = (sample * 2 - 1).item()
        # The log-probability is taken at the raw sample; the affine map
        # above is not included in it.
        return action, beta.log_prob(sample), beta.entropy()

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        """Run one policy-gradient step from a finished episode.

        Args:
            rewards: per-step rewards of the episode.
            log_probs: per-step log-probabilities from ``select_action``.
            entropies: per-step entropies from ``select_action``.
            gamma: discount factor.

        An episode with no steps is ignored instead of dividing by zero.
        """
        num_steps = len(rewards)
        if num_steps == 0:
            return

        # A 0-dim return broadcasts against log-probs of any shape.
        discounted_return = torch.zeros(())
        loss = 0
        steps = list(zip(rewards, log_probs, entropies))
        # Walk the episode backwards so the discounted return accumulates.
        for reward, log_prob, entropy in reversed(steps):
            discounted_return = gamma * discounted_return + reward
            loss = (loss
                    - (log_prob * discounted_return).sum()
                    - (self.entropy_coef * entropy).sum())
        loss = loss / num_steps

        self.optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient L2 norm to ``max_grad_norm`` (default 10).
        utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

# Checkpoint directory. makedirs also creates the './results' parent and
# does not fail when the directory already exists.
ckpt_dir = './results/ckpt_' + env_name
os.makedirs(ckpt_dir, exist_ok=True)

for i_episode in range(args.num_episodes):
    # States are float32 tensors with a leading batch dimension of 1.
    state = torch.as_tensor(env.reset(), dtype=torch.float32).unsqueeze(0)
    entropies = []
    log_probs = []
    rewards = []
    for t in range(args.num_steps):  # an episode lasts at most num_steps steps
        action, log_prob, entropy = agent.select_action(state)
        # Build the action with the space's own dtype.
        # NOTE(review): some gym releases' Box.contains is believed to reject
        # a float64 array for a float32 space - confirm for the gym in use.
        env_action = np.array([action], dtype=env.action_space.dtype)
        next_state, reward, done, _ = env.step(env_action)
        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0)
        if done:
            break

    # Update the policy once the episode has finished.
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode % args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(),
                   os.path.join(ckpt_dir, 'reinforce-' + str(i_episode) + '.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))

env.close()
测试用的连续动作空间环境CartPoleContinuous
import math
import numpy as np
from gym import spaces, logger
from gym.envs.classic_control import CartPoleEnv
class CartPoleContinuousEnv(CartPoleEnv):
    """CartPole variant controlled by a continuous one-dimensional action.

    The action is a length-1 array bounded by ``min_action`` and
    ``max_action``; its value scales ``force_mag`` to give the force applied
    to the cart. Per the original author's note, the rest of ``step`` mirrors
    gym's own CartPole step function.
    """

    def __init__(self):
        super().__init__()
        # The action encodes both direction and relative magnitude of the force.
        self.min_action = np.float32(-1.0)
        self.max_action = np.float32(1.0)
        self.action_space = spaces.Box(
            low=self.min_action,
            high=self.max_action,
            shape=(1,),
        )

    def step(self, action):
        """Advance the simulation by one time step.

        Args:
            action: array accepted by ``self.action_space``.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            new state as an array and ``info`` is an empty dict.
        """
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))

        # Continuous control: the force is proportional to the action value
        # (the discrete parent picks +force_mag or -force_mag instead).
        push = self.force_mag * action[0]

        pos, vel, angle, ang_vel = self.state
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)

        temp = (push + self.polemass_length * ang_vel * ang_vel * sin_a) / self.total_mass
        denom = self.length * (4.0 / 3.0 - self.masspole * cos_a * cos_a / self.total_mass)
        ang_acc = (self.gravity * sin_a - cos_a * temp) / denom
        lin_acc = temp - self.polemass_length * ang_acc * cos_a / self.total_mass

        dt = self.tau
        if self.kinematics_integrator == 'euler':
            # Explicit Euler: positions advance with the old velocities.
            pos, vel = pos + dt * vel, vel + dt * lin_acc
            angle, ang_vel = angle + dt * ang_vel, ang_vel + dt * ang_acc
        else:
            # Semi-implicit Euler: velocities first, then positions use them.
            vel = vel + dt * lin_acc
            pos = pos + dt * vel
            ang_vel = ang_vel + dt * ang_acc
            angle = angle + dt * ang_vel
        self.state = (pos, vel, angle, ang_vel)

        # The episode ends when the cart or the pole leaves its allowed range.
        limit_checks = (
            pos < -self.x_threshold,
            pos > self.x_threshold,
            angle < -self.theta_threshold_radians,
            angle > self.theta_threshold_radians,
        )
        done = any(limit_checks)

        reward = 1.0
        if done:
            if self.steps_beyond_done is None:
                # Pole just fell: this final step still earns the reward.
                self.steps_beyond_done = 0
            else:
                if self.steps_beyond_done == 0:
                    logger.warn("You are calling 'step()' even though this environment has already returned done = True."
                                " You should always call 'reset()' once you receive 'done = True' -- any further steps "
                                "are undefined behavior.")
                self.steps_beyond_done += 1
                reward = 0.0

        return np.array(self.state), reward, done, {}