Imagine a ball rolling on a two-dimensional plane. We apply two forces to it, one horizontal and one vertical, and the ball accelerates and moves under them; when it hits an edge of the plane it bounces back with a perfectly elastic collision. The goal is for the ball to reach a target position as quickly as possible under the applied forces.
The state (observation) space of this environment is

| Dimension | Meaning | Range |
|---|---|---|
| 0 | ball x coordinate | $[0, \text{width}]$ |
| 1 | ball y coordinate | $[0, \text{height}]$ |
| 2 | ball x velocity | $[-5.0, 5.0]$ |
| 3 | ball y velocity | $[-5.0, 5.0]$ |
The action space is

| Dimension | Meaning | Range |
|---|---|---|
| 0 | force applied to the ball along the x axis | $[-1.0, 1.0]$ |
| 1 | force applied to the ball along the y axis | $[-1.0, 1.0]$ |
The reward function is

| Event | Reward |
|---|---|
| reaching the target position | $300.0$ |
| bouncing off a wall | $-10.0$ |
| taking a step | $-2.0$ |
We also add a "dense reward" mode: on top of the rewards above, every step receives an auxiliary reward $\exp(1/d)$ based on the Euclidean distance $d$ between the current position and the target position, which gives the agent much stronger guidance.
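As a quick illustration (not from the original post; the numbers below just plug the default 10×10 start and target positions into the formula), the shaping bonus is small far from the goal and stays bounded near it because the episode terminates once the ball is within a radius of 1.0 of the target:

import numpy as np
start = np.array([2.0, 2.0])   # width*0.2, height*0.2 for a 10x10 arena
target = np.array([8.0, 8.0])  # width*0.8, height*0.8
d = np.linalg.norm(start - target)   # about 8.49
print(np.exp(1.0 / d))               # about 1.12 far from the goal
print(np.exp(1.0 / 1.0))             # about 2.72 just outside the goal radius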
The full environment code is as follows
import gym
from gym import spaces
import numpy as np
import pygame
import time
class RollingBall(gym.Env):
metadata = {"render_modes": ["human", "rgb_array"], # supported render modes: 'human' pops up a window, 'rgb_array' returns the frame as an array
"render_fps": 500,} # render framerate
def __init__(self, render_mode="human", width=10, height=10, show_epi=False, reward_type='sparse'):
self.max_speed = 5.0
self.width = width
self.height = height
self.show_epi = show_epi
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float64)
self.observation_space = spaces.Box(low=np.array([0.0, 0.0, -self.max_speed, -self.max_speed]),
high=np.array([width, height, self.max_speed, self.max_speed]),
dtype=np.float64)
self.velocity = np.zeros(2, dtype=np.float64)
self.mass = 0.005
self.time_step = 0.01
# reward parameters
self.reward_type = reward_type # dense or sparse
self.rewards = {'step':-2.0, 'bounce':-10.0, 'goal':300.0}
# start and target positions
self.target_position = np.array([self.width*0.8, self.height*0.8], dtype=np.float64)
self.start_position = np.array([width*0.2, height*0.2], dtype=np.float64)
self.position = self.start_position.copy()
# rendering parameters
self.render_width = 300
self.render_height = 300
self.scale = self.render_width / self.width
# stores the trajectory the ball has passed through
self.trajectory = []
# render mode must be 'human' or 'rgb_array'
assert render_mode is None or render_mode in self.metadata["render_modes"]
self.render_mode = render_mode
# window and clock are only used when render_mode == 'human'
self.window = None
self.clock = None
def _get_obs(self):
return np.hstack((self.position, self.velocity))
def _get_info(self):
return {}
def step(self, action):
# compute acceleration (the action itself is the force applied to the ball)
acceleration = action / self.mass
# update velocity and position
self.velocity += acceleration * self.time_step
self.velocity = np.clip(self.velocity, -self.max_speed, self.max_speed)
self.position += self.velocity * self.time_step
# compute the reward
assert self.reward_type in ['sparse', 'dense']
reward = self.rewards['step']
if self.reward_type == 'dense':
distance = np.linalg.norm(self.position - self.target_position)
reward += np.exp(1.0/distance)
# handle collisions with the boundary
reward = self._handle_boundary_collision(reward)
# check whether the goal has been reached
terminated, truncated = False, False
if self._is_goal_reached():
terminated = True
reward += self.rewards['goal'] # reward for reaching the goal
obs, info = self._get_obs(), self._get_info()
self.trajectory.append(obs.copy()) # record the ball's trajectory
return obs, reward, terminated, truncated, info
def reset(self, seed=None, options=None):
# super().reset() seeds the base class's self.np_random generator
super().reset(seed=seed)
# reset the ball's position, velocity and trajectory
self.position = self.start_position.copy()
self.velocity = np.zeros(2, dtype=np.float64)
self.trajectory = []
return self._get_obs(), self._get_info()
def _handle_boundary_collision(self, reward):
if self.position[0] <= 0:
self.position[0] = 0
self.velocity[0] *= -1
reward += self.rewards['bounce']
elif self.position[0] >= self.width:
self.position[0] = self.width
self.velocity[0] *= -1
reward += self.rewards['bounce']
if self.position[1] <= 0:
self.position[1] = 0
self.velocity[1] *= -1
reward += self.rewards['bounce']
elif self.position[1] >= self.height:
self.position[1] = self.height
self.velocity[1] *= -1
reward += self.rewards['bounce']
return reward
def _is_goal_reached(self):
# check whether the goal has been reached (the ball is within a threshold distance of the target)
distance = np.linalg.norm(self.position - self.target_position)
return distance < 1.0 # distance threshold
def render(self):
if self.render_mode not in ["rgb_array", "human"]:
raise NotImplementedError(f"unsupported render mode: {self.render_mode}")
return self._render_frame() # returns the frame as an ndarray in 'rgb_array' mode
def _render_frame(self):
canvas = pygame.Surface((self.render_width, self.render_height))
canvas.fill((255, 255, 255)) # white background
if self.window is None and self.render_mode == "human":
pygame.init()
pygame.display.init()
self.window = pygame.display.set_mode((self.render_width, self.render_height))
if self.clock is None and self.render_mode == "human":
self.clock = pygame.time.Clock()
# draw the target position
target_position_render = self._convert_to_render_coordinate(self.target_position)
pygame.draw.circle(canvas, (100, 100, 200), target_position_render, 20)
# draw the ball
ball_position_render = self._convert_to_render_coordinate(self.position)
pygame.draw.circle(canvas, (0, 0, 255), ball_position_render, 10)
# draw the ball's trajectory
if self.show_epi:
for i in range(len(self.trajectory)-1):
position_from = self.trajectory[i]
position_to = self.trajectory[i+1]
position_from = self._convert_to_render_coordinate(position_from)
position_to = self._convert_to_render_coordinate(position_to)
color = int(230 * (i / len(self.trajectory))) # fade the color with the position along the trajectory
pygame.draw.lines(canvas, (color, color, color), False, [position_from, position_to], width=3)
# in 'human' render mode a window pops up
if self.render_mode == "human":
# The following line copies our drawings from `canvas` to the visible window
self.window.blit(canvas, canvas.get_rect())
pygame.event.pump()
pygame.display.update()
# We need to ensure that human-rendering occurs at the predefined framerate.
# The following line will automatically add a delay to keep the framerate stable.
self.clock.tick(self.metadata["render_fps"])
# in 'rgb_array' mode the frame is returned as a pixel ndarray, useful when the agent observes the screen with a CNN;
# avoid rendering extra overlays so the observation is not polluted
else:
return np.transpose(np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2))
def close(self):
if self.window is not None:
pygame.quit()
def _convert_to_render_coordinate(self, position):
return int(position[0] * self.scale), int(self.render_height - position[1] * self.scale)
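A quick smoke test of the raw continuous interface (not from the original post; the constructor arguments mirror those used later, and the class above is assumed to be in scope):

import numpy as np
env = RollingBall(render_mode='rgb_array', width=5, height=5)
obs, info = env.reset(seed=0)
print(obs)  # [x, y, vx, vy], starts at (1.0, 1.0) with zero velocity
obs, reward, terminated, truncated, info = env.step(np.array([1.0, 0.5]))
print(obs, reward)  # in the default sparse mode the reward of a plain step is -2.0
env.close()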
The REINFORCE and basic Actor-Critic agents discussed in this post use a categorical policy head and therefore require a discrete action space, so we additionally write action wrappers that discretize the native two-dimensional continuous actions and flatten them into a one-dimensional discrete action space (a small decoding check follows the wrapper code below).
class DiscreteActionWrapper(gym.ActionWrapper):
''' Discretize the 2-D continuous action space of the RollingBall environment into a 2-D discrete action space '''
def __init__(self, env, bins):
super().__init__(env)
bin_width = 2.0 / bins
self.action_space = spaces.MultiDiscrete([bins, bins])
self.action_mapping = {i : -1+(i+0.5)*bin_width for i in range(bins)}
def action(self, action):
# map discrete bin indices back to continuous values with a vectorized function
vectorized_func = np.vectorize(lambda x: self.action_mapping[x])
result = vectorized_func(action)
action = np.array(result)
return action
class FlattenActionSpaceWrapper(gym.ActionWrapper):
''' Flatten a multi-dimensional discrete action space into a one-dimensional Discrete action space '''
def __init__(self, env):
super(FlattenActionSpaceWrapper, self).__init__(env)
new_size = 1
for dim in self.env.action_space.nvec:
new_size *= dim
self.action_space = spaces.Discrete(new_size)
def action(self, action):
orig_action = []
for dim in reversed(self.env.action_space.nvec):
orig_action.append(action % dim)
action //= dim
orig_action.reverse()
return np.array(orig_action)
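A minimal decoding check (not in the original post; it just composes the two wrappers above by hand to show how a flat discrete action turns back into continuous forces):

base = RollingBall(render_mode='rgb_array', width=5, height=5)
disc = DiscreteActionWrapper(base, bins=5)
flat = FlattenActionSpaceWrapper(disc)
print(flat.action_space)   # Discrete(25)
multi = flat.action(7)     # flat index 7 -> MultiDiscrete [1, 2]
force = disc.action(multi) # bin centers -> continuous forces [-0.4, 0.0]
print(multi, force)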
Test code with a random policy:
import os
import sys
base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(base_path)
import numpy as np
import time
from gym.utils.env_checker import check_env
from environment.Env_RollingBall import RollingBall, DiscreteActionWrapper, FlattenActionSpaceWrapper
from gym.wrappers import TimeLimit
env = RollingBall(render_mode='human', width=5, height=5, show_epi=True)
env = FlattenActionSpaceWrapper(DiscreteActionWrapper(env, 5))
env = TimeLimit(env, 100)
check_env(env.unwrapped) # check that the environment follows the gym API
env.action_space.seed(10)
observation, _ = env.reset(seed=10)
# run the environment with random actions
for i in range(100):
while True:
action = env.action_space.sample()
#action = 19
state, reward, terminated, truncated, _ = env.step(action)
if terminated or truncated:
env.reset()
break
time.sleep(0.01)
env.render()
# close the environment and its rendering window
env.close()
Reinforcement learning methods fall broadly into two families, value-based and policy-based.

Value-based methods learn a value function and then derive a policy from it; no explicit policy is maintained during learning. We have already practiced a number of value-based methods in earlier posts.

From this post on we focus on policy-gradient methods, which explicitly learn a target policy. The basic idea is to cast policy learning as an optimization problem and solve it with gradient ascent (equivalently, gradient descent on the negated objective). The gradient involved is the so-called policy gradient. It cannot be computed exactly; two approximation schemes give rise to the REINFORCE algorithm and the Actor-Critic family of algorithms respectively, and the latter has become a classic framework that is widely used across the various branches of RL.
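To make "cast as an optimization problem" concrete (a restatement consistent with the derivation below, not extra material from the source): the objective is the expected return from the initial state, maximized by gradient ascent with some step size $\alpha$,

$$
J(\theta)=\mathbb{E}_{s_0}\left[V_{\pi_\theta}(s_0)\right], \qquad \theta \leftarrow \theta+\alpha\,\nabla_\theta J(\theta).
$$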
The policy gradient is exactly what the policy gradient theorem computes,

$$
\nabla_\theta J(\theta) \propto \sum_{s\in S}\nu_{\pi_\theta}(s)\sum_{a\in A}Q_{\pi_\theta}(s,a)\,\nabla_\theta\pi_\theta(a\mid s),
$$

which we now prove, starting from the gradient of the state-value function.
$$
\begin{aligned}
\nabla_{\theta} V_{\pi_{\theta}}(s) & =\nabla_{\theta}\left(\sum_{a \in A} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)\right) \\
& =\sum_{a \in A}\left(\nabla_{\theta} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)+\pi_{\theta}(a \mid s) \nabla_{\theta} Q_{\pi_{\theta}}(s, a)\right) \\
& =\sum_{a \in A}\left(\nabla_{\theta} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)+\pi_{\theta}(a \mid s) \nabla_{\theta} \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left(r+\gamma V_{\pi_{\theta}}\left(s^{\prime}\right)\right)\right) \\
& =\sum_{a \in A}\left(\nabla_{\theta} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)+\gamma \pi_{\theta}(a \mid s) \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime}\right)\right) \\
& =\sum_{a \in A}\left(\nabla_{\theta} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)+\gamma \pi_{\theta}(a \mid s) \sum_{s^{\prime}} p\left(s^{\prime} \mid s, a\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime}\right)\right)
\end{aligned}
$$

To simplify notation, let $\phi(s)=\sum_{a \in A} \nabla_{\theta} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a)$, and define $d_{\pi_\theta}(s\to x, k)$ as the probability of reaching state $x$ after $k$ steps when starting from state $s$ and following policy $\pi_\theta$ (this requires the Markov chain to have a stationary distribution). Continuing the derivation,
$$
\begin{aligned}
\nabla_{\theta} V_{\pi_{\theta}}(s) & =\phi(s)+\gamma \sum_{a} \pi_{\theta}(a \mid s) \sum_{s^{\prime}} P\left(s^{\prime} \mid s, a\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime}\right) \\
& =\phi(s)+\gamma \sum_{a} \sum_{s^{\prime}} \pi_{\theta}(a \mid s) P\left(s^{\prime} \mid s, a\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime}\right) \\
& =\phi(s)+\gamma \sum_{s^{\prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime}, 1\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime}\right) \\
& =\phi(s)+\gamma \sum_{s^{\prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime}, 1\right)\left[\phi\left(s^{\prime}\right)+\gamma \sum_{s^{\prime \prime}} d_{\pi_{\theta}}\left(s^{\prime} \rightarrow s^{\prime \prime}, 1\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime \prime}\right)\right] \\
& =\phi(s)+\gamma \sum_{s^{\prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime}, 1\right) \phi\left(s^{\prime}\right)+\gamma^{2} \sum_{s^{\prime \prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime \prime}, 2\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime \prime}\right) \\
& =\phi(s)+\gamma \sum_{s^{\prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime}, 1\right) \phi\left(s^{\prime}\right)+\gamma^{2} \sum_{s^{\prime \prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime \prime}, 2\right) \phi\left(s^{\prime \prime}\right)+\gamma^{3} \sum_{s^{\prime \prime \prime}} d_{\pi_{\theta}}\left(s \rightarrow s^{\prime \prime \prime}, 3\right) \nabla_{\theta} V_{\pi_{\theta}}\left(s^{\prime \prime \prime}\right) \\
& =\cdots \\
& =\sum_{x \in S} \sum_{k=0}^{\infty} \gamma^{k} d_{\pi_{\theta}}(s \rightarrow x, k) \phi(x)
\end{aligned}
$$

Now define $\eta(s)=\mathbb{E}_{s_{0}}\left[\sum_{k=0}^{\infty} \gamma^{k} d_{\pi_{\theta}}\left(s_{0} \rightarrow s, k\right)\right]$, the expected number of times (discounted by $\gamma$) that state $s$ is visited along an infinitely long trajectory induced by policy $\pi_\theta$. Then
$$
\begin{aligned}
\nabla_{\theta} J(\theta) & =\nabla_{\theta} \mathbb{E}_{s_{0}}\left[V_{\pi_{\theta}}\left(s_{0}\right)\right] \\
& =\sum_{s} \mathbb{E}_{s_{0}}\left[\sum_{k=0}^{\infty} \gamma^{k} d_{\pi_{\theta}}\left(s_{0} \rightarrow s, k\right)\right] \phi(s) \\
& =\sum_{s} \eta(s) \phi(s) \\
& =\left(\sum_{s} \eta(s)\right) \sum_{s} \frac{\eta(s)}{\sum_{s} \eta(s)} \phi(s) \\
& \propto \sum_{s} \frac{\eta(s)}{\sum_{s} \eta(s)} \phi(s) \\
& =\sum_{s} \nu_{\pi_{\theta}}(s) \sum_{a} Q_{\pi_{\theta}}(s, a) \nabla_{\theta} \pi_{\theta}(a \mid s)
\end{aligned}
$$

where $\nu_{\pi_{\theta}}(s)$ is the state-visitation distribution of the policy. A final simple rewriting completes the proof:
$$
\begin{aligned}
\nabla_{\theta} J(\theta) & \propto \sum_{s \in S} \nu_{\pi_{\theta}}(s) \sum_{a \in A} Q_{\pi_{\theta}}(s, a) \nabla_{\theta} \pi_{\theta}(a \mid s) \\
& =\sum_{s \in S} \nu_{\pi_{\theta}}(s) \sum_{a \in A} \pi_{\theta}(a \mid s) Q_{\pi_{\theta}}(s, a) \frac{\nabla_{\theta} \pi_{\theta}(a \mid s)}{\pi_{\theta}(a \mid s)} \\
& =\mathbb{E}_{\pi_{\theta}}\left[Q_{\pi_{\theta}}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)\right]
\end{aligned}
$$
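As a quick numerical sanity check of the final identity (not part of the original proof; all names below are illustrative), consider a single-state problem, i.e. a 3-armed bandit with a softmax policy, where $Q_\pi(s,a)$ is just the fixed reward $R(a)$ and the analytic gradient of $J(\theta)=\sum_a \pi_\theta(a)R(a)$ is easy to compute. The score-function estimate $\mathbb{E}_{a\sim\pi_\theta}[R(a)\nabla_\theta\log\pi_\theta(a)]$ should match it up to Monte Carlo noise:

import numpy as np
rng = np.random.default_rng(0)
theta = rng.normal(size=3)        # softmax logits for 3 actions
R = np.array([1.0, 2.0, 0.5])     # fixed reward of each action
def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()
pi = softmax(theta)
# analytic gradient: d pi(a) / d theta_j = pi(a) * (1[a==j] - pi(j))
grad_J = sum(R[a] * pi[a] * ((np.arange(3) == a) - pi) for a in range(3))
# score-function (policy gradient) estimate: E_a[ R(a) * grad_theta log pi(a) ]
N = 200000
actions = rng.choice(3, size=N, p=pi)
grad_log_pi = np.eye(3)[actions] - pi   # grad_theta log pi(a) for a softmax policy
pg_estimate = (R[actions, None] * grad_log_pi).mean(axis=0)
print(grad_J)       # analytic gradient
print(pg_estimate)  # should agree with the line above up to sampling noise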
The two approximation schemes mentioned above differ in how they estimate $Q_{\pi_\theta}(s,a)$ in the policy gradient:

- REINFORCE: approximate $Q_\pi(s, a)$ with the Monte Carlo return $u$ actually obtained along the sampled trajectory
- Actor-Critic: approximate $Q_\pi(s, a)$ with a neural network (the Critic) $q_w(s, a)$

First the policy network. The imports below are added so the script runs standalone; the environment import path follows the test script above.

import os
import sys
base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(base_path)
import random
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from gym.utils.env_checker import check_env
from gym.wrappers import TimeLimit
from environment.Env_RollingBall import RollingBall, DiscreteActionWrapper, FlattenActionSpaceWrapper
class PolicyNet(torch.nn.Module):
''' The policy network is a two-layer MLP '''
def __init__(self, input_dim, hidden_dim, output_dim):
super(PolicyNet, self).__init__()
self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
self.fc2 = torch.nn.Linear(hidden_dim, output_dim)
def forward(self, x):
x = F.relu(self.fc1(x)) # (1, hidden_dim)
x = F.softmax(self.fc2(x), dim=1) # (1, output_dim)
return x
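For reference, the update that the REINFORCE agent below implements can be written as (a restatement in the notation of the derivation above, with $r_k$ the reward received at step $k$ and $G_t$ the discounted return from step $t$):

$$
G_t=\sum_{k=t}^{T-1}\gamma^{\,k-t} r_k, \qquad \nabla_\theta J(\theta)\approx\sum_{t=0}^{T-1} G_t\,\nabla_\theta \log\pi_\theta(a_t\mid s_t),
$$

so each time step contributes a loss term $-G_t\log\pi_\theta(a_t\mid s_t)$.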
The softmax() output implements a learnable categorical distribution over the discrete actions, and the action is sampled from it. In the update step, following the algorithm we write the loss as the negative of the return-weighted log-probability, so that differentiating this loss and taking gradient-descent steps performs gradient ascent on the policy objective.
class REINFORCE(torch.nn.Module):
def __init__(self, state_dim, hidden_dim, action_range, learning_rate, gamma, device):
super().__init__()
self.policy_net = PolicyNet(state_dim, hidden_dim, action_range).to(device)
self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate) # Adam optimizer
self.gamma = gamma
self.device = device
def take_action(self, state):
# sample an action from the policy's action distribution
state = torch.tensor(state, dtype=torch.float).to(self.device)
state = state.unsqueeze(0)
probs = self.policy_net(state).squeeze()
action_dist = torch.distributions.Categorical(probs)
action = action_dist.sample()
return action.item()
def update(self, transition_dict):
reward_list = transition_dict['rewards']
state_list = transition_dict['states']
action_list = transition_dict['actions']
G = 0
self.optimizer.zero_grad()
# walk the trajectory backwards to compute returns, accumulating gradients at every step
for i in reversed(range(len(reward_list))):
reward = reward_list[i]
state = torch.tensor(state_list[i], dtype=torch.float).to(self.device) # (state_dim, )
probs = self.policy_net(state.unsqueeze(0)).squeeze() # (action_range, )
action = action_list[i]
log_prob = torch.log(probs[action])
G = self.gamma * G + reward
loss = -log_prob * G
loss.backward()
# one gradient-descent step on the accumulated gradients
self.optimizer.step()
if __name__ == "__main__":
def moving_average(a, window_size):
''' moving average of sequence a with the given window size '''
cumulative_sum = np.cumsum(np.insert(a, 0, 0))
middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
r = np.arange(1, window_size-1, 2)
begin = np.cumsum(a[:window_size-1])[::2] / r
end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
return np.concatenate((begin, middle, end))
def set_seed(env, seed=42):
''' set random seeds for the environment and libraries '''
env.action_space.seed(seed)
env.reset(seed=seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
state_dim = 4 # observation dimension
action_dim = 1 # action dimension (a single discrete index after flattening)
action_bins = 5 # number of discretization bins per action axis
action_range = action_bins * action_bins # size of the flattened discrete action space
reward_type = 'sparse' # sparse or dense
learning_rate = 1e-4
num_episodes = 500
hidden_dim = 64
gamma = 0.98
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# build environment
env = RollingBall(render_mode='human', width=5, height=5, show_epi=True, reward_type=reward_type)
env = FlattenActionSpaceWrapper(DiscreteActionWrapper(env, action_bins))
env = TimeLimit(env, 100)
check_env(env.unwrapped) # check that the environment follows the gym API
set_seed(env, 42)
# build agent
agent = REINFORCE(state_dim, hidden_dim, action_range, learning_rate, gamma, device)
# start training
return_list = []
for i in range(10):
with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
for i_episode in range(int(num_episodes / 10)):
episode_return = 0
transition_dict = {
'states': [],
'actions': [],
'next_states': [],
'rewards': [],
'dones': []
}
state, _ = env.reset()
# roll out one trajectory with the current policy
while True:
action = agent.take_action(state)
next_state, reward, terminated, truncated, _ = env.step(action)
transition_dict['states'].append(state)
transition_dict['actions'].append(action)
transition_dict['next_states'].append(next_state)
transition_dict['rewards'].append(reward)
transition_dict['dones'].append(terminated or truncated)
state = next_state
episode_return += reward
if terminated or truncated:
env.render()
break
#env.render()
# on-policy update with the trajectory just collected
agent.update(transition_dict)
# update the progress bar
return_list.append(episode_return)
pbar.set_postfix({
'episode':
'%d' % (num_episodes / 10 * i + i_episode + 1),
'return':
'%.3f' % episode_return,
'ave return':
'%.3f' % np.mean(return_list[-10:])
})
pbar.update(1)
# show policy performance
mv_return_list = moving_average(return_list, 29)
episodes_list = list(range(len(return_list)))
plt.figure(figsize=(12,8))
plt.plot(episodes_list, return_list, label='raw', alpha=0.5)
plt.plot(episodes_list, mv_return_list, label='moving ave')
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title(f'{agent._get_name()} on RollingBall with {reward_type} reward')
plt.legend()
plt.savefig(f'./result/{agent._get_name()}({reward_type}).png')
plt.show()
Below is the Actor-Critic implementation; the policy network and the training scaffolding are the same as in the REINFORCE script, and the imports are again added so it runs standalone.

import os
import sys
base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(base_path)
import random
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from gym.utils.env_checker import check_env
from gym.wrappers import TimeLimit
from environment.Env_RollingBall import RollingBall, DiscreteActionWrapper, FlattenActionSpaceWrapper
class PolicyNet(torch.nn.Module):
''' The policy network is a two-layer MLP '''
def __init__(self, input_dim, hidden_dim, output_dim):
super(PolicyNet, self).__init__()
self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
self.fc2 = torch.nn.Linear(hidden_dim, output_dim)
def forward(self, x):
x = F.relu(self.fc1(x)) # (1, hidden_dim)
x = F.softmax(self.fc2(x), dim=1) # (1, output_dim)
return x
class QNet(torch.nn.Module):
''' The value network is a two-layer MLP '''
def __init__(self, input_dim, hidden_dim, output_dim):
super(QNet, self).__init__()
self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
self.fc2 = torch.nn.Linear(hidden_dim, output_dim)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
class ActorCritic(torch.nn.Module):
def __init__(self, state_dim, hidden_dim, action_range, actor_lr, critic_lr, gamma, device):
super().__init__()
self.gamma = gamma
self.device = device
self.actor = PolicyNet(state_dim, hidden_dim, action_range).to(device)
self.critic = QNet(state_dim, hidden_dim, action_range).to(device)
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
def take_action(self, state):
state = torch.tensor(state, dtype=torch.float).to(self.device)
state = state.unsqueeze(0)
probs = self.actor(state)
action_dist = torch.distributions.Categorical(probs)
action = action_dist.sample()
return action.item()
def update_actor_critic(self, transition_dict):
states = torch.tensor(np.array(transition_dict['states']), dtype=torch.float).to(self.device) # (bsz, state_dim)
next_states = torch.tensor(np.array(transition_dict['next_states']), dtype=torch.float).to(self.device) # (bsz, state_dim)
actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(self.device) # (bsz, action_dim)
next_actions = torch.tensor(transition_dict['next_actions']).view(-1, 1).to(self.device) # (bsz, action_dim)
rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device).squeeze() # (bsz, )
dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device).squeeze() # (bsz, )
# Critic loss (SARSA-style TD target built from the on-policy next action)
q_values = self.critic(states).gather(dim=1, index=actions).squeeze() # (bsz, )
next_q_values = self.critic(next_states).gather(dim=1, index=next_actions).squeeze() # (bsz, )
td_targets = rewards + self.gamma * next_q_values * (1 - dones) # (bsz, )
critic_loss = torch.mean(F.mse_loss(q_values, td_targets.detach())) # detach the TD target (it is built from the critic's own next-state estimate) so it is treated as a constant
# Actor loss
probs = self.actor(states).gather(dim=1, index=actions).squeeze() # (bsz, )
log_probs = torch.log(probs) # (bsz, )
actor_loss = torch.mean(-log_probs * q_values.detach()) # q_values comes from the critic; detach it so this loss updates only the actor
# update both networks
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
actor_loss.backward()
critic_loss.backward()
self.actor_optimizer.step()
self.critic_optimizer.step()
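For reference, the two losses optimized by the ActorCritic update above are, in the code's own symbols (a restatement, with batch size $B$ and done flag $d_i$; both the TD target and the $q_w$ factor in the actor loss are detached, exactly as in the code):

$$
\mathcal{L}_{\text{critic}}=\frac{1}{B}\sum_{i=1}^{B}\Big(q_w(s_i,a_i)-\big(r_i+\gamma\,q_w(s_i',a_i')\,(1-d_i)\big)\Big)^2, \qquad \mathcal{L}_{\text{actor}}=-\frac{1}{B}\sum_{i=1}^{B}\log\pi_\theta(a_i\mid s_i)\,q_w(s_i,a_i)
$$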
if __name__ == "__main__":
def moving_average(a, window_size):
''' moving average of sequence a with the given window size '''
cumulative_sum = np.cumsum(np.insert(a, 0, 0))
middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
r = np.arange(1, window_size-1, 2)
begin = np.cumsum(a[:window_size-1])[::2] / r
end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
return np.concatenate((begin, middle, end))
def set_seed(env, seed=42):
''' set random seeds for the environment and libraries '''
env.action_space.seed(seed)
env.reset(seed=seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
state_dim = 4 # observation dimension
action_dim = 1 # action dimension (a single discrete index after flattening)
action_bins = 5 # number of discretization bins per action axis
action_range = action_bins * action_bins # size of the flattened discrete action space
reward_type = 'sparse' # sparse or dense
actor_lr = 1e-2
critic_lr = 5e-3
num_episodes = 500
hidden_dim = 64
gamma = 0.98
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# build environment
env = RollingBall(render_mode='human', width=5, height=5, show_epi=True, reward_type=reward_type)
env = FlattenActionSpaceWrapper(DiscreteActionWrapper(env, action_bins))
env = TimeLimit(env, 100)
check_env(env.unwrapped) # check that the environment follows the gym API
set_seed(env, 42)
# build agent
agent = ActorCritic(state_dim, hidden_dim, action_range, actor_lr, critic_lr, gamma, device)
# start training
return_list = []
for i in range(10):
with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
for i_episode in range(int(num_episodes / 10)):
episode_return = 0
transition_dict = {
'states': [],
'actions': [],
'next_states': [],
'next_actions': [],
'rewards': [],
'dones': []
}
state, _ = env.reset()
# roll out one trajectory with the current policy
while True:
action = agent.take_action(state)
next_state, reward, terminated, truncated, _ = env.step(action)
next_action = agent.take_action(next_state)
transition_dict['states'].append(state)
transition_dict['actions'].append(action)
transition_dict['next_states'].append(next_state)
transition_dict['next_actions'].append(next_action)
transition_dict['rewards'].append(reward)
transition_dict['dones'].append(terminated or truncated)
state = next_state
episode_return += reward
if terminated or truncated:
env.render()
break
#env.render()
# on-policy update with the trajectory just collected
agent.update_actor_critic(transition_dict)
# update the progress bar
return_list.append(episode_return)
pbar.set_postfix({
'episode':
'%d' % (num_episodes / 10 * i + i_episode + 1),
'return':
'%.3f' % episode_return,
'ave return':
'%.3f' % np.mean(return_list[-10:])
})
pbar.update(1)
# show policy performance
mv_return_list = moving_average(return_list, 29)
episodes_list = list(range(len(return_list)))
plt.figure(figsize=(12,8))
plt.plot(episodes_list, return_list, label='raw', alpha=0.5)
plt.plot(episodes_list, mv_return_list, label='moving ave')
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title(f'{agent._get_name()} on RollingBall with {reward_type} reward')
plt.legend()
plt.savefig(f'./result/{agent._get_name()}({reward_type}).png')
plt.show()