Pendulum is a classic reinforcement learning task: the goal is to control the red rod so that it swings up and stays vertically upright.
The Pendulum environment defines the coordinate system, the dynamics equations, and the state, action, and reward. The parts you have to write yourself are the network architecture and the update algorithm.
gym source code: https://github.com/openai/gym/blob/master/gym/envs/classic_control/pendulum.py
The inverted pendulum swing-up problem is a classic problem in control theory. The system consists of a pendulum attached at one end to a fixed point, with the other end free. The pendulum starts in a random position, and the goal is to apply torque on the free end to swing it into an upright position, with its center of gravity right above the fixed point.
x, y: Cartesian coordinates of the pendulum's free end, in meters.
theta: angle in radians.
tau: torque in N·m, defined as positive counter-clockwise.
The action space has shape (1,); physically it is the torque applied to the free end of the pendulum, with values in [-2, 2].
The observation space has shape (3,): [cos(theta), sin(theta), angular velocity], with ranges [-1, 1], [-1, 1], and [-8, 8].
reward: r = -(theta^2 + 0.1 * theta_dt^2 + 0.001 * torque^2)
where theta is the pendulum angle in radians, theta ∈ [-pi, pi] (0 means vertically upright).
From this formula, the minimum reward is -(pi^2 + 0.1 * 8^2 + 0.001 * 2^2) = -16.2736044,
and the maximum reward is 0 (pointing straight up, with zero angular velocity and zero torque).
Initial state: the angle is sampled uniformly from [-pi, pi] and the angular velocity from [-1, 1].
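To make the spaces and the reward scale concrete, here is a minimal interaction sketch. It assumes the registered id "Pendulum-v1" and a gym version with the five-value step API, matching the source excerpt below:

import gym

env = gym.make("Pendulum-v1")
print(env.action_space)       # Box(-2.0, 2.0, (1,), float32)
print(env.observation_space)  # Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)

obs, info = env.reset(seed=0)
for _ in range(200):
    a = env.action_space.sample()                      # random torque in [-2, 2]
    obs, r, terminated, truncated, info = env.step(a)  # obs = [cos(theta), sin(theta), theta_dot]
    assert -16.2736045 <= r <= 0.0                     # reward stays within the bound derived above
    if terminated or truncated:                        # truncated after 200 steps by TimeLimit
        break
env.close()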
The code follows the most widely copied PyTorch implementation on the web; the result of the final run is shown below.
(The first 50 episodes only fill the replay buffer; no training happens yet.)
The reward fluctuates quite a bit, but judging from the rendered animation, the model does learn the task.
# Source code
from os import path
from typing import Optional
import numpy as np
import gym
from gym import spaces
from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled
from gym.utils.renderer import Renderer
DEFAULT_X = np.pi
DEFAULT_Y = 1.0
class PendulumEnv(gym.Env):
    metadata = {
        "render_modes": ["human", "rgb_array", "single_rgb_array"],
        "render_fps": 30,
    }

    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.05
        self.g = g
        self.m = 1.0
        self.l = 1.0
        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)  # gym's built-in renderer
        self.screen_dim = 500
        self.screen = None
        self.clock = None
        self.isopen = True
        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
        # define the action space and the observation space
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)
    def step(self, u):
        # step returns five values: observation, reward, terminated, truncated, info
        # observation (object): an environment-specific object describing what you observe,
        #   e.g. camera pixels, a robot's joint angles and velocities, or a board-game position.
        # reward (float): the reward obtained by the previous action; its scale differs between
        #   environments, but the goal is always to increase the total reward.
        # terminated / truncated (bool): whether the episode has ended and the environment
        #   should be reset; most tasks are split into well-defined episodes.
        # info (dict): diagnostic information useful for debugging, sometimes also for learning,
        #   but official evaluations must not use it. This is a typical agent-environment loop.
        th, thdot = self.state  # th := theta
        g = self.g
        m = self.m
        l = self.l
        dt = self.dt
        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)
        # dynamics: theta_ddot = 3g/(2l) * sin(theta) + 3/(m*l^2) * u, integrated with one Euler step
        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
        newth = th + newthdot * dt
        self.state = np.array([newth, newthdot])
        self.renderer.render_step()
        # observation, reward, terminated, truncated, info; Pendulum itself never terminates,
        # the 200-step truncation comes from the TimeLimit wrapper added by gym.make("Pendulum-v1")
        return self._get_obs(), -costs, False, False, {}
    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if options is None:
            high = np.array([DEFAULT_X, DEFAULT_Y])
        else:
            # Note that if you use custom reset bounds, it may lead to out-of-bound state/observations.
            x = options.get("x_init") if "x_init" in options else DEFAULT_X
            y = options.get("y_init") if "y_init" in options else DEFAULT_Y
            x = utils.verify_number_and_cast(x)
            y = utils.verify_number_and_cast(y)
            high = np.array([x, y])
        low = -high  # We enforce symmetric limits.
        self.state = self.np_random.uniform(low=low, high=high)
        self.last_u = None
        self.renderer.reset()
        self.renderer.render_step()
        return self._get_obs(), {}

    def _get_obs(self):
        # observation is [cos(theta), sin(theta), theta_dot]; using cos/sin avoids the
        # discontinuity of the raw angle at +/- pi
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)
    def render(self):
        return self.renderer.get_renders()

    def _render(self, mode="human"):
        assert mode in self.metadata["render_modes"]
        try:
            import pygame
            from pygame import gfxdraw
        except ImportError:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gym[classic_control]`"
            )
        if self.screen is None:
            pygame.init()
            if mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_dim, self.screen_dim)
                )
            else:  # mode in {"rgb_array", "rgb_array_list"}
                self.screen = pygame.Surface((self.screen_dim, self.screen_dim))
        if self.clock is None:
            self.clock = pygame.time.Clock()
        self.surf = pygame.Surface((self.screen_dim, self.screen_dim))
        self.surf.fill((255, 255, 255))
        bound = 2.2
        scale = self.screen_dim / (bound * 2)
        offset = self.screen_dim // 2
        rod_length = 1 * scale
        rod_width = 0.2 * scale
        l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2
        coords = [(l, b), (l, t), (r, t), (r, b)]
        transformed_coords = []
        for c in coords:
            c = pygame.math.Vector2(c).rotate_rad(self.state[0] + np.pi / 2)
            c = (c[0] + offset, c[1] + offset)
            transformed_coords.append(c)
        gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77))
        gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77))
        gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77))
        gfxdraw.filled_circle(
            self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)
        )
        rod_end = (rod_length, 0)
        rod_end = pygame.math.Vector2(rod_end).rotate_rad(self.state[0] + np.pi / 2)
        rod_end = (int(rod_end[0] + offset), int(rod_end[1] + offset))
        gfxdraw.aacircle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )
        gfxdraw.filled_circle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )
        fname = path.join(path.dirname(__file__), "assets/clockwise.png")
        img = pygame.image.load(fname)
        if self.last_u is not None:
            scale_img = pygame.transform.smoothscale(
                img,
                (scale * np.abs(self.last_u) / 2, scale * np.abs(self.last_u) / 2),
            )
            is_flip = bool(self.last_u > 0)
            scale_img = pygame.transform.flip(scale_img, is_flip, True)
            self.surf.blit(
                scale_img,
                (
                    offset - scale_img.get_rect().centerx,
                    offset - scale_img.get_rect().centery,
                ),
            )
        # drawing axle
        gfxdraw.aacircle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
        gfxdraw.filled_circle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()
        else:  # mode == "rgb_array_list":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        if self.screen is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            self.isopen = False
def angle_normalize(x):
    # wrap an arbitrary angle into [-pi, pi)
    return ((x + np.pi) % (2 * np.pi)) - np.pi
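angle_normalize wraps an arbitrary angle into [-pi, pi), which is what keeps the angle penalty in the reward bounded. A quick sanity check (hypothetical usage, not part of the gym source):

print(angle_normalize(1.5 * np.pi))   # -pi/2: 270 degrees is treated as -90 degrees
print(angle_normalize(-1.5 * np.pi))  #  pi/2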
# Overall structure
class PendulumEnv(gym.Env):
    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        # a few constants (mass, length, gravity, time step, torque/speed limits)
        # renderer needed for the graphical display
        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)
        # action space and observation space
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, u):
        # given the incoming action and the previous state, compute the new state
        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        # sample a random initial state
        self.state = self.np_random.uniform(low=low, high=high)
        return self._get_obs(), {}

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)
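The training loop below calls ddpg.choose_action, ddpg.store_transition and ddpg.learn, and reads ddpg.pointer and ddpg.q_values; those come from the referenced PyTorch implementation. The agent is the part you write yourself, so here is a minimal sketch of what such an agent can look like. It is not the referenced code: the layer sizes, learning rates, discount factor, and soft-update rate below are assumptions.

import numpy as np
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, s_dim, a_dim, a_bound):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim, 64), nn.ReLU(),
            nn.Linear(64, a_dim), nn.Tanh(),
        )
        self.a_bound = a_bound

    def forward(self, s):
        return self.net(s) * self.a_bound  # scale tanh output to the torque range


class Critic(nn.Module):
    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim + a_dim, 64), nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, s, a):
        return self.net(torch.cat([s, a], dim=1))


class DDPG:
    def __init__(self, s_dim=3, a_dim=1, a_bound=2.0,
                 capacity=10000, batch_size=64, gamma=0.9, tau=0.01):
        self.actor = Actor(s_dim, a_dim, a_bound)
        self.actor_target = Actor(s_dim, a_dim, a_bound)
        self.critic = Critic(s_dim, a_dim)
        self.critic_target = Critic(s_dim, a_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.a_opt = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.c_opt = torch.optim.Adam(self.critic.parameters(), lr=2e-3)
        # replay buffer rows: [s, a, r, s_]
        self.memory = np.zeros((capacity, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.capacity, self.batch_size = capacity, batch_size
        self.gamma, self.tau = gamma, tau
        self.s_dim, self.a_dim = s_dim, a_dim
        self.q_values = []

    def choose_action(self, s):
        s = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            return self.actor(s).numpy().flatten()

    def store_transition(self, s, a, r, s_):
        self.memory[self.pointer % self.capacity] = np.hstack([s, a, [r], s_])
        self.pointer += 1

    def learn(self):
        idx = np.random.randint(0, min(self.pointer, self.capacity), self.batch_size)
        batch = torch.as_tensor(self.memory[idx])
        s = batch[:, :self.s_dim]
        a = batch[:, self.s_dim:self.s_dim + self.a_dim]
        r = batch[:, -self.s_dim - 1:-self.s_dim]
        s_ = batch[:, -self.s_dim:]
        # critic update: regress Q(s, a) towards a TD target computed with the target networks
        with torch.no_grad():
            q_target = r + self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        self.q_values.append(q.mean().item())
        c_loss = nn.functional.mse_loss(q, q_target)
        self.c_opt.zero_grad(); c_loss.backward(); self.c_opt.step()
        # actor update: maximize Q(s, actor(s)) by minimizing its negation
        a_loss = -self.critic(s, self.actor(s)).mean()
        self.a_opt.zero_grad(); a_loss.backward(); self.a_opt.step()
        # soft-update the target networks towards the online networks
        for net, target in ((self.actor, self.actor_target), (self.critic, self.critic_target)):
            for p, tp in zip(net.parameters(), target.parameters()):
                tp.data.mul_(1 - self.tau).add_(self.tau * p.data)

The critic is trained towards a target computed with slowly moving target networks, and the actor is trained to maximize the critic's value of its own actions; the soft update (tau) is what keeps the targets stable.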
# DDPG training loop
for i in range(EPISODES):
    s, _ = env.reset()  # gym >= 0.26: reset returns (obs, info)
    ep_r = 0
    ddpg.q_values = []
    for j in range(EP_STEPS):
        if RENDER:
            env.render()
        # add explorative noise to the action, then clip it to the valid torque range
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), a_low_bound, a_bound)
        s_, r, terminated, truncated, info = env.step(a)  # gym >= 0.26: five return values
        ddpg.store_transition(s, a, r / 10, s_)  # store the transition to memory
        if ddpg.pointer > MEMORY_CAPACITY:
            var *= 0.9995  # decay the exploration noise scale
            ddpg.learn()
        s = s_
        ep_r += r
        if j == EP_STEPS - 1:
            print('Episode: ', i, ' step mean reward: %.4f' % (ep_r / EP_STEPS), 'Explore VAR: %.2f' % var)
            if ddpg.pointer > MEMORY_CAPACITY:
                x_list.append(i)
                ep_list.append(ep_r / EP_STEPS)
                q_values.append(np.mean(ddpg.q_values))
            if ep_r > -300:
                RENDER = True
            break
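For the loop to run, the names it references have to be defined first. The values below are illustrative assumptions (chosen to be consistent with "the first 50 episodes only fill the replay buffer", i.e. 50 × 200 = 10000 transitions), not the exact settings of the referenced code:

import gym
import numpy as np

env = gym.make("Pendulum-v1")
EPISODES = 200
EP_STEPS = 200                           # Pendulum-v1 is truncated after 200 steps
MEMORY_CAPACITY = 10000                  # 50 episodes * 200 steps before learning starts
a_bound, a_low_bound = 2.0, -2.0         # torque limits of the action space
var = 3.0                                # initial scale of the exploration noise
RENDER = False
x_list, ep_list, q_values = [], [], []   # per-episode statistics for plotting
ddpg = DDPG(s_dim=3, a_dim=1, a_bound=a_bound)  # the sketch class defined above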
Some DRL references to study more carefully later:
How to choose a deep reinforcement learning algorithm? (Conclusion: for continuous action spaces, use TD3 if you are good at hyperparameter tuning, otherwise PPO or SAC; if the training environment and reward function were written by a beginner, use PPO.)
Deep reinforcement learning tuning tricks
Morvan Zhou's tutorial homepage
Morvan Zhou's DDPG tutorial