gym-Pendulum v1 + DDPG(PyTorch)

Pendulum is a classic reinforcement learning control task: the goal is to swing the red rod up and keep it pointing straight up.
The Pendulum environment defines the coordinate system, the dynamics, and the state, action, and reward; what you have to write yourself is the network architecture and the update algorithm.
gym source code: https://github.com/openai/gym/blob/master/gym/envs/classic_control/pendulum.py
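Since the environment already provides the dynamics, observation, and reward, the part you write for DDPG boils down to two small networks plus the update rule. Below is a minimal PyTorch sketch of the two networks (layer sizes and class names are my own choice, not taken from the code referenced below); the update rule is sketched after the training loop at the end of this post.

import torch
import torch.nn as nn

class Actor(nn.Module):
    # maps an observation [cos(theta), sin(theta), theta_dot] to a torque in [-2, 2]
    def __init__(self, s_dim=3, a_dim=1, a_bound=2.0, hidden=64):
        super().__init__()
        self.a_bound = a_bound
        self.net = nn.Sequential(
            nn.Linear(s_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, a_dim), nn.Tanh(),   # tanh keeps the raw output in [-1, 1]
        )

    def forward(self, s):
        return self.net(s) * self.a_bound          # rescale to the torque range

class Critic(nn.Module):
    # scores a (state, action) pair with a scalar Q value
    def __init__(self, s_dim=3, a_dim=1, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim + a_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, s, a):
        return self.net(torch.cat([s, a], dim=1))

The tanh output keeps the raw action in [-1, 1]; multiplying by a_bound = 2.0 matches the Pendulum torque range.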

The inverted-pendulum swing-up problem is a classic problem in control theory. The system consists of a pendulum attached at one end to a fixed pivot, with the other end free. The pendulum starts at a random angle, and the goal is to apply torque to the free end so that it swings up until its center of gravity sits directly above the pivot.

x, y: Cartesian coordinates of the pendulum's free end, in meters.
θ: angle in radians.
tau: torque in N·m, defined as positive counter-clockwise.

The action space has shape (1,): the torque applied to the free end of the pendulum, with values in [-2, 2].

The observation space has shape (3,): [cos(theta), sin(theta), angular velocity], where cos(theta) and sin(theta) lie in [-1, 1] and the angular velocity lies in [-8, 8].
reward: r = -(theta^2 + 0.1 * theta_dt^2 + 0.001 * torque^2)
where theta is the pendulum angle in radians, theta ∈ [-pi, pi] (0 means pointing straight up).
From this formula the minimum possible reward is -(pi^2 + 0.1 * 8^2 + 0.001 * 2^2) = -16.2736044,
and the maximum reward is 0 (upright, zero angular velocity, no torque applied).
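A quick sanity check of that lower bound with plain numpy, using the bounds |theta| <= pi, |theta_dt| <= 8, |torque| <= 2 from above:

import numpy as np

# worst case: |theta| = pi, |theta_dt| = 8, |torque| = 2
print(-(np.pi ** 2 + 0.1 * 8 ** 2 + 0.001 * 2 ** 2))   # ≈ -16.2736044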

Initial state: the angle is sampled uniformly from [-pi, pi] and the angular velocity uniformly from [-1, 1].
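Putting the spaces, reward, and reset behavior together, here is a minimal interaction sketch. It assumes a gym/gymnasium version in which reset returns (obs, info) and step returns five values, matching the source listed below:

import gym
import numpy as np

env = gym.make("Pendulum-v1")
print(env.action_space)       # Box(-2.0, 2.0, (1,), float32)
print(env.observation_space)  # Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)

obs, info = env.reset(seed=0)            # obs = [cos(theta), sin(theta), theta_dot]
action = env.action_space.sample()       # random torque in [-2, 2]
obs, reward, terminated, truncated, info = env.step(action)
print(obs, reward)                       # the reward is always <= 0
env.close()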


The code follows the most widely circulated PyTorch implementation online; the final training result:
(plot: per-episode average reward over training)
(The first 50 episodes only fill the replay buffer; no training happens yet.)
The per-episode reward still fluctuates quite a bit, but judging from the rendered animation the model has clearly learned the task.


# Source code
from os import path
from typing import Optional
import numpy as np
import gym
from gym import spaces
from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled
from gym.utils.renderer import Renderer

DEFAULT_X = np.pi
DEFAULT_Y = 1.0

class PendulumEnv(gym.Env):
    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.05
        self.g = g
        self.m = 1.0
        self.l = 1.0

        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)  # gym's built-in renderer

        self.screen_dim = 500
        self.screen = None
        self.clock = None
        self.isopen = True

        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
		
        # define the action space and the observation space
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)


    def step(self, u):
        # step() here returns five values: observation, reward, terminated, truncated, info
        #
        # observation (object): an environment-specific object describing what you observe,
        #   e.g. camera pixels, joint angles and velocities, or a board state.
        # reward (float): the reward obtained by the previous action; the scale differs between
        #   environments, but the goal is always to increase the cumulative reward.
        # terminated / truncated (bool): whether the episode has ended and the environment needs
        #   to be reset; Pendulum itself never terminates, episodes are only cut off by a time limit.
        # info (dict): diagnostic information useful for debugging, sometimes also for learning,
        #   but official evaluations may not use it. This is a typical agent-environment loop.

        th, thdot = self.state  # th := theta

        g = self.g
        m = self.m
        l = self.l
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)

        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
        newth = th + newthdot * dt

        self.state = np.array([newth, newthdot])
        self.renderer.render_step()
        
        # observation, reward, terminated, truncated, info
        return self._get_obs(), -costs, False, False, {}


    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if options is None:
            high = np.array([DEFAULT_X, DEFAULT_Y])
        else:
            # Note that if you use custom reset bounds, it may lead to out-of-bound state/observations.
            x = options.get("x_init") if "x_init" in options else DEFAULT_X
            y = options.get("y_init") if "y_init" in options else DEFAULT_Y
            x = utils.verify_number_and_cast(x)
            y = utils.verify_number_and_cast(y)
            high = np.array([x, y])
        low = -high  # We enforce symmetric limits.
        self.state = self.np_random.uniform(low=low, high=high)
        self.last_u = None

        self.renderer.reset()
        self.renderer.render_step()
        return self._get_obs(), {}

    def _get_obs(self):
        # return the observation: [cos(theta), sin(theta), theta_dot]
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)

    def render(self):
        return self.renderer.get_renders()

    def _render(self, mode="human"):
        assert mode in self.metadata["render_modes"]
        try:
            import pygame
            from pygame import gfxdraw
        except ImportError:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gym[classic_control]`"
            )

        if self.screen is None:
            pygame.init()
            if mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_dim, self.screen_dim)
                )
            else:  # mode in {"rgb_array", "rgb_array_list"}
                self.screen = pygame.Surface((self.screen_dim, self.screen_dim))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        self.surf = pygame.Surface((self.screen_dim, self.screen_dim))
        self.surf.fill((255, 255, 255))

        bound = 2.2
        scale = self.screen_dim / (bound * 2)
        offset = self.screen_dim // 2

        rod_length = 1 * scale
        rod_width = 0.2 * scale
        l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2
        coords = [(l, b), (l, t), (r, t), (r, b)]
        transformed_coords = []
        for c in coords:
            c = pygame.math.Vector2(c).rotate_rad(self.state[0] + np.pi / 2)
            c = (c[0] + offset, c[1] + offset)
            transformed_coords.append(c)
        gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77))
        gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77))

        gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77))
        gfxdraw.filled_circle(
            self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)
        )

        rod_end = (rod_length, 0)
        rod_end = pygame.math.Vector2(rod_end).rotate_rad(self.state[0] + np.pi / 2)
        rod_end = (int(rod_end[0] + offset), int(rod_end[1] + offset))
        gfxdraw.aacircle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )
        gfxdraw.filled_circle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )

        fname = path.join(path.dirname(__file__), "assets/clockwise.png")
        img = pygame.image.load(fname)
        if self.last_u is not None:
            scale_img = pygame.transform.smoothscale(
                img,
                (scale * np.abs(self.last_u) / 2, scale * np.abs(self.last_u) / 2),
            )
            is_flip = bool(self.last_u > 0)
            scale_img = pygame.transform.flip(scale_img, is_flip, True)
            self.surf.blit(
                scale_img,
                (
                    offset - scale_img.get_rect().centerx,
                    offset - scale_img.get_rect().centery,
                ),
            )

        # drawing axle
        gfxdraw.aacircle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
        gfxdraw.filled_circle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))

        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()

        else:  # mode == "rgb_array_list":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        if self.screen is not None:
            import pygame
            pygame.display.quit()
            pygame.quit()
            self.isopen = False


def angle_normalize(x):
    return ((x + np.pi) % (2 * np.pi)) - np.pi  # wrap the angle into [-pi, pi)
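To see what step() above actually computes, here is the same semi-implicit Euler update written out by hand for a single transition. The constants are copied from __init__; the starting state and torque are my own choice:

import numpy as np

g, m, l, dt = 10.0, 1.0, 1.0, 0.05
max_speed, max_torque = 8, 2.0

th, thdot = np.pi / 2, 0.0          # pendulum horizontal, at rest
u = max_torque                      # push with full torque

newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l ** 2) * u) * dt
newthdot = np.clip(newthdot, -max_speed, max_speed)
newth = th + newthdot * dt          # semi-implicit Euler: the position update uses the new velocity
print(newth, newthdot)              # ≈ 1.6233, 1.05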

# Overall structure
class PendulumEnv(gym.Env):
    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        # physical constants (max_speed, max_torque, dt, g, m, l) and rendering settings

        # renderer needed for the graphical display
        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)

        # action space and observation space
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, u):
        # given an action, compute the new state from the previous state and the applied torque
        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        # sample a random initial state
        self.state = self.np_random.uniform(low=low, high=high)
        return self._get_obs(), {}

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)

# DDPG training loop
for i in range(EPISODES):
    s = env.reset()  # reset the environment at the start of every episode
    ep_r = 0
    ddpg.q_values = []  # clear the per-episode log of critic Q estimates

    for j in range(EP_STEPS):
        if RENDER:
            env.render()

        # add explorative noise to action
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), a_low_bound, a_bound)  # add Gaussian exploration noise with standard deviation var, then clip to the action bounds

        s_, r, done, info = env.step(a)  # old 4-tuple gym API; newer gym versions return (obs, reward, terminated, truncated, info)
        ddpg.store_transition(s, a, r / 10, s_)  # store the transition to memory (reward scaled down by 10)
        
        if ddpg.pointer > MEMORY_CAPACITY:
            var *= 0.9995  # decay the exploration controller factor
            ddpg.learn()

        s = s_
        ep_r += r
        if j == EP_STEPS - 1:
            print('Episode: ', i, ' step mean reward: %.4f' % (ep_r/EP_STEPS), 'Explore VAR: %.2f' % var)

            if ddpg.pointer > MEMORY_CAPACITY:
                x_list.append(i)
                ep_list.append(ep_r / EP_STEPS)
                q_values.append(np.mean(ddpg.q_values))

            if ep_r > -300:
                RENDER = True
            break
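The loop above assumes a ddpg object exposing choose_action, store_transition, learn, a pointer counter, and a q_values list. Below is a minimal, self-contained sketch of such an agent: small actor/critic networks like the ones sketched near the top, a replay buffer, and the DDPG update with target networks. The hyperparameters (layer sizes, learning rates, GAMMA, TAU) are illustrative choices of mine, not the referenced implementation:

import numpy as np
import torch
import torch.nn as nn

# Illustrative hyperparameters -- not the referenced implementation.
MEMORY_CAPACITY = 10000   # replay buffer size
BATCH_SIZE = 64
GAMMA = 0.9               # discount factor
TAU = 0.01                # soft-update rate for the target networks
LR_A, LR_C = 1e-3, 2e-3   # actor / critic learning rates

class ANet(nn.Module):    # actor: state -> action in [-a_bound, a_bound]
    def __init__(self, s_dim, a_dim, a_bound):
        super().__init__()
        self.a_bound = a_bound
        self.fc = nn.Sequential(nn.Linear(s_dim, 64), nn.ReLU(), nn.Linear(64, a_dim), nn.Tanh())
    def forward(self, s):
        return self.fc(s) * self.a_bound

class CNet(nn.Module):    # critic: (state, action) -> Q value
    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(s_dim + a_dim, 64), nn.ReLU(), nn.Linear(64, 1))
    def forward(self, s, a):
        return self.fc(torch.cat([s, a], dim=1))

class DDPG:
    def __init__(self, s_dim=3, a_dim=1, a_bound=2.0):
        self.s_dim, self.a_dim = s_dim, a_dim
        # each row stores one transition: (s, a, r, s_)
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0      # how many transitions have been stored so far
        self.q_values = []    # critic estimates, logged for plotting

        self.actor, self.actor_target = ANet(s_dim, a_dim, a_bound), ANet(s_dim, a_dim, a_bound)
        self.critic, self.critic_target = CNet(s_dim, a_dim), CNet(s_dim, a_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.a_opt = torch.optim.Adam(self.actor.parameters(), lr=LR_A)
        self.c_opt = torch.optim.Adam(self.critic.parameters(), lr=LR_C)

    def choose_action(self, s):
        s = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            return self.actor(s).numpy().flatten()

    def store_transition(self, s, a, r, s_):
        row = np.hstack((s, a, [r], s_)).astype(np.float32)
        self.memory[self.pointer % MEMORY_CAPACITY] = row   # overwrite the oldest entry when full
        self.pointer += 1

    def _soft_update(self, target, source):
        # Polyak averaging: target <- (1 - TAU) * target + TAU * source
        for t, p in zip(target.parameters(), source.parameters()):
            t.data.mul_(1 - TAU).add_(TAU * p.data)

    def learn(self):
        # sample a random minibatch (the loop only calls learn() once the buffer is full)
        idx = np.random.randint(0, MEMORY_CAPACITY, BATCH_SIZE)
        batch = torch.as_tensor(self.memory[idx])
        s = batch[:, :self.s_dim]
        a = batch[:, self.s_dim:self.s_dim + self.a_dim]
        r = batch[:, self.s_dim + self.a_dim:self.s_dim + self.a_dim + 1]
        s_ = batch[:, -self.s_dim:]

        # critic update: regress Q(s, a) toward the bootstrapped target r + GAMMA * Q'(s', mu'(s'))
        with torch.no_grad():
            q_target = r + GAMMA * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = nn.functional.mse_loss(q, q_target)
        self.c_opt.zero_grad()
        critic_loss.backward()
        self.c_opt.step()

        # actor update: maximize Q(s, mu(s)) by minimizing its negative
        actor_loss = -self.critic(s, self.actor(s)).mean()
        self.a_opt.zero_grad()
        actor_loss.backward()
        self.a_opt.step()

        # slowly track the online networks with the target networks
        self._soft_update(self.actor_target, self.actor)
        self._soft_update(self.critic_target, self.critic)
        self.q_values.append(q.mean().item())

With this interface, ddpg = DDPG() plugs straight into the loop above, and a_bound = 2.0 / a_low_bound = -2.0 match the Pendulum action space.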

Some DRL references to study more carefully later:
How to choose a deep RL algorithm? (Its conclusion for continuous action spaces: use TD3 if you are good at hyperparameter tuning, otherwise use PPO or SAC; if the environment and reward function were written by a beginner, use PPO.)
Deep RL hyperparameter tuning tips
莫烦 (MorvanZhou)'s tutorial homepage
莫烦 (MorvanZhou)'s DDPG tutorial
