Gym is a toolkit for testing and comparing reinforcement learning algorithms. It makes no assumptions about the structure of your algorithm and can be used together with numerical libraries such as TensorFlow or Theano. The Gym library collects a set of ready-made test environments behind a common interface, which makes it straightforward to get your reinforcement learning algorithm working against them. It also renders the game screen, which helps you write and debug better algorithms.
Basic usage of a Gym environment looks like the following code:
import gym

env = gym.make('CartPole-v0')
for i_episode in range(20):
    observation = env.reset()               # start a new episode
    for t in range(100):
        env.render()                        # draw the current frame
        print(observation)
        action = env.action_space.sample()  # sample a random action
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()
At every step the environment returns four values, observation, reward, done, and info, while the input to the environment is an action.
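To make the interface concrete, the short snippet below (a minimal sketch using the standard CartPole-v0 environment) inspects the action and observation spaces and the types of the four returned values:

import gym

env = gym.make('CartPole-v0')
print(env.action_space)       # Discrete(2): push cart left or right
print(env.observation_space)  # Box with 4 entries: cart position/velocity, pole angle/angular velocity
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())
print(type(observation), type(reward), type(done), type(info))  # ndarray, float, bool, dict
env.close()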
In day-to-day reinforcement learning work, an off-the-shelf Gym environment that fits your problem often does not exist. A user-defined environment can be built to the Gym standard, so that it plugs directly into other reinforcement learning algorithms and interfaces.
The development skeleton for a custom Gym environment is as follows:
import gym
import numpy as np
from gym import spaces

class CustomEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, arg1, arg2, ...):
        super(CustomEnv, self).__init__()
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        # Example for using image as input:
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)

    def step(self, action):
        ...
        return observation, reward, done, info

    def reset(self):
        ...
        return observation  # reward, done, info can't be included

    def render(self, mode='human'):
        ...

    def close(self):
        ...
Of these methods, the first three (__init__, step, and reset) are required.
Once the environment is built, it can be validated with check_env from stable_baselines3:
from stable_baselines3.common.env_checker import check_env
env = CustomEnv(arg1, ...)
# It will check your custom environment and output additional warnings if needed
check_env(env)
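To make the custom class usable with gym.make like the built-in environments, it can also be registered with Gym. The sketch below is a minimal example; the id string 'CustomEnv-v0' and the module path 'my_package.envs' are hypothetical placeholders you would adapt to your own package layout:

import gym
from gym.envs.registration import register

# Register the class under an id so that gym.make('CustomEnv-v0') works.
# 'my_package.envs:CustomEnv' is an illustrative module path, not a real package.
register(
    id='CustomEnv-v0',
    entry_point='my_package.envs:CustomEnv',
    kwargs={'arg1': None},  # constructor arguments forwarded to CustomEnv
)

env = gym.make('CustomEnv-v0')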
Below are two user-defined Gym environments. The first wraps a discrete-time nonlinear system; the second connects the FlexSim simulation software to reinforcement learning:
1. Nonlinear system environment:
import numpy as np

# Define the nonlinear system environment, following the Gym format
import gym
from gym import spaces

class NonLinearEnv(gym.Env):
    """
    Description:
        A discrete-time nonlinear non-affine system.
    Source:
        The paper "Policy Gradient Adaptive Dynamic Programming for
        Data-Based Optimal Control".
    State:
        state1, state2
    Action:
        Single-input system, u
    Reward:
        Negative of the quadratic cost x'Qx + u'Ru
    Initial state:
        x0 = [0.2, 0.7]'
    Episode termination:
        None (infinite-horizon task; done is always False)
    """

    def __init__(self, Q: np.array, R: np.array):
        self.Q = Q
        self.R = R
        self.state = np.array([0.2, 0.7])
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1, ), dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2, ), dtype=np.float64)

    def reset(self):
        self.state = np.array([0.2, 0.7])
        return self.state

    def step(self, action):
        # System dynamics
        next_state_x1 = (self.state[0] + self.state[1]**2 + action) * np.cos(self.state[1])
        next_state_x2 = 0.5 * (self.state[0]**2 + self.state[1] + action) * np.sin(self.state[1])
        next_state = [next_state_x1, next_state_x2]
        # Quadratic cost of the current state and action; the reward is its negative
        reward = np.matrix(self.state)*self.Q*np.matrix(self.state).T + action**2*self.R
        self.state = np.array(next_state).reshape(2, )
        done = False   # infinite-horizon task: the episode never terminates on its own
        info = {}
        return self.state, -float(reward[0][0]), done, info

    def render(self, mode='human'):
        pass
Test code (assuming the class above is saved as NLEnv.py):
from NLEnv import NonLinearEnv
from stable_baselines3.common.env_checker import check_env
import numpy as np
Q = np.matrix([[1, 0],
               [0, 1]])
R = np.matrix([[1]])
env = NonLinearEnv(Q, R)
check_env(env)
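After check_env passes, the environment can be handed directly to any stable_baselines3 algorithm. The following is a minimal sketch; the choice of PPO and the hyperparameters are illustrative, not part of the original code:

from stable_baselines3 import PPO

# Train a PPO agent on the custom environment.
# Since the task is infinite-horizon (done is always False), PPO's
# fixed-length rollouts (n_steps) segment the training data.
model = PPO('MlpPolicy', env, n_steps=256, verbose=1)
model.learn(total_timesteps=10_000)

# Query the learned policy for one action
obs = env.reset()
action, _ = model.predict(obs, deterministic=True)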
2. FlexSim environment:

import gym
import subprocess
import socket
import json
import numpy as np
class FlexSimEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array', 'ansi']}

    def __init__(self, flexsimPath, modelPath, address='localhost', port=5005, verbose=False, visible=False):
        self.flexsimPath = flexsimPath
        self.modelPath = modelPath
        self.address = address
        self.port = port
        self.verbose = verbose
        self.visible = visible
        self.lastObservation = ""
        self._launch_flexsim()
        # The action and observation spaces are queried from the running FlexSim model
        self.action_space = self._get_action_space()
        self.observation_space = self._get_observation_space()

    def reset(self):
        self._reset_flexsim()
        state, reward, done = self._get_observation()
        return state

    def step(self, action):
        self._take_action(action)
        state, reward, done = self._get_observation()
        info = {}
        return state, reward, done, info

    def render(self, mode='human'):
        if mode == 'rgb_array':
            return np.array([0, 0, 0])
        elif mode == 'human':
            print(self.lastObservation)
        elif mode == 'ansi':
            return self.lastObservation
        else:
            super(FlexSimEnv, self).render(mode=mode)

    def close(self):
        self._close_flexsim()

    def seed(self, seed=None):
        self.seedNum = seed
        return self.seedNum

    def _launch_flexsim(self):
        if self.verbose:
            print("Launching " + self.flexsimPath + " " + self.modelPath)
        # Start FlexSim in training mode, pointed at this process's socket
        args = [self.flexsimPath, self.modelPath, "-training", self.address + ':' + str(self.port)]
        if not self.visible:
            args.append("-maintenance")
            args.append("nogui")
        self.flexsimProcess = subprocess.Popen(args)
        self._socket_init(self.address, self.port)

    def _close_flexsim(self):
        self.flexsimProcess.kill()

    def _release_flexsim(self):
        if self.verbose:
            print("Sending StopWaiting message")
        self._socket_send(b"StopWaiting?")

    def _get_action_space(self):
        self._socket_send(b"ActionSpace?")
        if self.verbose:
            print("Waiting for ActionSpace message")
        actionSpaceBytes = self._socket_recv()
        return self._convert_to_gym_space(actionSpaceBytes)

    def _get_observation_space(self):
        self._socket_send(b"ObservationSpace?")
        if self.verbose:
            print("Waiting for ObservationSpace message")
        observationSpaceBytes = self._socket_recv()
        return self._convert_to_gym_space(observationSpaceBytes)

    def _reset_flexsim(self):
        if self.verbose:
            print("Sending Reset message")
        resetString = "Reset?"
        if hasattr(self, "seedNum"):
            resetString = "Reset:" + str(self.seedNum) + "?"
        self._socket_send(resetString.encode())

    def _get_observation(self):
        if self.verbose:
            print("Waiting for Observation message")
        observationBytes = self._socket_recv()
        self.lastObservation = observationBytes.decode('utf-8')
        state, reward, done = self._convert_to_observation(observationBytes)
        return state, reward, done

    def _take_action(self, action):
        actionStr = json.dumps(action, cls=NumpyEncoder)
        if self.verbose:
            print("Sending Action message: " + actionStr)
        actionMessage = "TakeAction:" + actionStr + "?"
        self._socket_send(actionMessage.encode())

    def _socket_init(self, host, port):
        if self.verbose:
            print("Waiting for FlexSim to connect to socket on " + self.address + ":" + str(self.port))
        self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.serversocket.bind((host, port))
        self.serversocket.listen()
        (self.clientsocket, self.socketaddress) = self.serversocket.accept()
        if self.verbose:
            print("Socket connected")
            print("Waiting for READY message")
        message = self._socket_recv()
        if self.verbose:
            print(message.decode('utf-8'))
        if message != b"READY":
            raise RuntimeError("Did not receive READY! message")

    def _socket_send(self, msg):
        # Keep sending until the whole message has gone out
        totalsent = 0
        while totalsent < len(msg):
            sent = self.clientsocket.send(msg[totalsent:])
            if sent == 0:
                raise RuntimeError("Socket connection broken")
            totalsent = totalsent + sent

    def _socket_recv(self):
        # Messages are terminated with '!'; read chunks until it appears
        chunks = []
        while True:
            chunk = self.clientsocket.recv(2048)
            if chunk == b'':
                raise RuntimeError("Socket connection broken")
            if chunk[-1] == ord('!'):
                chunks.append(chunk[:-1])
                break
            else:
                chunks.append(chunk)
        return b''.join(chunks)

    def _convert_to_gym_space(self, spaceBytes):
        # The space string has the form Type(params), with params in JSON
        paramsStartIndex = spaceBytes.index(ord('('))
        paramsEndIndex = spaceBytes.index(ord(')'), paramsStartIndex)
        spaceType = spaceBytes[:paramsStartIndex]
        params = json.loads(spaceBytes[paramsStartIndex+1:paramsEndIndex])
        if spaceType == b'Discrete':
            return gym.spaces.Discrete(params)
        elif spaceType == b'Box':
            return gym.spaces.Box(np.array(params[0]), np.array(params[1]))
        elif spaceType == b'MultiDiscrete':
            return gym.spaces.MultiDiscrete(params)
        elif spaceType == b'MultiBinary':
            return gym.spaces.MultiBinary(params)
        raise RuntimeError("Could not parse gym space string")

    def _convert_to_observation(self, spaceBytes):
        observation = json.loads(spaceBytes)
        state = observation["state"]
        if isinstance(state, list):
            state = np.array(observation["state"])
        reward = observation["reward"]
        done = (observation["done"] == 1)
        return state, reward, done


class NumpyEncoder(json.JSONEncoder):
    # json.dumps cannot serialize numpy types directly; convert them first
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
def main():
    env = FlexSimEnv(
        flexsimPath="C:/Program Files/FlexSim 2022/program/flexsim.exe",
        modelPath="./ChangeoverTimesRL.fsm",
        verbose=True,
        visible=True
    )
    for i in range(2):
        env.seed(i)
        observation = env.reset()
        env.render()
        done = False
        rewards = []
        while not done:
            action = env.action_space.sample()  # random policy for smoke-testing the connection
            observation, reward, done, info = env.step(action)
            env.render()
            rewards.append(reward)
            if done:
                cumulative_reward = sum(rewards)
                print("Reward: ", cumulative_reward, "\n")
    env._release_flexsim()
    input("Waiting for input to close FlexSim...")
    env.close()

if __name__ == "__main__":
    main()
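Once the socket connection works with the random policy above, the same environment can be handed to a stable_baselines3 algorithm. The following sketch is illustrative only: the paths are machine-specific, and PPO with its default hyperparameters is my choice, not part of the original code:

from stable_baselines3 import PPO

env = FlexSimEnv(
    flexsimPath="C:/Program Files/FlexSim 2022/program/flexsim.exe",  # adjust to your install
    modelPath="./ChangeoverTimesRL.fsm",
    verbose=False,
    visible=False   # run headless during training
)
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10_000)
model.save("flexsim_ppo")
env._release_flexsim()
env.close()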