# In actual use, you can mount a volume to persist the data you want to keep from the container (more on this later)
nvidia-docker run -itd --name stable_baselines --network host dm_control:py38_mujoco210 bash
# Enter the container as user pamirl
docker exec -it -u pamirl stable_baselines bash
cd ~
conda activate deep_rl
pip install stable-baselines3[extra]
pip install pyglet
python test_mujoco.py
# test_mujoco.py
import mujoco_py
import os
mj_path = mujoco_py.utils.discover_mujoco()
xml_path = os.path.join(mj_path, 'model', 'humanoid.xml')
model = mujoco_py.load_model_from_path(xml_path)
sim = mujoco_py.MjSim(model)
print(sim.data.qpos)
sim.step()
print(sim.data.qpos)
The output looks like this:
5. Next, check whether CUDA works: python test_cuda.py
# test_cuda.py
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # whether CUDA is available
print(torch.cuda.device_count())  # number of visible GPUs
print(torch.cuda.get_device_name(0))  # name of GPU 0
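If those three checks pass, it is also worth running a tiny computation on the GPU to confirm that CUDA kernels actually execute inside the container. A minimal sketch (the file name test_cuda_compute.py is just a suggestion):
# test_cuda_compute.py (hypothetical follow-up check)
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.rand(1024, 1024, device=device)  # allocate directly on the selected device
y = x @ x  # a matmul forces a real kernel launch on the GPU
print(y.device, float(y.sum()))  # prints cuda:0 if the GPU path works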
python test_stable_baselines.py
# test_stable_baselines.py
import gym
from stable_baselines3 import PPO
env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    # env.render()
    if done:
        obs = env.reset()
env.close()
The training result after 100,000 steps looks like this:
7. Run PPO on HalfCheetah-v3 for one million steps; the result looks like this:
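The exact training script for that run isn't included here; below is a minimal sketch of what it looks like, assuming default PPO hyperparameters and the MuJoCo setup verified above (the file name test_halfcheetah.py is made up):
# test_halfcheetah.py (hypothetical)
import gym
from stable_baselines3 import PPO
env = gym.make("HalfCheetah-v3")  # uses the mujoco_py install tested earlier
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1_000_000)  # the one-million-step run mentioned above
env.close()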
Check the setup.py of stable-baselines3 on GitHub and compare the versions of the key dependencies:
conda list | grep gym
The gym version inside the container happens to satisfy the requirement, and PyTorch goes without saying, since 1.10.1 is exactly what was installed in the first place!
conda list | grep atari
pip show stable-baselines3  # view the package metadata
cd /home/pamirl/miniconda3/envs/deep_rl/lib/python3.8/site-packages && tree -L 2  # view the package layout
pip3 install pybullet --upgrade --user
Pleasantly, it turns out that using PyBullet environments is also very simple:
# python test_pybullet.py
import os
import gym
import pybullet_envs
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3 import PPO
env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
# Automatically normalize the input features and reward
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
model = PPO('MlpPolicy', env)
model.learn(total_timesteps=2000)
print("finish learning")
# Don't forget to save the VecNormalize statistics when saving the agent
log_dir = "/tmp/"
model.save(log_dir + "ppo_halfcheetah")
print("saving model at {0}".format(log_dir))
stats_path = os.path.join(log_dir, "vec_normalize.pkl")
env.save(stats_path)
print("saving env at {}".format(stats_path))
# To demonstrate loading
del model, env
# Load the saved statistics
env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
print("loading env and model")
env = VecNormalize.load(stats_path, env)
# do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False
# Load the agent
model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
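To sanity-check the reloaded agent, it can be evaluated on the frozen, normalized env; a short sketch using stable-baselines3's evaluate_policy helper (the episode count is arbitrary):
from stable_baselines3.common.evaluation import evaluate_policy
# Evaluate the reloaded agent on the normalized (but no longer updating) env
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))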
If you want to use Atari environments, you also need to install the ROMs. For completeness:
wget http://www.atarimania.com/roms/Roms.rar
unrar e Roms.rar ~/atari_roms/
python -m atari_py.import_roms ~/atari_roms
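After importing the ROMs, a quick way to confirm they are picked up is to build an Atari env through the stable-baselines3 helpers; a minimal smoke-test sketch (environment id, algorithm, and step count chosen arbitrarily):
# test_atari.py (hypothetical)
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)  # fails here if the ROMs are missing
env = VecFrameStack(env, n_stack=4)  # stack 4 frames, the usual Atari preprocessing
model = A2C("CnnPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)  # a short run just to verify everything is wired up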
In fact, all we did on top of the original image was pip install stable-baselines3[extra] pyglet pybullet, plus maybe a git clone of RL Baselines3 Zoo. Well, back to the grind!
Next time, when I have time, I'll write up the workflow for developing RL algorithms remotely in Jupyter Notebook in detail.
https://github.com/DLR-RM/stable-baselines3
https://github.com/DLR-RM/rl-baselines3-zoo
https://github.com/bulletphysics/bullet3