OpenAI Gym is the most popular experimentation environment for reinforcement learning, and its interface has to some extent become a de facto standard: many algorithm implementations are built on top of Gym, and new scenarios are commonly wrapped behind the Gym interface. This layer of abstraction decouples the algorithm from the environment, so the two can be combined freely. Gym, however, exposes a Python interface; an RL algorithm written in C++ cannot talk to it directly. One option is a cross-process design: one process runs the Python environment, another runs the RL algorithm, and the interaction data is serialized, exchanged over IPC, and deserialized on the other side. The other option is a single-process design: Gym and the RL algorithm run in the same process and are connected through a Python binding. This post takes the second route and uses pybind11 as the bridge, so that Gym and a C++ RL algorithm can communicate within a single process.
For the C++ machine learning framework we use LibTorch, provided by PyTorch. Among today's mainstream training frameworks it has comparatively good C++ support and is straightforward to install; see the official guide INSTALLING C++ DISTRIBUTIONS OF PYTORCH. The official samples include reinforce.py, an example of the REINFORCE algorithm (old but classic), which we take as our starting point.
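As a quick refresher (standard textbook material rather than anything specific to this post), REINFORCE performs gradient ascent on the expected return using Monte-Carlo estimates of the policy gradient,

\nabla_\theta J(\theta) \approx \sum_t G_t \, \nabla_\theta \log \pi_\theta(a_t \mid s_t), \qquad G_t = \sum_{k \ge t} \gamma^{\,k-t} r_k ,

which is exactly what the C++ training code later in this post does: it accumulates per-step log-probabilities and discounted returns, normalizes the returns for variance reduction, and minimizes the loss -\sum_t \hat{G}_t \log \pi_\theta(a_t \mid s_t).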
Let's start with the Python side, which makes a few small changes to the original sample. The calls into the RL algorithm are abstracted in an RLWrapper class, which will later be bound to C++. At construction it receives descriptions of the gym environment's observation and action spaces; reset() notifies the agent that the environment has been reset and passes in the initial state; act() produces an action from the current state according to the policy; update() feeds back the reward so the policy parameters can be learned; episode_finish() is called at the end of an episode to trigger training on the collected trajectory.
...
def state_space_desc(space):
    if isinstance(space, gym.spaces.Box):
        assert type(space.shape) == tuple
        return dict(stype='Box', dtype=str(space.dtype), shape=space.shape)
    else:
        raise NotImplementedError('unknown state space {}'.format(space))


def action_space_desc(space):
    if isinstance(space, gym.spaces.Discrete):
        return dict(stype='Discrete', dtype=str(space.dtype), shape=(space.n,))
    else:
        raise NotImplementedError('unknown action space {}'.format(space))


def main(args):
    env = gym.make(args.env)
    env.seed(args.seed)
    # hand the space descriptions to the C++ agent behind the binding
    agent = nativerl.RLWrapper(state_space_desc(env.observation_space),
                               action_space_desc(env.action_space))
    running_reward = 10
    for i in range(args.epoch):
        obs = env.reset()
        ep_reward = 0
        agent.reset(obs)
        for t in range(1, args.step):
            if args.render:
                env.render()
            action = agent.act(obs)                     # query the policy for an action
            obs, reward, done, info = env.step(action)
            agent.update(reward, done)                  # feed the reward back for learning
            ep_reward += reward
            if done:
                break
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        agent.episode_finish()                          # triggers training on this episode
        if i % args.log_itv == 0:
            print("Episode {}\t Last reward: {:.2f}\t step: {}\t Average reward: {:.2f}"
                  .format(i, ep_reward, t, running_reward))
        if env.spec.reward_threshold and running_reward > env.spec.reward_threshold:
            print("Solved. Running reward: {}, Last reward: {}".format(running_reward, t))
            break
    env.close()
Next comes the Python binding of RLWrapper, which mainly converts Python objects into C++ data structures.
...
namespace py = pybind11;

class RLWrapper {
public:
    RLWrapper(const py::dict& state_space, const py::dict& action_space) {
        spdlog::set_level(spdlog::level::info);
        torch::manual_seed(nrl::kSeed);
        // convert the python-side space descriptions into C++ structs
        nrl::SpaceDesc ss;
        nrl::SpaceDesc as;
        ss.stype = py::cast<std::string>(state_space["stype"]);
        as.stype = py::cast<std::string>(action_space["stype"]);
        ss.dtype = py::cast<std::string>(state_space["dtype"]);
        as.dtype = py::cast<std::string>(action_space["dtype"]);
        py::tuple shape;
        shape = py::cast<py::tuple>(state_space["shape"]);
        for (const auto& item : shape) {
            ss.shape.push_back(py::cast<int64_t>(item));
        }
        shape = py::cast<py::tuple>(action_space["shape"]);
        for (const auto& item : shape) {
            as.shape.push_back(py::cast<int64_t>(item));
        }
        mStateSpaceDesc = ss;
        mActionSpaceDesc = as;
        mAgent = std::make_shared<nrl::Reinforce>(ss, as);
    }

    void reset(py::array_t<float, py::array::c_style | py::array::forcecast> state) {
        py::buffer_info buf = state.request();
        float* pbuf = static_cast<float*>(buf.ptr);
        assert(buf.shape == mStateSpaceDesc.shape);
        mAgent->reset(nrl::Blob{pbuf, mStateSpaceDesc.shape});
    }

    py::object act(py::array_t<float, py::array::c_style | py::array::forcecast> state) {
        py::buffer_info buf = state.request();
        float* pbuf = static_cast<float*>(buf.ptr);
        assert(buf.shape == mStateSpaceDesc.shape);
        // run the policy and hand the (discrete) action back to python as an int
        torch::Tensor action = mAgent->act(nrl::Blob{pbuf, mStateSpaceDesc.shape}).contiguous().cpu();
        return py::int_(action.item<long>());
    }

    void update(float reward, bool done) {
        mAgent->update(reward, done);
    }

    void episode_finish() {
        spdlog::trace("{}", __func__);
        mAgent->onEpisodeFinished();
    }

    ~RLWrapper() {
    }

private:
    nrl::SpaceDesc mStateSpaceDesc;
    nrl::SpaceDesc mActionSpaceDesc;
    std::shared_ptr<nrl::RLBase> mAgent;
};

PYBIND11_MODULE(nativerl, m) {
    py::class_<RLWrapper>(m, "RLWrapper")
        .def(py::init<const py::dict &, const py::dict &>())
        .def("reset", &RLWrapper::reset)
        .def("episode_finish", &RLWrapper::episode_finish)
        .def("act", &RLWrapper::act)
        .def("update", &RLWrapper::update);
}
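The nrl::SpaceDesc and nrl::Blob types used above are not listed in this post. Judging from how they are used (stype/dtype/shape fields, and a raw pointer plus a shape), they are presumably simple aggregates along the following lines; take this as an assumption rather than the actual headers:

namespace nrl {
// Sketch only: reconstructed from usage, not the actual nrl headers.
struct SpaceDesc {
    std::string stype;            // e.g. "Box" or "Discrete"
    std::string dtype;            // numpy dtype name, e.g. "float32"
    std::vector<int64_t> shape;   // observation shape, or {n} for Discrete(n)
};

struct Blob {                     // non-owning view of a C-contiguous float buffer
    float* pbuf;
    std::vector<int64_t> shape;
};
}  // namespace nrl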
You could call this the glue layer between Python and C++. The real work is delegated to the RLBase class, an abstract class that defines a handful of basic reinforcement-learning interfaces (its declaration is not listed here; a rough sketch follows the next listing). The REINFORCE algorithm is implemented in its derived class Reinforce:
...
class Reinforce : public RLBase {
public:
    Reinforce(const SpaceDesc& ss, const SpaceDesc& as)
        : mPolicy(std::make_shared<Policy>(ss, as, mDevice)) {
        mPolicy->to(mDevice);
        mRewards = torch::zeros({mCapacity}, torch::TensorOptions(mDevice));
        mReturns = torch::zeros({mCapacity}, torch::TensorOptions(mDevice));
        mOptimizer = std::make_shared<torch::optim::Adam>(mPolicy->parameters(),
                                                          torch::optim::AdamOptions(mInitLR));
    }

    virtual torch::Tensor act(const Blob& s) override {
        auto state = torch::from_blob(s.pbuf, s.shape).unsqueeze(0).to(mDevice);
        torch::Tensor action;
        torch::Tensor logProb;
        std::tie(action, logProb) = mPolicy->act(state);
        mLogProbs.push_back(logProb);   // kept for the policy-gradient update
        return action;
    }

    virtual void update(float r, __attribute__((unused)) bool done) override {
        mRewards[mSize++] = r;
        if (mSize >= mCapacity) {
            spdlog::info("buffer has been full, call train()");
            train();
        }
    }

    virtual void onEpisodeFinished() override {
        train();
    }

private:
    void train() {
        spdlog::trace("{}: buffer size = {}", __func__, mSize);
        // discounted returns, computed backwards over the episode
        for (auto i = mSize - 1; i >= 0; --i) {
            if (i == (mSize - 1)) {
                mReturns[i] = mRewards[i];
            } else {
                mReturns[i] = mReturns[i + 1] * mGamma + mRewards[i];
            }
        }
        auto returns = mReturns.slice(0, 0, mSize);
        returns = (returns - returns.mean()) / (returns.std() + kEps);
        auto logprobs = torch::cat(mLogProbs);
        mOptimizer->zero_grad();
        auto policy_loss = -(logprobs * returns).sum();
        policy_loss.backward();
        mOptimizer->step();
        // sum only the rewards collected this round, before the buffer is reset
        auto epReward = mRewards.slice(0, 0, mSize).sum().item<float>();
        mLogProbs.clear();
        mSize = 0;
        ++mCount;
        spdlog::debug("{} : episode {}: loss = {}, accumulated reward = {}",
                      __func__, mCount, policy_loss.item<float>(), epReward);
    }

    std::shared_ptr<Policy> mPolicy;
    torch::Tensor mRewards;
    std::vector<torch::Tensor> mLogProbs;
    torch::Tensor mReturns;
    int32_t mSize{0};
    int32_t mCapacity{kExpBufferCap};
    std::shared_ptr<torch::optim::Adam> mOptimizer;
    uint32_t mCount{0};
    float mGamma{0.99};
    float mInitLR{1e-2};
};
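The RLBase base class itself is not listed either. From the way it is used (held as std::shared_ptr<nrl::RLBase> in the binding, overridden methods act() and onEpisodeFinished() above, and an mDevice member referenced in the constructor), a minimal sketch could look roughly like this; again an assumption, not the actual declaration:

namespace nrl {
// Sketch only: inferred from the calls made through the base pointer above.
class RLBase {
public:
    virtual torch::Tensor act(const Blob& state) = 0;
    virtual void update(float reward, bool done) = 0;
    virtual void reset(const Blob& /*state*/) {}   // default: agents that keep no episode state ignore it
    virtual void onEpisodeFinished() {}
    virtual ~RLBase() = default;

protected:
    // shared by derived agents; consistent with the "Training on GPU (CUDA)" log below
    torch::Device mDevice{torch::cuda::is_available() ? torch::kCUDA : torch::kCPU};
};
}  // namespace nrl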
The scenario in the sample is CartPole, which is simple, so the policy function is implemented as a plain MLP; for more complex scenarios it can be replaced with a more elaborate network (a possible CNN variant is sketched after the listing below).
...
class Net : public nn::Module {
public:
    virtual std::tuple<Tensor, Tensor> forward(Tensor x) = 0;
    virtual ~Net() = default;
};

class MLP : public Net {
public:
    MLP(int64_t inputSize, int64_t actionNum) {
        mFC1 = register_module("fc1", nn::Linear(inputSize, mHiddenSize));
        mAction = register_module("action", nn::Linear(mHiddenSize, actionNum));
        mValue = register_module("value", nn::Linear(mHiddenSize, actionNum));  // value head, unused by REINFORCE
    }

    virtual std::tuple<Tensor, Tensor> forward(Tensor x) override {
        x = mFC1->forward(x);
        x = dropout(x, 0.6, is_training());
        x = relu(x);
        return std::make_tuple(mAction->forward(x), mValue->forward(x));
    }

private:
    nn::Linear mFC1{nullptr};
    nn::Linear mAction{nullptr};
    nn::Linear mValue{nullptr};
    int64_t mHiddenSize{128};
};

class Policy : public torch::nn::Module {
public:
    Policy(const SpaceDesc& ss, const SpaceDesc& as, torch::Device device)
        : mActionSpaceType(as.stype),
          mActionNum(as.shape[0]),
          mGen(kSeed),
          mUniformDist(0, 1.0) {
        if (ss.shape.size() == 1) {
            mNet = std::make_shared<MLP>(ss.shape[0], as.shape[0]);
        } else {
            mNet = std::make_shared<CNN>(ss.shape, as.shape[0]);
        }
        mNet->to(device);
        register_module("base", mNet);
        // uniform distribution over actions, used for epsilon-greedy exploration
        torch::Tensor logits = torch::ones({1, as.shape[0]}, torch::TensorOptions(device));
        mUniformCategorical = std::make_shared<Categorical>(nullptr, &logits);
    }

    torch::Tensor forward(torch::Tensor x) {
        x = std::get<0>(mNet->forward(x));
        return torch::softmax(x, 1);
    }

    std::tuple<torch::Tensor, torch::Tensor> act(torch::Tensor state) {
        auto output = forward(state);
        std::shared_ptr<Distribution> dist;
        if (mActionSpaceType == "Discrete") {
            dist = std::make_shared<Categorical>(&output);
        } else {
            throw std::logic_error("Not implemented : action space");
        }
        // epsilon-greedy: occasionally take a uniformly random action instead of sampling the policy
        float rnd = mUniformDist(mGen);
        float threshold = kEpsEnd + (kEpsStart - kEpsEnd) * exp(-1. * mStep / kEpsDecay);
        ++mStep;
        torch::Tensor action;
        if (rnd > threshold) {
            torch::NoGradGuard no_grad;
            action = dist->sample();
        } else {
            torch::NoGradGuard no_grad;
            action = mUniformCategorical->sample({1}).squeeze(-1);
        }
        auto log_probs = dist->log_prob(action);
        return std::make_tuple(action, log_probs);
    }

private:
    std::string mActionSpaceType;
    int32_t mActionNum;
    int64_t mHiddenSize{128};
    std::shared_ptr<Net> mNet;
    uint64_t mStep{0};
    std::mt19937 mGen;
    std::uniform_real_distribution<float> mUniformDist;
    std::shared_ptr<Categorical> mUniformCategorical;
};
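The CNN branch selected in the Policy constructor for multi-dimensional observations is also not listed in this post. A minimal sketch that satisfies the Net interface above might look like this; it is entirely an assumption, and the layer sizes are placeholders:

class CNN : public Net {
public:
    CNN(const std::vector<int64_t>& inputShape, int64_t actionNum) {
        // inputShape is assumed to be {C, H, W}
        int64_t channels = inputShape[0];
        mConv1 = register_module("conv1",
            torch::nn::Conv2d(torch::nn::Conv2dOptions(channels, 16, 3).stride(2)));
        mConv2 = register_module("conv2",
            torch::nn::Conv2d(torch::nn::Conv2dOptions(16, 32, 3).stride(2)));
        mAction = register_module("action", torch::nn::Linear(32, actionNum));
        mValue = register_module("value", torch::nn::Linear(32, 1));  // value head, unused by REINFORCE
    }

    virtual std::tuple<torch::Tensor, torch::Tensor> forward(torch::Tensor x) override {
        x = torch::relu(mConv1->forward(x));
        x = torch::relu(mConv2->forward(x));
        // pool to [N, 32, 1, 1] and flatten so the heads do not depend on H/W
        x = torch::adaptive_avg_pool2d(x, {1, 1}).flatten(1);
        return std::make_tuple(mAction->forward(x), mValue->forward(x));
    }

private:
    torch::nn::Conv2d mConv1{nullptr};
    torch::nn::Conv2d mConv2{nullptr};
    torch::nn::Linear mAction{nullptr};
    torch::nn::Linear mValue{nullptr};
};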
The Categorical class used here implements the computations of a categorical distribution; it can be rewritten in C++ by following the Python implementation in PyTorch (torch.distributions.Categorical).
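For reference, here is a minimal sketch of what such a class might look like, limited to the two operations Policy::act() needs (sampling and log_prob) and to the probs/logits constructor forms used above; the actual class in the repository may be more complete:

// Sketch only: a minimal C++ counterpart of torch.distributions.Categorical.
class Distribution {
public:
    virtual torch::Tensor sample(const std::vector<int64_t>& sample_shape = {}) = 0;
    virtual torch::Tensor log_prob(torch::Tensor value) = 0;
    virtual ~Distribution() = default;
};

class Categorical : public Distribution {
public:
    // Pass either probs or logits, mirroring the Python constructor.
    Categorical(const torch::Tensor* probs, const torch::Tensor* logits = nullptr) {
        if (probs != nullptr) {
            mProbs = *probs / probs->sum(-1, /*keepdim=*/true);
            mLogits = torch::log(mProbs);
        } else {
            mLogits = torch::log_softmax(*logits, -1);
            mProbs = torch::softmax(*logits, -1);
        }
    }

    torch::Tensor sample(const std::vector<int64_t>& sample_shape = {}) override {
        int64_t n = 1;
        for (auto d : sample_shape) n *= d;
        // one draw per batch row -> [batch, n]
        auto idx = torch::multinomial(mProbs, n, /*replacement=*/true);
        // prepend the sample dims to the batch dim, as PyTorch does
        std::vector<int64_t> out_shape(sample_shape);
        out_shape.push_back(mProbs.size(0));
        return idx.transpose(0, 1).reshape(out_shape);
    }

    torch::Tensor log_prob(torch::Tensor value) override {
        // gather log pi(a) for the chosen action indices
        return mLogits.gather(-1, value.unsqueeze(-1)).squeeze(-1);
    }

private:
    torch::Tensor mProbs;
    torch::Tensor mLogits;
};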
Finally, compile the C++ implementation above into a shared library (.so). Add something along the following lines to CMakeLists.txt, adjusted to your setup:
...
set(CMAKE_CXX_STANDARD 11)
find_package(Torch REQUIRED)
set(NRL_INCLUDE_DIRS
src
${TORCH_INCLUDE_DIRS})
file(GLOB NRL_SOURCES1 "src/*.cpp")
list(APPEND NRL_SOURCES ${NRL_SOURCES1})
message(STATUS "sources: ${NRL_SOURCES}")
add_subdirectory(third_party/pybind11)
add_subdirectory(third_party/spdlog)
pybind11_add_module(nativerl ${NRL_SOURCES})
target_include_directories(nativerl PRIVATE ${NRL_INCLUDE_DIRS})
target_link_libraries(nativerl PRIVATE spdlog::spdlog ${TORCH_LIBRARIES})
...
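Note that find_package(Torch REQUIRED) must be able to locate the unpacked LibTorch distribution, which is usually done by passing CMAKE_PREFIX_PATH at configure time; a typical out-of-source build (the libtorch path below is a placeholder) looks like:
mkdir -p build && cd build
cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
make -j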
Assuming the compiled .so ends up under the build directory and the Python script is example/simple.py, training can be started with:
PYTHONPATH=./build python -m example.simple
If everything works, you should see training logs and results similar to the following, essentially consistent with the Python version.
[2019-06-22 13:42:22.533] [info] state space type:Box shape size:1
[2019-06-22 13:42:22.534] [info] action space type:Discrete, shape size:1
[2019-06-22 13:42:22.534] [info] Training on GPU (CUDA)
Episode 0 Last reward: 29.00 step: 29 Average reward: 10.95
Episode 10 Last reward: 17.00 step: 17 Average reward: 14.73
Episode 20 Last reward: 12.00 step: 12 Average reward: 17.40
Episode 30 Last reward: 15.00 step: 15 Average reward: 24.47
Episode 40 Last reward: 18.00 step: 18 Average reward: 26.22
Episode 50 Last reward: 18.00 step: 18 Average reward: 23.69
Episode 60 Last reward: 72.00 step: 72 Average reward: 30.21
Episode 70 Last reward: 19.00 step: 19 Average reward: 28.83
Episode 80 Last reward: 29.00 step: 29 Average reward: 32.13
Episode 90 Last reward: 15.00 step: 15 Average reward: 29.64
Episode 100 Last reward: 30.00 step: 30 Average reward: 27.88
Episode 110 Last reward: 12.00 step: 12 Average reward: 26.14
Episode 120 Last reward: 28.00 step: 28 Average reward: 26.32
Episode 130 Last reward: 11.00 step: 11 Average reward: 31.20
Episode 140 Last reward: 112.00 step: 112 Average reward: 35.26
Episode 150 Last reward: 40.00 step: 40 Average reward: 37.14
Episode 160 Last reward: 40.00 step: 40 Average reward: 36.84
Episode 170 Last reward: 15.00 step: 15 Average reward: 41.91
Episode 180 Last reward: 63.00 step: 63 Average reward: 49.78
Episode 190 Last reward: 21.00 step: 21 Average reward: 44.70
Episode 200 Last reward: 46.00 step: 46 Average reward: 41.83
Episode 210 Last reward: 80.00 step: 80 Average reward: 51.55
Episode 220 Last reward: 151.00 step: 151 Average reward: 57.82
Episode 230 Last reward: 176.00 step: 176 Average reward: 62.80
Episode 240 Last reward: 19.00 step: 19 Average reward: 63.17
Episode 250 Last reward: 134.00 step: 134 Average reward: 74.02
Episode 260 Last reward: 46.00 step: 46 Average reward: 71.35
Episode 270 Last reward: 118.00 step: 118 Average reward: 85.88
Episode 280 Last reward: 487.00 step: 487 Average reward: 112.74
Episode 290 Last reward: 95.00 step: 95 Average reward: 139.41
Episode 300 Last reward: 54.00 step: 54 Average reward: 149.20
Episode 310 Last reward: 417.00 step: 417 Average reward: 138.42
Episode 320 Last reward: 500.00 step: 500 Average reward: 179.29
Episode 330 Last reward: 71.00 step: 71 Average reward: 195.88
Episode 340 Last reward: 309.00 step: 309 Average reward: 216.82
Episode 350 Last reward: 268.00 step: 268 Average reward: 214.21
Episode 360 Last reward: 243.00 step: 243 Average reward: 210.89
Episode 370 Last reward: 266.00 step: 266 Average reward: 200.03
Episode 380 Last reward: 379.00 step: 379 Average reward: 220.06
Episode 390 Last reward: 500.00 step: 500 Average reward: 316.20
Episode 400 Last reward: 500.00 step: 500 Average reward: 369.46
Episode 410 Last reward: 500.00 step: 500 Average reward: 421.84
Episode 420 Last reward: 500.00 step: 500 Average reward: 453.20
Episode 430 Last reward: 500.00 step: 500 Average reward: 471.98
Solved. Running reward: 475.9764491024681, Last reward: 500