Monte Carlo method (Maze code implementation)

First, we add a random-sampling method to the Maze class from the previous post. A simple scheme is used: a U(0,1) draw is scaled by the number of choices to pick a starting state and an action uniformly at random, i.e. episodes are generated under a uniformly random policy (plain on-policy sampling; importance sampling is not involved here).
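
To see this concretely, here is a tiny standalone sketch (the action names are placeholders, not the Maze's actual actions): scaling a U(0,1) draw by the length of a list is just a hand-rolled way of picking an element uniformly, equivalent to random.choice.

import random

actions = ['up', 'down', 'left', 'right']   # placeholder action set, for illustration only

# scale a U(0,1) draw into an integer index, as the Maze code below does
a1 = actions[int(random.random() * len(actions))]

# equivalent, more idiomatic form
a2 = random.choice(actions)
print(a1, a2)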

# Continuation of the Maze class (in env_maze.py; `import random` is assumed at the top of that file)
    # Sample episodes under a uniformly random policy
    def gen_randompi_sample(self, num):
        state_sample  = []
        action_sample = []
        reward_sample = []
        for _ in range(num):
            s_tmp = []
            a_tmp = []
            r_tmp = []

            # start each episode from a uniformly random state
            s = self.states[int(random.random() * len(self.states))]
            t = False
            while not t:
                # pick a random action among those available in s
                actions = self.state_action(s)
                a = actions[int(random.random() * len(actions))]

                # take one step; the first return value is not used here
                _, s1, r = self.transform(s, a)
                s_tmp.append(s)
                r_tmp.append(r)
                a_tmp.append(a)
                s = s1
                t = self.is_done(s1)
            state_sample.append(s_tmp)
            reward_sample.append(r_tmp)
            action_sample.append(a_tmp)

        return state_sample, action_sample, reward_sample
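
As a quick usage sketch of what the sampler returns (the states, actions, and rewards shown in the comments are invented; the real values depend on the maze layout and action names):

# Illustrative only: draw a handful of episodes and inspect their shape.
from env_maze import Maze

env = Maze()
states, actions, rewards = env.gen_randompi_sample(5)

print(len(states))      # 5 -- one list per episode
print(states[0])        # e.g. [(0, 0), (0, 1), (1, 1)]    (hypothetical states)
print(actions[0])       # e.g. ['right', 'down', 'down']   (hypothetical actions)
print(rewards[0])       # e.g. [0.0, 0.0, 1.0]             (hypothetical rewards)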

Implementation and running of the MC method

from env_maze import Maze

class MC:
    # param state_sample  : list of state sequences, one per episode
    # param action_sample : list of action sequences, one per episode
    # param reward_sample : list of reward sequences, one per episode
    # Policy evaluation from a batch of {state-action-reward} episodes

    def mc(self, gamma, state_sample, action_sample, reward_sample, mdp_states):
        # initialize value v = 0 and visit count n = 0 for every state
        vfunc = dict()
        nfunc = dict()

        for s in mdp_states:
            vfunc[self.encode_state(s)] = 0.0
            nfunc[self.encode_state(s)] = 0.0

        # accumulate returns episode by episode
        for iter1 in range(len(state_sample)):
            G = 0.0
            for step in range(len(state_sample[iter1]) - 1, -1, -1):  # backward pass: return G_0 of the episode
                G *= gamma
                G += reward_sample[iter1][step]
            for step in range(len(state_sample[iter1])):  # forward pass: credit G_t to each visited state
                s = state_sample[iter1][step]
                vfunc[self.encode_state(s)] += G
                nfunc[self.encode_state(s)] += 1.0
                G -= reward_sample[iter1][step]
                G /= gamma
        for s in mdp_states:  # average the accumulated returns
            if nfunc[self.encode_state(s)] > 0.000001:
                vfunc[self.encode_state(s)] /= nfunc[self.encode_state(s)]

        return vfunc

    # Encode a state (e.g. a (row, col) pair) into a string key
    def encode_state(self, state):
        return "%d_%d" % (state[0], state[1])

if __name__ == "__main__":
    env = Maze()
    worker = MC()

    state_sample, action_sample, reward_sample = env.gen_randompi_sample(10000)
    vfunc = worker.mc(0.2, state_sample, action_sample, reward_sample, env.states)
    print(vfunc)
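
A note on the two inner loops in mc(): the backward loop builds the discounted return of the whole episode, G_0 = r_0 + gamma*r_1 + gamma^2*r_2 + ..., and the forward loop then credits G_t to the state visited at step t, recovering each successive return from the recursion G_t = r_t + gamma*G_{t+1}, i.e. G_{t+1} = (G_t - r_t) / gamma. A small standalone sanity check of this recursion with made-up numbers:

# Sanity check of the return recursion used in mc(), with invented rewards.
gamma = 0.5
rewards = [1.0, 2.0, 3.0]

# backward pass: full-episode return G_0 = 1 + 0.5*2 + 0.25*3 = 2.75
G = 0.0
for r in reversed(rewards):
    G = gamma * G + r

# forward pass: peel one step off at a time, yielding G_0, G_1, G_2
for r in rewards:
    print(G)                # prints 2.75, then 3.5, then 3.0
    G = (G - r) / gamma     # G_{t+1} = (G_t - r_t) / gamma

Also note that the script evaluates with gamma = 0.2, so rewards more than a few steps away contribute very little to a state's value; the averaged vfunc is the Monte Carlo estimate of the state-value function under the uniformly random policy.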

(To be continued: next we will try sampling with the Metropolis-Hastings algorithm.)
