Model-Based Dynamic Programming Theory: Policy Iteration and Value Iteration (maze code implementation)

Straight to the code.
env_maze.py
The maze environment, revised; the one I wrote last time was far too messy.
Each state needs its own action space. At position [2,1], for example, the action space cannot be [up, down, left, right] but only [left, right]; otherwise the two nonexistent actions interfere with the policy search and the values converge to a wrong result.
I previously tried returning -1 for hitting a wall or stepping out of bounds, but in policy iteration the initial policy is uniformly random, so $\pi_s$ may start out wrong. For example, if [3,5] starts with right and [5,5] starts with down, the result is that [4,5] ends up fitting toward [1,1], because $v_{s=(3,5)} \gg v_{s=(5,5)}$. Once the bad actions along a path (the -1 wall-bump kind) accumulate far enough, the iteration converges to the wrong result.
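
For reference, the backup both methods below rely on is the deterministic form of the Bellman equation (transitions are deterministic and the only reward is 1 on reaching the goal):

$v_\pi(s) \leftarrow r(s') + \gamma \, v_\pi(s'), \quad s' = \text{transform}(s, \pi(s))$

With a -1 wall-bump reward and a random initial policy, states whose actions keep hitting walls accumulate large negative values, and the greedy improvement step then steers their neighbors away from them even when they lie on the shortest path to the goal, which is exactly the wrong convergence described above.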

import numpy as np
import copy

class Maze():
    def __init__(self):
        # maze layout: 0 = free cell, 1 = wall, 2 = goal
        self.block = np.asarray(
        # 0 1 2 3 4 5 6
        [[1,1,1,1,1,1,1]
        ,[1,0,0,0,0,0,1]
        ,[1,0,1,0,0,0,1]
        ,[1,0,1,0,1,1,1]
        ,[1,0,1,0,1,2,1]
        ,[1,0,1,0,0,0,1]
        ,[1,1,1,1,1,1,1]]).T

        # state
        self.state = np.zeros(2, dtype = int)

        # gamma: discount factor
        self.gamma = 0.8

        # state space
        self.states = None
        self.create_states()
    
    # build the state space (all non-wall cells)
    def create_states(self):
        self.states = []
        for i in range(0,6):
            for j in range(0,6):
                if self.block[i][j] != 1:
                    self.states.append(np.asarray([i,j]))
        
    # get the action space of a state
    def state_action(self, state):
        actions = []
        if self.block[state[0]-1][state[1]] != 1:
            actions.append("left")
        if self.block[state[0]+1][state[1]] != 1:
            actions.append("right")
        if self.block[state[0]][state[1]-1] != 1:
            actions.append("up")
        if self.block[state[0]][state[1]+1] != 1:
            actions.append("down")
        return actions
    
    # state transition
    def transform(self, state, action):
        new_state = state.copy()
        actions = self.state_action(state)
        if action not in actions:
            # invalid action for this state: report it and stay put
            print(state, action, actions)
            return actions, new_state, self.r(new_state)
        if action == "left":
            new_state[0] -= 1
        elif action == "right":
            new_state[0] += 1
        elif action == "up":
            new_state[1] -= 1
        elif action == "down":
            new_state[1] += 1
        return actions, new_state, self.r(new_state)
    
    # reset to a random free (non-wall) start cell
    def reset(self):
        # sample x, y in 1..5 and retry until the cell is not a wall
        x = np.random.randint(1, 6)
        y = np.random.randint(1, 6)
        if self.block[x][y] == 1:
            return self.reset()
        self.state[0] = x
        self.state[1] = y
        return self.state

    def step(self, action):
        _, state, r = self.transform(self.state, action)
        self.state = state
        return self.state, r, self.is_done(self.state)
    
    def is_done(self, state):
        if self.block[state[0]][state[1]] == 2:
            return True
        return False
    
    def r(self, state):
        if self.block[state[0]][state[1]] == 2:
            return 1
        return 0

    def render(self):
        print(self.state)
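
A quick sanity check of the per-state action space and the transition function (a minimal sketch; it assumes env_maze.py sits in the current directory):

import numpy as np
from env_maze import Maze

env = Maze()
# [2,1] sits in a corridor, so only left/right are legal
print(env.state_action(np.asarray([2, 1])))      # ['left', 'right']
# moving up from [5,5] reaches the goal cell [5,4] and yields reward 1
print(env.transform(np.asarray([5, 5]), "up"))   # (['left', 'up'], array([5, 4]), 1)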

Policy iteration
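
Since the transitions here are deterministic, the two steps the PIM class below alternates reduce to single-term backups:

policy evaluation: $v_{k+1}(s) = r(s') + \gamma \, v_k(s')$, where $s'$ is the state reached by following $\pi(s)$;
policy improvement: $\pi'(s) = \arg\max_{a \in A(s)} \big[ r(s'_a) + \gamma \, v(s'_a) \big]$, where $A(s)$ is the state's own action space.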

policy_iteration_method.py

import random
import numpy as np

class PIM:
    def __init__(self):
        self.pi = dict()
        self.v = dict()


    def create(self, mdp):
        for state in mdp.states:
            self.v[self.encode_state(state)] = 0
            # random initial policy
            action = random.choice(mdp.state_action(state))
            self.pi[self.encode_state(state)] = action
        
        

    def policy_iteration(self, mdp, iterations):
        for _ in range(iterations):
            self.policy_evaluate(mdp)
            self.policy_improve(mdp)

    # policy evaluation
    def policy_evaluate(self, mdp):
        for _ in range(1000):
            delta = 0.0
            # sweep over all states
            for state in mdp.states:
                # skip terminal states
                if mdp.is_done(state):
                    continue
                # action prescribed by the current policy
                action = self.pi[self.encode_state(state)]

                # transition under that action
                _, s, r = mdp.transform(state, action)
                # v = r + gamma * v'
                new_v = r + mdp.gamma * self.v[self.encode_state(s)]
                # accumulate |v - v'|
                delta += abs(self.v[self.encode_state(state)] - new_v)
                # update v in place
                self.v[self.encode_state(state)] = new_v
            # treat the values as converged once delta < 1e-6
            if delta < 1e-6:
                break

    # policy improvement
    def policy_improve(self, mdp):
        # sweep over all states
        for state in mdp.states:
            # skip terminal states
            if mdp.is_done(state):
                continue
            actions = mdp.state_action(state)
            # start from the first available action
            a1 = actions[0]

            # transition under that action
            t, s, r = mdp.transform(state, a1)
            # v = r + gamma * v'
            v1 = r + mdp.gamma * self.v[self.encode_state(s)]

            # if invalid actions were possible, v1 would have to start from a very small value instead

            # try the remaining actions
            for action in actions:
                # transition under this action
                t, s, r = mdp.transform(state, action)
                # keep the action with the higher value
                if v1 < r + mdp.gamma * self.v[self.encode_state(s)]:
                    a1 = action
                    v1 = r + mdp.gamma * self.v[self.encode_state(s)]
            # record the greedy action for this state
            self.pi[self.encode_state(state)] = a1

    def action(self, state):
        return self.pi[self.encode_state(state)]

    # encode a state as a dict key
    def encode_state(self, state):
        return "%d_%d" % (state[0],state[1])

run_PIM.py

from policy_iteration_method import PIM
from env_maze import Maze

if __name__ == "__main__":

    # create the environment
    env = Maze()

    state = env.reset()

    worker = PIM()
    worker.create(env)

    worker.policy_iteration(env,100)
    
    print(worker.v)
    print(worker.pi)

Result after 100 iterations:
{'1_1': 0.1677721600000001, '1_2': 0.13421772800000006, '1_3': 0.10737418240000006, '1_4': 0.08589934592000005, '1_5': 0.06871947673600004, '2_1': 0.2097152000000001, '3_1': 0.2621440000000001, '3_2': 0.32768000000000014, '3_3': 0.40960000000000013, '3_4': 0.5120000000000001, '3_5': 0.6400000000000001, '4_1': 0.2097152000000001, '4_2': 0.2621440000000001, '4_5': 0.8, '5_1': 0.1677721600000001, '5_2': 0.2097152000000001, '5_4': 0, '5_5': 1.0}

{'1_1': 'right', '1_2': 'up', '1_3': 'up', '1_4': 'up', '1_5': 'up', '2_1': 'right', '3_1': 'down', '3_2': 'down', '3_3': 'down', '3_4': 'down', '3_5': 'right', '4_1': 'left', '4_2': 'left', '4_5': 'right', '5_1': 'left', '5_2': 'left', '5_4': 'none', '5_5': 'up'}
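
A quick check of the numbers: the goal cell is [5,4] (its value stays 0 because terminal states are skipped), and the values decay by a factor of $\gamma = 0.8$ per step along the optimal path, up to floating-point noise in the printout:

$v_{(5,5)} = 1 + 0.8 \cdot 0 = 1.0, \qquad v_{(4,5)} = 0 + 0.8 \cdot 1.0 = 0.8, \qquad v_{(3,5)} = 0 + 0.8 \cdot 0.8 = 0.64, \dots$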

Value iteration
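
Value iteration collapses evaluation and improvement into a single backup, again taken over each state's own action space:

$v_{k+1}(s) = \max_{a \in A(s)} \big[ r(s'_a) + \gamma \, v_k(s'_a) \big]$

and the greedy policy is read off from the same maximization, which is exactly what the inner loop of the class below does.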

value_iteration_method.py

import random
import numpy as np

class VIM:
    def __init__(self):
        self.pi = dict()
        self.v = dict()


    def create(self, mdp):
        for state in mdp.states:
            self.v[self.encode_state(state)] = 0
            # random initial policy
            action = random.choice(mdp.state_action(state))
            self.pi[self.encode_state(state)] = action

    def iteration(self, mdp, iterations):
        for _ in range(iterations):
            self.value_iteration(mdp)
    

    def value_iteration(self, mdp):
        for _ in range(1000):
            delta = 0.0
            for state in mdp.states:
                # skip terminal states
                if mdp.is_done(state):
                    continue
                actions = mdp.state_action(state)
                # start from the first available action
                a1 = actions[0]

                # transition under that action
                t, s, r = mdp.transform(state, a1)
                # v = r + gamma * v'
                v1 = r + mdp.gamma * self.v[self.encode_state(s)]

                # try the remaining actions
                for action in actions:
                    # transition under this action
                    t, s, r = mdp.transform(state, action)
                    # keep the action with the higher value
                    if v1 < r + mdp.gamma * self.v[self.encode_state(s)]:
                        a1 = action
                        v1 = r + mdp.gamma * self.v[self.encode_state(s)]
                # compare against the old value of the current state, not of the successor s
                delta += abs(v1 - self.v[self.encode_state(state)])
                self.pi[self.encode_state(state)] = a1
                self.v[self.encode_state(state)] = v1
            if delta < 1e-6:
                break


    # encode a state as a dict key
    def encode_state(self, state):
        return "%d_%d" % (state[0],state[1])

run_VIM.py

from value_iteration_method import VIM
from env_maze import Maze

if __name__ == "__main__":

    # create the environment
    env = Maze()

    state = env.reset()

    worker = VIM()
    worker.create(env)

    worker.iteration(env,100)
    
    print(worker.v)
    print(worker.pi)

Result after 100 iterations:
{'1_1': 0.1677721600000001, '1_2': 0.13421772800000006, '1_3': 0.10737418240000006, '1_4': 0.08589934592000005, '1_5': 0.06871947673600004, '2_1': 0.2097152000000001, '3_1': 0.2621440000000001, '3_2': 0.32768000000000014, '3_3': 0.40960000000000013, '3_4': 0.5120000000000001, '3_5': 0.6400000000000001, '4_1': 0.2097152000000001, '4_2': 0.2621440000000001, '4_5': 0.8, '5_1': 0.1677721600000001, '5_2': 0.2097152000000001, '5_4': 0, '5_5': 1.0}
{'1_1': 'right', '1_2': 'up', '1_3': 'up', '1_4': 'up', '1_5': 'up', '2_1': 'right', '3_1': 'down', '3_2': 'down', '3_3': 'down', '3_4': 'down', '3_5': 'right', '4_1': 'left', '4_2': 'left', '4_5': 'right', '5_1': 'left', '5_2': 'left', '5_4': 'none', '5_5': 'up'}
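
The values and the policy are identical to what policy iteration produced, which is what we would expect: for this deterministic maze both methods converge to the same optimal value function and the same greedy policy.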

It does find the exit, and much faster than DQN; the downside is that it seems to need the entire state space up front.
