[RL] On-Policy Monte Carlo Control

On-Policy Monte Carlo

  • 1. Importing Libraries
  • 2. MDP
  • 3. On-Policy Monte Carlo Control


1. Importing Libraries

from typing import Dict, List, Optional, Tuple

import dataclasses
import numpy as np

2. MDP

The environment below is a small stochastic gridworld: every non-goal, non-pit cell yields a reward of -step_cost, the goal cell yields +1, pit cells yield -1, and a chosen action is executed as intended with probability 0.8 but slips to the adjacent direction with probability 0.1 on each side. Obstacle cells are marked in _states and cannot be entered; the agent starts in the bottom-left corner.

@dataclasses.dataclass
class Transition:
    state: Tuple[int, int]
    action: str
    next_state: Tuple[int, int]
    reward: float
    termination: bool
        

class GridEnv:
    _states: np.ndarray
    _rewards: np.ndarray
    _action_semantics: List[str]
    _actions: np.ndarray
    _init_state: Tuple[int, int]
    _current_state: Tuple[int, int]
    _goal: Tuple[int, int]
    _pits: List[Tuple[int, int]]
    _transition_probabilities: np.ndarray
        
    def __init__(self, 
                 rows: int, 
                 cols: int,
                 step_cost: float,
                 goal: Tuple[int, int], 
                 pits: Optional[List[Tuple[int, int]]] = None,
                 obstacles: Optional[List[Tuple[int, int]]] = None) -> None:
        self._states = np.zeros((rows, cols))
        
        obstacles = [] if obstacles is None else obstacles
        
        for r, c in obstacles:
            self._states[r, c] = 1
            
        self._rewards = -step_cost*np.ones((rows, cols))
        self._rewards[goal[0], goal[1]] = +1
        
        pits = [] if pits is None else pits
        for r, c in pits:
            self._rewards[r, c] = -1
        
        self._action_semantics = ['up', 'left', 'down', 'right']
        self._actions = np.array([[-1, 0], [0, -1], [1, 0], [0, 1]])
        self._init_state = (rows - 1, 0)
        self._current_state = self._init_state
        self._goal = goal
        self._pits = pits
        
        # going right, straight, left wrt chosen action
        self._transition_probabilities = np.array([0.1, 0.8, 0.1])
        
    @property
    def actions(self) -> List[str]:
        return self._action_semantics
    
    @property
    def current_state(self) -> Tuple[int, int]:
        return self._current_state
    
    @property
    def reward(self) -> float:
        r, c = self._current_state
        return self._rewards[r, c]
    
    @property
    def termination(self) -> bool:
        return self._current_state == self._goal
    
    def render(self) -> None:
        grid = np.array(self._states, dtype=str)
        r, c = self._current_state
        grid[r, c] = 'x'
        
        r, c = self._goal
        if 'x' not in grid[r, c]:
            grid[r, c] = 'G'
        
        # mark the pits, without overwriting the agent's position
        for r, c in self._pits:
            if 'x' not in grid[r, c]:
                grid[r, c] = 'P'

        print(grid)
        
    def _transition(self, state: Tuple[int, int], a: int) -> Tuple[int, int]:
        # wrap the (possibly shifted) action index into [0, n_actions) and look up its grid offset
        n_actions = len(self._actions)
        a = self._actions[a + n_actions if a < 0 else a % n_actions]
        # clip the move to the grid; moves into obstacle cells (marked 1) leave the state unchanged
        new_r = max(0, min(self._states.shape[0] - 1, state[0] + a[0]))
        new_c = max(0, min(self._states.shape[1] - 1, state[1] + a[1]))
        return (new_r, new_c) if self._states[new_r, new_c] == 0. else state
        
    def step(self, action: str) -> Transition:
        a_idx = self._action_semantics.index(action)
        
        # stochastic transition: with prob. 0.8 keep the chosen action, with prob. 0.1 each veer to a side
        chosen_action = a_idx + np.random.choice([1, 0, -1], p=self._transition_probabilities)
        prev_state = self._current_state
        self._current_state = self._transition(self._current_state, chosen_action)
        return Transition(state=prev_state,
                          action=action,
                          next_state=self._current_state,
                          reward=self.reward,
                          termination=self.termination)
    
    def reset(self) -> None:
        self._current_state = self._init_state
        
    def state_space_size(self) -> Tuple[int, int]:
        return self._states.shape
    
    def action_space_size(self) -> int:
        return len(self._actions)
my_env = GridEnv(rows=3, cols=4, step_cost=0.04, goal=(0, 3), pits=[(1, 3)], obstacles=[(1, 1)])
my_env.render()
print(my_env.current_state)
print(my_env.reward)
print(my_env.termination)
print(my_env.actions)
my_env.reset()
my_policy = ['up', 'up', 'up', 'right', 'right', 'right', 'right']
for a in my_policy:
    print(a)
    print(my_env.step(a))
    my_env.render()
my_env.reset()
my_env.render()

3. On-Policy Monte Carlo Control
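
The loop below implements standard on-policy first-visit Monte Carlo control with an epsilon-greedy policy: each episode is generated by following the current epsilon-greedy policy, the episode is then swept backwards while accumulating the discounted return, and the return is recorded only for the first visit of each state–action pair. In the usual textbook notation (a generic formulation, not taken from the code itself),

G_t = R_{t+1} + \gamma G_{t+1}, \qquad Q(s,a) \approx \mathrm{average}\big(\mathrm{Returns}(s,a)\big), \qquad \pi(s) \leftarrow \arg\max_a Q(s,a) \ \text{with probability } 1-\varepsilon.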

state_space_size = np.prod(list(my_env.state_space_size()))
actions = my_env.actions

# returns[state_index][action] is the list of first-visit returns observed for that pair
returns: Dict[int, Dict[str, List[float]]] = {i: {a: [] for a in actions} 
                                              for i in range(state_space_size)}
gamma = 0.99
epsilon = 0.1

def policy(eps: float, actions: List[str]) -> Dict[int, str]:
    # Q(s, a): average of the recorded returns; unvisited pairs get a random value as a tie-breaker
    Q = {i: [np.average(adict[a]) if len(adict[a]) > 0 else np.random.rand() for a in actions]
         for i, adict in returns.items()}
    # epsilon-greedy: greedy action with probability 1 - eps, otherwise a uniformly random action
    pi = {i: actions[np.argmax(Q[i])] if np.random.rand() >= eps 
          else actions[np.random.randint(len(actions))] for i in returns.keys()}
    return pi
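
As a quick illustration (a throwaway example assuming the cells above have been run; pi_example is a name introduced here, not in the original post), the behaviour policy can be sampled directly. Note that under this scheme the greedy action ends up being played with probability 1 - eps + eps/len(actions), i.e. 0.925 for eps = 0.1 and four actions.

pi_example = policy(epsilon, actions)  # one epsilon-greedy policy over all state indices
print(pi_example[0])                   # the action assigned to state index 0, e.g. 'up'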
    

max_episodes = 3000
max_episode_length = 300
visited_states: Dict[Tuple[int, int], int] = {}

def preprocess_state(state: Tuple[int, int]) -> int:
    # map each (row, col) grid state to a unique integer index, assigned in order of first visit
    if state not in visited_states:
        visited_states[state] = len(visited_states)
        
    return visited_states[state]

for ep in range(max_episodes):
    if ep % 1000 == 0:
        print(f'Episode {ep}')
        
    my_env.reset()
    # policy improvement step; the very last episode is run greedily (eps = 0) so it can be rendered
    pi = policy(0. if ep == max_episodes - 1 else epsilon, actions)
    
    ret = 0
    episode = []
    visited_state_actions = []
    
    for _ in range(max_episode_length):
        state = preprocess_state(my_env.current_state)
        action = pi[state]        
        visited_state_actions.append((state, action))
        transition = my_env.step(action)
        
        if ep == max_episodes - 1:
            print(transition)
            my_env.render()
        
        episode.append(transition)
        
        if transition.termination:
            break
            
    # backward pass: accumulate the discounted return and record it with a first-visit update
    T = len(episode)
    for i, step in enumerate(episode[::-1]):
        state = preprocess_state(step.state)
        ret = step.reward + gamma*ret
        # T-i-1 is this step's position in the forward episode; only keep the return
        # if (state, action) did not already occur earlier in the episode
        if (state, step.action) not in visited_state_actions[:T-i-1]:
            returns[state][step.action].append(ret)
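
After training, one possible way to inspect the result (a sketch; inv_visited and greedy_pi are names introduced here, not part of the original post) is to invert visited_states and query a purely greedy policy over the averaged returns:

inv_visited = {idx: s for s, idx in visited_states.items()}  # state index -> (row, col)
greedy_pi = policy(0., actions)                              # eps = 0: act greedily on the averaged returns
for idx in sorted(inv_visited):
    q_vals = {a: np.mean(returns[idx][a]) if returns[idx][a] else None for a in actions}
    print(inv_visited[idx], greedy_pi[idx], q_vals)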
