On-Policy Monte Carlo
- 1. Importing Libraries
- 2. MDP
- 3. On-Policy Monte Carlo Control
1. Importing Libraries
from typing import Dict, List, Optional, Tuple
import dataclasses
import numpy as np
2. MDP
@dataclasses.dataclass
class Transition:
    state: Tuple[int, int]
    action: str
    next_state: Tuple[int, int]
    reward: float
    termination: bool
class GridEnv:
    """A stochastic grid world with a goal, optional pits and optional obstacles."""

    _states: np.ndarray
    _rewards: np.ndarray
    _action_semantics: List[str]
    _actions: np.ndarray
    _init_state: Tuple[int, int]
    _current_state: Tuple[int, int]
    _goal: Tuple[int, int]
    _pits: List[Tuple[int, int]]
    _transition_probabilities: np.ndarray

    def __init__(self,
                 rows: int,
                 cols: int,
                 step_cost: float,
                 goal: Tuple[int, int],
                 pits: Optional[List[Tuple[int, int]]] = None,
                 obstacles: Optional[List[Tuple[int, int]]] = None) -> None:
        # 0 marks a free cell, 1 marks an obstacle.
        self._states = np.zeros((rows, cols))
        obstacles = [] if obstacles is None else obstacles
        for r, c in obstacles:
            self._states[r, c] = 1

        # Every step costs `step_cost`; the goal pays +1 and each pit pays -1.
        pits = [] if pits is None else pits
        self._rewards = -step_cost*np.ones((rows, cols))
        self._rewards[goal[0], goal[1]] = +1
        for r, c in pits:
            self._rewards[r, c] = -1

        self._action_semantics = ['up', 'left', 'down', 'right']
        self._actions = np.array([[-1, 0], [0, -1], [1, 0], [0, 1]])
        self._init_state = (rows - 1, 0)
        self._current_state = self._init_state
        self._goal = goal
        self._pits = pits
        # Probability of shifting to the previous/intended/next action in the
        # list: 0.8 for the intended move, 0.1 for each perpendicular slip.
        self._transition_probabilities = np.array([0.1, 0.8, 0.1])

    @property
    def actions(self) -> List[str]:
        return self._action_semantics

    @property
    def current_state(self) -> Tuple[int, int]:
        return self._current_state

    @property
    def reward(self) -> float:
        r, c = self._current_state
        return self._rewards[r, c]

    @property
    def termination(self) -> bool:
        return self._current_state == self._goal

    def render(self) -> None:
        grid = np.array(self._states, dtype=str)
        r, c = self._current_state
        grid[r, c] = 'x'
        r, c = self._goal
        if 'x' not in grid[r, c]:
            grid[r, c] = 'G'
        for r, c in self._pits:
            grid[r, c] = 'P'
        print(grid)

    def _transition(self, state: Tuple[int, int], a: int) -> Tuple[int, int]:
        n_actions = len(self._actions)
        # Wrap the (possibly shifted) action index around the action list.
        a = self._actions[a + n_actions if a < 0 else a % n_actions]
        # Clip the move to the grid and block moves into obstacles.
        new_r = max(0, min(self._states.shape[0] - 1, state[0] + a[0]))
        new_c = max(0, min(self._states.shape[1] - 1, state[1] + a[1]))
        return (new_r, new_c) if self._states[new_r, new_c] == 0. else state

    def step(self, action: str) -> Transition:
        a_idx = self._action_semantics.index(action)
        # With probability 0.8 take the intended action, otherwise slip to a
        # neighbouring action in the list (a perpendicular move).
        chosen_action = a_idx + np.random.choice([1, 0, -1], p=self._transition_probabilities)
        prev_state = self._current_state
        self._current_state = self._transition(self._current_state, chosen_action)
        return Transition(state=prev_state,
                          action=action,
                          next_state=self._current_state,
                          reward=self.reward,
                          termination=self.termination)

    def reset(self) -> None:
        self._current_state = self._init_state

    def state_space_size(self) -> Tuple[int, int]:
        return self._states.shape

    def action_space_size(self) -> int:
        return len(self._actions)
my_env = GridEnv(rows=3, cols=4, step_cost=0.04, goal=(0, 3), pits=[(1, 3)], obstacles=[(1, 1)])
my_env.render()
print(my_env.current_state)
print(my_env.reward)
print(my_env.termination)
print(my_env.actions)
my_env.reset()
my_policy = ['up', 'up', 'up', 'right', 'right', 'right', 'right']
for a in my_policy:
    print(a)
    print(my_env.step(a))
    my_env.render()
my_env.reset()
my_env.render()
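As a quick, optional sanity check (not part of the original walkthrough), the snippet below samples many single steps from the start state to illustrate the transition noise encoded in _transition_probabilities: the intended move happens about 80% of the time, and the agent slips to one of the two perpendicular moves about 10% of the time each. The Counter-based code and the outcome_counts name are illustrative additions.

# Illustrative only: empirically check the 0.8 / 0.1 / 0.1 transition noise
# by repeatedly taking 'up' from the start state (2, 0).
from collections import Counter

outcome_counts = Counter()
for _ in range(1000):
    my_env.reset()
    outcome_counts[my_env.step('up').next_state] += 1
my_env.reset()

# Roughly 80% of the steps land on (1, 0); the rest slip to (2, 1) or stay at
# (2, 0) because the leftward slip is clipped at the grid border.
print(outcome_counts)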
3. On-Policy Monte Carlo Control
state_space_size = np.prod(list(my_env.state_space_size()))
actions = my_env.actions
returns: Dict[int, Dict[str, List[float]]] = {i: {a: [] for a in actions}
                                              for i in range(state_space_size)}
gamma = 0.99
epsilon = 0.1
def policy(eps: float, actions: List[str]) -> Dict[int, str]:
    # Q(s, a) is estimated as the average of the returns observed so far;
    # unvisited pairs get a random value so ties are broken arbitrarily.
    Q = {i: [np.average(adict[a]) if len(adict[a]) > 0 else np.random.rand() for a in actions]
         for i, adict in returns.items()}
    # Epsilon-greedy: pick the greedy action with probability 1 - eps,
    # otherwise a uniformly random action.
    pi = {i: actions[np.argmax(Q[i])] if np.random.rand() >= eps
          else actions[np.random.randint(len(actions))] for i in returns.keys()}
    return pi
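# Illustrative check (an addition, not part of the original listing): `policy`
# returns a dict mapping state index -> action. With `returns` still empty, the
# greedy choice falls back to random Q values, so the initial policy is
# essentially random. The name `example_pi` is hypothetical.
example_pi = policy(epsilon, actions)
print(len(example_pi), example_pi[0])  # e.g. 12 'left'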
max_episodes = 3000
max_episode_length = 300
visited_states = {}
def preprocess_state(state: Tuple[int, int]) -> int:
    # Map each (row, col) cell to a consecutive integer index on first sight.
    if state not in visited_states:
        visited_states[state] = len(visited_states)
    return visited_states[state]
for ep in range(max_episodes):
    if ep % 1000 == 0:
        print(f'Episode {ep}')
    my_env.reset()
    # Act greedily on the final episode, epsilon-greedily otherwise.
    pi = policy(0. if ep == max_episodes - 1 else epsilon, actions)
    ret = 0
    episode = []
    visited_state_actions = []
    for _ in range(max_episode_length):
        state = preprocess_state(my_env.current_state)
        action = pi[state]
        visited_state_actions.append((state, action))
        transition = my_env.step(action)
        if ep == max_episodes - 1:
            print(transition)
            my_env.render()
        episode.append(transition)
        if transition.termination:
            break
    # First-visit Monte Carlo update: walk the episode backwards, accumulate
    # the discounted return, and record it only for the first occurrence of
    # each (state, action) pair in the episode.
    T = len(episode)
    for i, step in enumerate(episode[::-1]):
        state = preprocess_state(step.state)
        ret = step.reward + gamma*ret
        if (state, step.action) not in visited_state_actions[:T-i-1]:
            returns[state][step.action].append(ret)
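One way to inspect the result of the run above is to rebuild the greedy policy from the accumulated returns and print it on the grid, using visited_states to go from (row, col) cells back to state indices. This is a sketch, not part of the original code: the helper name greedy_action_for and the layout of the printout are illustrative choices.

# Sketch: print the first letter of the greedy action learned for every
# visited cell ('.' marks cells that never appeared as a state, e.g. the goal
# and the obstacle).
def greedy_action_for(state_idx: int) -> str:
    # Average return per action; actions with no data fall back to -inf so
    # they are never preferred over an action we actually observed.
    q = [np.average(returns[state_idx][a]) if len(returns[state_idx][a]) > 0 else -np.inf
         for a in actions]
    return actions[int(np.argmax(q))]

rows, cols = my_env.state_space_size()
for r in range(rows):
    row_repr = []
    for c in range(cols):
        if (r, c) in visited_states:
            row_repr.append(greedy_action_for(visited_states[(r, c)])[:1])
        else:
            row_repr.append('.')
    print(' '.join(row_repr))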