以下代码位于 gym/envs/toy_text/cliffwalking.py 文件中:
import numpy as np
import sys
from contextlib import closing
from io import StringIO
from gym.envs.toy_text import discrete
# Action indices; CliffWalkingEnv.__init__ uses them as keys of P[s] and pairs
# each one with a (row, column) delta on the grid.
UP = 0  # delta [-1, 0]: one row towards the top
RIGHT = 1  # delta [0, 1]: one column to the right
DOWN = 2  # delta [1, 0]: one row towards the bottom
LEFT = 3  # delta [0, -1]: one column to the left
class CliffWalkingEnv(discrete.DiscreteEnv):
    """
    Cliff-walking grid world.

    The board is a 4x12 matrix, with (using NumPy matrix indexing):
        [3, 0] as the start at bottom-left
        [3, 11] as the goal at bottom-right
        [3, 1..10] as the cliff at bottom-center

    Each time step incurs -1 reward, and stepping into the cliff incurs -100
    reward and a reset to the start. An episode terminates when the agent
    reaches the goal.
    """

    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self):
        """Build the transition table and initial state distribution."""
        self.shape = (4, 12)
        # Flat index of the start cell (3, 0): 3 * 12 + 0 = 36.
        self.start_state_index = np.ravel_multi_index((3, 0), self.shape)

        nS = np.prod(self.shape)  # 4 * 12 = 48 states
        nA = 4  # four actions: UP, RIGHT, DOWN, LEFT

        # Cliff location. The builtin ``bool`` is used as the dtype because
        # the ``np.bool`` alias was deprecated in NumPy 1.20 and removed in
        # NumPy 1.24.
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[3, 1:-1] = True

        # Calculate transition probabilities and rewards:
        # P[s][a] == [(probability, next_state, reward, done)]
        P = {}
        for s in range(nS):
            position = np.unravel_index(s, self.shape)  # flat index -> (row, col)
            P[s] = {a: [] for a in range(nA)}
            P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
            P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
            P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
            P[s][LEFT] = self._calculate_transition_prob(position, [0, -1])

        # Calculate initial state distribution.
        # We always start in state (3, 0).
        isd = np.zeros(nS)
        isd[self.start_state_index] = 1.0

        super(CliffWalkingEnv, self).__init__(nS, nA, P, isd)

    def _limit_coordinates(self, coord):
        """
        Prevent the agent from falling out of the grid world.

        Clamps ``coord`` (a mutable [row, col] pair) in place to the grid
        bounds given by ``self.shape`` and returns it.
        """
        coord[0] = min(coord[0], self.shape[0] - 1)
        coord[0] = max(coord[0], 0)
        coord[1] = min(coord[1], self.shape[1] - 1)
        coord[1] = max(coord[1], 0)
        return coord

    def _calculate_transition_prob(self, current, delta):
        """
        Determine the outcome for an action. Transition prob is always 1.0.

        :param current: current (row, col) position on the grid
        :param delta: [row, col] change in position for the action
        :return: a one-element list ``[(1.0, new_state, reward, done)]``
        """
        # Move, then clamp to the grid, e.g. (0, 0) + (-1, 0) = (-1, 0) -> (0, 0).
        new_position = np.array(current) + np.array(delta)
        new_position = self._limit_coordinates(new_position).astype(int)
        new_state = np.ravel_multi_index(tuple(new_position), self.shape)

        # Stepping into the cliff costs -100 and sends the agent back to the
        # start without ending the episode.
        if self._cliff[tuple(new_position)]:
            return [(1.0, self.start_state_index, -100, False)]

        # Every other move costs -1; the episode ends at the bottom-right cell.
        terminal_state = (self.shape[0] - 1, self.shape[1] - 1)
        is_done = tuple(new_position) == terminal_state
        return [(1.0, new_state, -1, is_done)]

    def render(self, mode="human"):
        """
        Draw the grid: ``x`` agent, ``T`` goal, ``C`` cliff, ``o`` other cells.

        :param mode: "human" writes to stdout and returns None; "ansi" returns
            the rendered text as a string
        """
        outfile = StringIO() if mode == "ansi" else sys.stdout
        # Same terminal cell as used in _calculate_transition_prob.
        terminal_state = (self.shape[0] - 1, self.shape[1] - 1)

        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            if self.s == s:  # agent's current cell
                output = " x "
            elif position == terminal_state:  # goal
                output = " T "
            elif self._cliff[position]:  # cliff
                output = " C "
            else:
                output = " o "

            if position[1] == 0:
                # First column: drop the leading space.
                output = output.lstrip()
            if position[1] == self.shape[1] - 1:
                # Last column: drop the trailing space and end the row.
                output = output.rstrip()
                output += "\n"

            outfile.write(output)
        outfile.write("\n")

        # No need to return anything for human
        if mode != "human":
            with closing(outfile):
                return outfile.getvalue()
DiscreteEnv 类定义在 gym/envs/toy_text/discrete.py 中:
import numpy as np
from gym import Env, spaces
from gym.utils import seeding
def categorical_sample(prob_n, np_random):
    """
    Sample an index from a categorical distribution.

    :param prob_n: sequence of class probabilities, e.g. [0, 0, ..., 1, 0]
    :param np_random: random source exposing ``rand()``, which is called once
        to draw the threshold
    :return: index of the first class whose cumulative probability exceeds
        the drawn threshold
    """
    prob_n = np.asarray(prob_n)
    csprob_n = np.cumsum(prob_n)
    # argmax over a boolean array returns the position of the first True.
    return (csprob_n > np_random.rand()).argmax()
class DiscreteEnv(Env):
    """
    Environment with a finite set of states and actions.

    Has the following members
    - nS: number of states
    - nA: number of actions
    - P: transitions (*)
    - isd: initial state distribution (**)

    (*) dictionary of lists, where
      P[s][a] == [(probability, nextstate, reward, done), ...]
    (**) list or array of length nS
    """

    def __init__(self, nS, nA, P, isd):
        """Store the model, build the spaces, seed the RNG and draw a state."""
        self.P = P
        self.isd = isd
        self.lastaction = None  # for rendering
        self.nS = nS
        self.nA = nA

        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        # seed() must run first: it creates self.np_random used just below.
        self.seed()
        # Current state, drawn from the initial state distribution.
        self.s = categorical_sample(self.isd, self.np_random)

    def seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        """Start a new episode: redraw the state from ``isd`` and return it."""
        self.s = categorical_sample(self.isd, self.np_random)
        self.lastaction = None
        return int(self.s)

    def step(self, a):
        """
        Apply action ``a`` in the current state.

        :return: ``(next_state, reward, done, {"prob": p})`` where ``p`` is
            the probability of the sampled transition
        """
        transitions = self.P[self.s][a]
        # Pick one of the candidate transitions according to its probability.
        i = categorical_sample([t[0] for t in transitions], self.np_random)
        p, s, r, d = transitions[i]  # probability, nextstate, reward, done
        self.s = s
        self.lastaction = a
        return (int(s), r, d, {"prob": p})
cartpole的代码在:gym\envs\classic_control\cartpole.py
在 PyCharm 中按住 Ctrl 并点击想查看的类名,即可跳转到该类的源码定义。