import gym
import random
import time
env = gym.make('GridWorld-v0')
class Learn:
def __init__(self,grid_mdp):
# Initialize the state-value function
self.v = dict()  # the value function is stored as a dictionary
for state in grid_mdp.states:
self.v[state]=0
# Initialize the policy; for each state only actions with a valid transition are chosen
self.pi=dict()
# random.choice(seq): returns a random element of a list, tuple, or string
self.pi[1]=random.choice(['e','s'])
self.pi[2]=random.choice(['w','e'])
self.pi[3]=random.choice(['w','s','e'])
self.pi[4]=random.choice(['w','e'])
self.pi[5]=random.choice(['w','s'])
# Policy iteration
def policy_iterate(self,grid_mdp):
for i in range(100):
# policy evaluation
self.policy_evaluate(grid_mdp)
# policy improvement
self.policy_improve(grid_mdp)
# Policy evaluation
def policy_evaluate(self,grid_mdp):
# Sweep at most 1000 times; break early once the value function has converged
for i in range(1000):
delta = 0.0
# Sweep the whole state space so that every state's value gets updated
for state in grid_mdp.states:
if state in grid_mdp.terminate_states:
continue
action = self.pi[state]
t,s,r = grid_mdp.transform(state,action)
new_v = r +grid_mdp.gamma * self.v[s]
delta += abs(self.v[state]-new_v)
self.v[state] = new_v
if delta <1e-6:
break
# Policy improvement
# Derive the greedy policy from the current value function and use it as the new policy
def policy_improve(self,grid_mdp):
# Iterate over the whole state space
for state in grid_mdp.states:
if state in grid_mdp.terminate_states: continue
a1 = self.pi[state]
t,s,r = grid_mdp.transform(state,a1)
# If a (state, action) pair is not in the transition table, its action value does not exist and the state value stays unchanged
# if s!=-1:
v1 = r+grid_mdp.gamma*self.v[s]
# Iterate over the actions available in this state
# Greedy policy: keep the action with the largest one-step lookahead value
for action in grid_mdp.actions:
t,s,r = grid_mdp.transform(state,action)
if s != -1:
if v1<r+grid_mdp.gamma*self.v[s]:
a1 = action
v1 = r +grid_mdp.gamma*self.v[s]
# Update the policy
self.pi[state] = a1
# Return the greedy action for a state
def action(self,state):
return self.pi[state]
gm = env.env  # unwrap env to access the underlying GridEnv instance
# Reset the environment and get the agent's initial state
state = env.reset()
# Instantiate the agent; this initializes the value function and a random policy
learn=Learn(gm)
# Policy iteration: alternate policy evaluation and policy improvement
learn.policy_iterate(gm)
total_reward = 0
# Take at most 100 steps to reach a terminal state
for i in range(100):
env.render()
# The learned policy gives the greedy action for every state
action = learn.action(state)
# Take one step following the greedy policy
state,reward,done,_ = env.step(action)
total_reward+=reward
time.sleep(1)
if done:
# Render the frame in which the agent has entered a terminal state
env.render()
break
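After learn.policy_iterate(gm) has run, it can be useful to print the learned value function and greedy policy as a sanity check. A minimal sketch, using only the gm, learn, and total_reward names defined above:
# Optional: report the learned values, the greedy actions, and the episode return
for s in gm.states:
    if s in gm.terminate_states:
        print('state %d: terminal' % s)
    else:
        print('state %d: v = %.3f  greedy action = %s' % (s, learn.v[s], learn.pi[s]))
print('total reward of the rendered episode:', total_reward)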
grid_mdp.py
import logging  # logging module
import numpy
import random
from gym import spaces
from gym.utils import seeding  # needed by _seed below
import gym
logger = logging.getLogger(__name__)
# Set this in SOME subclasses
class GridEnv(gym.Env):
metadata = {
'render.modes':['human','rgb_array'],
'video.frames_per_second':2
}
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def __init__(self):
# State space
self.states = [1,2,3,4,5,6,7,8]
# The robot's on-screen position depends on its current state.
# The center coordinates of every state cell are precomputed and stored in two lists in the constructor
self.x = [140, 220, 300, 380, 460, 140, 300, 460]
self.y = [250, 250, 250, 250, 250, 150, 150, 150]
self.terminate_states = dict()  # terminal states, stored as a dictionary
self.terminate_states[6] = 1
self.terminate_states[7] = 1
self.terminate_states[8] = 1
# Action space
self.actions = ['n','e','s','w']
# Reward function
self.rewards = dict()  # rewards are stored in a dictionary
self.rewards['1_s'] = -1.0
self.rewards['3_s'] = 1.0
self.rewards['5_s'] = -1.0
# State transition table (deterministic)
self.t = dict()
self.t['1_s'] = 6
self.t['1_e'] = 2
self.t['2_w'] = 1
self.t['2_e'] = 3
self.t['3_s'] = 7
self.t['3_w'] = 2
self.t['3_e'] = 4
self.t['4_w'] = 3
self.t['4_e'] = 5
self.t['5_s'] = 8
self.t['5_w'] = 4
# Discount factor
self.gamma = 0.8
self.viewer = None
self.state = None
# Return the transition table, the next state, and the immediate reward for a (state, action) pair
def transform(self,state,action):
# If the (state, action) pair has no entry in the transition table, the next state is set to -1
s = -1
r = 0
key = '%d_%s'%(state,action)
if key in self.rewards:
r = self.rewards[key]
# State transition
if key in self.t:
s = self.t[key]
return self.t,s,r
def getTerminal(self):
return self.terminate_states
def getGamma(self):
return self.gamma
def getStates(self):
return self.states
def getActions(self):
return self.actions
def setAction(self,s):
self.state = s
# reset()
# reset usually initializes the robot to a random state
def reset(self):
self.state = self.states[int(random.random() * len(self.states))]
return self.state
# step() takes an action as input and
# returns the next state, the reward, a done flag, and a debug-info dict;
# an empty dict {} is used when there is no debug info
def step(self,action):
# Current state of the system
state = self.state
# Check whether the current state is terminal
if state in self.terminate_states:
return state,0,True,{}
# Combine the state and the action into a dictionary key
# 'format string' % values
# when the format string contains more than one placeholder, the values must be wrapped in a tuple
key = '%d_%s'%(state,action)
# State transition
if key in self.t:
next_state = self.t[key]
else:
next_state = state
# Update the current state
self.state = next_state
is_terminal = False
if next_state in self.terminate_states:
is_terminal = True
if key not in self.rewards:
r = 0.0
else:
r = self.rewards[key]
return next_state,r,is_terminal,{}
# render() acts as the graphics engine: it draws the objects in the environment
def render(self, mode='human', close=False):
if close:
if self.viewer is not None:
self.viewer.close()
self.viewer = None
return
screen_width = 600
screen_height = 400
if self.viewer is None:
# Use gym's rendering module for drawing
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
# Draw the grid world: 11 line segments in total
self.line1 = rendering.Line((100, 300), (500, 300))
self.line2 = rendering.Line((100, 200), (500, 200))
self.line3 = rendering.Line((100, 300), (100, 100))
self.line4 = rendering.Line((180, 300), (180, 100))
self.line5 = rendering.Line((260, 300), (260, 100))
self.line6 = rendering.Line((340, 300), (340, 100))
self.line7 = rendering.Line((420, 300), (420, 100))
self.line8 = rendering.Line((500, 300), (500, 100))
self.line9 = rendering.Line((100, 100), (180, 100))
self.line10 = rendering.Line((260, 100), (340, 100))
self.line11 = rendering.Line((420, 100), (500, 100))
# Death zones are drawn as filled black circles
# First skull
self.kulo1 = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(140, 150))  # center of the circle
self.kulo1.add_attr(self.circletrans)
self.kulo1.set_color(0, 0, 0)
# Second skull
self.kulo2 = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(460, 150))
self.kulo2.add_attr(self.circletrans)
self.kulo2.set_color(0, 0, 0)
# Gold (goal) cell, drawn as a light-colored circle
self.gold = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(300, 150))
self.gold.add_attr(self.circletrans)
self.gold.set_color(1, 0.9, 0)
# Robot, drawn as a circle of a different color
self.robot = rendering.make_circle(30)
self.robotrans = rendering.Transform()
self.robot.add_attr(self.robotrans)
self.robot.set_color(0.8, 0.6, 0.4)
# Set the color of the 11 lines and add all created objects to the viewer
self.line1.set_color(0, 0, 0)
self.line2.set_color(0, 0, 0)
self.line3.set_color(0, 0, 0)
self.line4.set_color(0, 0, 0)
self.line5.set_color(0, 0, 0)
self.line6.set_color(0, 0, 0)
self.line7.set_color(0, 0, 0)
self.line8.set_color(0, 0, 0)
self.line9.set_color(0, 0, 0)
self.line10.set_color(0, 0, 0)
self.line11.set_color(0, 0, 0)
self.viewer.add_geom(self.line1)
self.viewer.add_geom(self.line2)
self.viewer.add_geom(self.line3)
self.viewer.add_geom(self.line4)
self.viewer.add_geom(self.line5)
self.viewer.add_geom(self.line6)
self.viewer.add_geom(self.line7)
self.viewer.add_geom(self.line8)
self.viewer.add_geom(self.line9)
self.viewer.add_geom(self.line10)
self.viewer.add_geom(self.line11)
self.viewer.add_geom(self.kulo1)
self.viewer.add_geom(self.kulo2)
self.viewer.add_geom(self.gold)
self.viewer.add_geom(self.robot)
# Position the robot using the coordinate lists and its current state
if self.state is None: return None
self.robotrans.set_translation(self.x[self.state - 1], self.y[self.state - 1])
return self.viewer.render(return_rgb_array=mode =='rgb_array')
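Before gym.make('GridWorld-v0') in the scripts above can construct this class, the environment has to be registered with gym. A minimal sketch, assuming GridEnv sits in an importable module named grid_mdp; the id and entry_point strings are illustrative, and the register call (or the module itself) is often placed inside gym's own envs package so it runs before gym.make:
from gym.envs.registration import register

# Hypothetical registration; point entry_point at wherever GridEnv actually lives
register(
    id='GridWorld-v0',
    entry_point='grid_mdp:GridEnv',
)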
The concrete implementation of the environment is given in the previous section.
import gym
import random
import time
env = gym.make('GridWorld-v0')
class Learn:
def __init__(self,grid_mdp):
# Initialize the state-value function
self.v = dict()  # the value function is stored as a dictionary
for state in grid_mdp.states:
self.v[state]=0
# Initialize the policy; for each state only actions with a valid transition are chosen
self.pi=dict()
# random.choice(seq): returns a random element of a list, tuple, or string
self.pi[1]=random.choice(['e','s'])
self.pi[2]=random.choice(['w','e'])
self.pi[3]=random.choice(['w','s','e'])
self.pi[4]=random.choice(['w','e'])
self.pi[5]=random.choice(['w','s'])
# Value iteration
def value_iteration(self, grid_mdp):
# Outer loop: sweep until the value function converges
for i in range(1000):
delta = 0.0
# Sweep the whole state space
for state in grid_mdp.states:
if state in grid_mdp.terminate_states: continue
a1 = grid_mdp.actions[0]
t, s, r = grid_mdp.transform(state, a1)
# If the first action has no transition entry, start from -inf so that any valid action replaces it
v1 = r + grid_mdp.gamma * self.v[s] if s != -1 else -float('inf')
# Sweep the action space and keep the best action
for action in grid_mdp.actions:
t, s, r = grid_mdp.transform(state, action)
if s != -1:
if v1 < r + grid_mdp.gamma * self.v[s]:
a1 = action
v1 = r + grid_mdp.gamma * self.v[s]
delta += abs(v1 - self.v[state])
self.pi[state] = a1
self.v[state] = v1
if delta < 1e-6:
break
# Return the greedy action for a state
def action(self,state):
return self.pi[state]
gm = env.env  # unwrap env to access the underlying GridEnv instance
# Reset the environment and get the agent's initial state
state = env.reset()
# Instantiate the agent; this initializes the value function and a random policy
learn=Learn(gm)
# Run value iteration
learn.value_iteration(gm)
total_reward = 0
# Take at most 100 steps to reach a terminal state
for i in range(100):
env.render()
# The learned policy gives the greedy action for every state
action = learn.action(state)
# Take one step following the greedy policy
state,reward,done,_ = env.step(action)
total_reward+=reward
time.sleep(1)
if done:
# Render the frame in which the agent has entered a terminal state
env.render()
break
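Policy iteration and value iteration should produce the same greedy policy and (near-)identical values on this small MDP. A quick sketch for checking that agreement, assuming pi_learn and vi_learn are two Learn objects solved with policy_iterate and value_iteration against the same environment (both names are made up for illustration):
def compare_solvers(pi_learn, vi_learn, mdp):
    # Compare greedy actions and state values of the two solvers
    for s in mdp.states:
        if s in mdp.terminate_states:
            continue
        assert pi_learn.pi[s] == vi_learn.pi[s], 'policies differ at state %d' % s
        assert abs(pi_learn.v[s] - vi_learn.v[s]) < 1e-3, 'values differ at state %d' % s
    print('policy iteration and value iteration agree')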
import gym
import random
import time
env = gym.make('MazeWorld-v0')
class Learn:
def __init__(self,maze_mdp):
# Initialize the state-value function
self.v = dict()  # the value function is stored as a dictionary
for state in maze_mdp.states:
self.v[state]=0
# Initialize the policy; for each state only actions with a valid transition are chosen
self.pi=dict()
# random.choice(seq): returns a random element of a list, tuple, or string
self.pi[1] = random.choice(['s', 'e'])
self.pi[2] = random.choice(['w', 'e', 's'])
self.pi[3] = random.choice(['w', 's'])
self.pi[4] = random.choice(['s'])
self.pi[5] = random.choice(['e', 'n'])
self.pi[6] = random.choice(['w', 'e', 'n'])
self.pi[7] = random.choice(['w', 'n', 's'])
self.pi[8] = random.choice(['e', 'n', 's'])
self.pi[9] = random.choice(['e', 'n', 's'])
self.pi[10] = random.choice(['w', 'e', 's'])
self.pi[11] = random.choice(['w', 'n', 's'])
self.pi[12] = random.choice(['e', 's'])
self.pi[13] = random.choice(['w', 'e', 's'])
self.pi[14] = random.choice(['w', 'e', 'n'])
self.pi[15] = random.choice(['w', 'e', 'n'])
self.pi[16] = random.choice(['w', 'e', 'n'])
self.pi[17] = random.choice(['e', 'n'])
self.pi[18] = random.choice(['w', 'n'])
# Policy iteration
def policy_iterate(self,maze_mdp):
for i in range(100):
# policy evaluation
self.policy_evaluate(maze_mdp)
# policy improvement
self.policy_improve(maze_mdp)
# Policy evaluation
def policy_evaluate(self,maze_mdp):
# Sweep at most 1000 times; break early once the value function has converged
for i in range(1000):
delta = 0.0
# Sweep the whole state space so that every state's value gets updated
for state in maze_mdp.states:
if state in maze_mdp.terminate_states:
continue
action = self.pi[state]
t,s,r = maze_mdp.transform(state,action)
if s!=-1:
new_v = r +maze_mdp.gamma * self.v[s]
delta += abs(self.v[state]-new_v)
self.v[state] = new_v
if delta <1e-6:
break
# Policy improvement
# Derive the greedy policy from the current value function and use it as the new policy
def policy_improve(self,maze_mdp):
# Iterate over the whole state space
for state in maze_mdp.states:
if state in maze_mdp.terminate_states: continue
a1 = self.pi[state]
t,s,r = maze_mdp.transform(state,a1)
# If a (state, action) pair is not in the transition table, its action value does not exist and the state value stays unchanged
if s!=-1:
v1 = r+maze_mdp.gamma*self.v[s]
# Iterate over the actions available in this state
# Greedy policy: keep the action with the largest one-step lookahead value
for action in maze_mdp.actions:
t,s,r = maze_mdp.transform(state,action)
if s != -1:
if v1<r+maze_mdp.gamma*self.v[s]:
a1 = action
v1 = r +maze_mdp.gamma*self.v[s]
# Update the policy
self.pi[state] = a1
# Return the greedy action for a state
def action(self,state):
return self.pi[state]
gm = env.env  # unwrap env to access the underlying MazeEnv instance
# Reset the environment and get the agent's initial state
state = env.reset()
# Instantiate the agent; this initializes the value function and a random policy
learn=Learn(gm)
# Policy iteration: alternate policy evaluation and policy improvement
learn.policy_iterate(gm)
total_reward = 0
# Take at most 100 steps to reach a terminal state
for i in range(100):
env.render()
# The learned policy gives the greedy action for every state
action = learn.action(state)
# Take one step following the greedy policy
state,reward,done,_ = env.step(action)
total_reward+=reward
time.sleep(1)
if done:
# Render the frame in which the agent has entered a terminal state
env.render()
break
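Since every unrewarded step in the maze costs -1, the converged policy should trace a shortest path to the exit at state 11. A small sketch that prints the greedy path from a chosen start state without rendering; it reuses the gm and learn objects from the script above, and the start state 1 is arbitrary:
def greedy_path(mdp, learn, start, max_steps=30):
    # Follow the greedy policy through the transition table and record the visited states
    path = [start]
    state = start
    for _ in range(max_steps):
        if state in mdp.terminate_states:
            break
        _, state, _ = mdp.transform(state, learn.pi[state])
        path.append(state)
    return path

print(greedy_path(gm, learn, 1))  # the list of states visited on the greedy route from state 1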
maze_mdp.py
import logging  # logging module
import numpy
import random
from gym import spaces
from gym.utils import seeding  # needed by _seed below
import gym
logger = logging.getLogger(__name__)
# Set this in SOME subclasses
class MazeEnv(gym.Env):
metadata = {
'render.modes':['human','rgb_array'],
'video.frames_per_second':2
}
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def __init__(self):
# State space
self.states = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
# The robot's on-screen position depends on its current state.
# The center coordinates of every state cell are precomputed and stored in two lists in the constructor
self.x = [125, 175, 225, 325, 125, 175, 225, 325, 225, 275, 325, 125, 175, 225, 275, 325, 125, 175]
self.y = [325, 325, 325, 325, 275, 275, 275, 275, 225, 225, 225, 175, 175, 175, 175, 175, 125, 125]
self.terminate_states = dict()  # terminal states, stored as a dictionary
self.terminate_states[11] = 1
# Action space
self.actions = ['n','e','s','w']
# Reward function
self.rewards = dict()  # rewards are stored in a dictionary
self.rewards['8_s'] = 20.0   # entering the exit (state 11) from above
self.rewards['10_e'] = 20.0  # entering the exit from the left
self.rewards['16_n'] = 20.0  # entering the exit from below
# State transition table (deterministic)
self.t = dict()
self.t['1_e'] = 2
self.t['1_s'] = 5
self.t['2_w'] = 1
self.t['2_e'] = 3
self.t['2_s'] = 6
self.t['3_w'] = 2
self.t['3_s'] = 7
self.t['4_s'] = 8
self.t['5_e'] = 6
self.t['5_n'] = 1
self.t['6_w'] = 5
self.t['6_e'] = 7
self.t['6_n'] = 2
self.t['7_w'] = 6
self.t['7_s'] = 9
self.t['7_n'] = 3
self.t['8_s'] = 11
self.t['8_n'] = 4
self.t['9_e'] = 10
self.t['9_s'] = 14
self.t['9_n'] = 7
self.t['10_w'] = 9
self.t['10_e'] = 11
self.t['10_s'] = 15
self.t['11_w'] = 10
self.t['11_s'] = 16
self.t['11_n'] = 8
self.t['12_e'] = 13
self.t['12_s'] = 17
self.t['13_w'] = 12
self.t['13_e'] = 14
self.t['13_s'] = 18
self.t['14_w'] = 13
self.t['14_e'] = 15
self.t['14_n'] = 9
self.t['15_w'] = 14
self.t['15_e'] = 16
self.t['15_n'] = 10
self.t['16_w'] = 15
self.t['16_n'] = 11
self.t['17_e'] = 18
self.t['17_n'] = 12
self.t['18_w'] = 17
self.t['18_n'] = 13
# Discount factor
self.gamma = 0.8
self.viewer = None
self.state = None
# Return the transition table, the next state, and the immediate reward for a (state, action) pair
def transform(self, state, action):
# If the (state, action) pair has no entry in the transition table, the next state is set to -1
s = -1
r = 0
key = '%i_%s' % (state, action)
if key in self.rewards:
r = self.rewards[key]
# State transition
if key in self.t:
s = self.t[key]
return self.t, s, r
def getTerminal(self):
return self.terminate_states
def getGamma(self):
return self.gamma
def getStates(self):
return self.states
def getAction(self):
return self.actions
def getTerminate_states(self):
return self.terminate_states
def setAction(self,s):
self.state = s
# step() takes an action as input and
# returns the next state, the reward, a done flag, and a debug-info dict;
# an empty dict {} is used when there is no debug info
def step(self,action):
# Current state of the system
state = self.state
# Check whether the current state is terminal
if state in self.terminate_states:
return state,0,True,{}
# Combine the state and the action into a dictionary key
key = "%d_%s"%(state,action)
# State transition
if key in self.t:
next_state = self.t[key]
else:
next_state = state
self.state = next_state
is_terminal = False
if next_state in self.terminate_states:
is_terminal = True
if key not in self.rewards:
r = -1.0
else:
r = self.rewards[key]
return next_state,r,is_terminal,{}
# reset()
# reset usually initializes the robot to a random state
def reset(self):
self.state = self.states[int(random.random()*len(self.states))]
return self.state
# render() acts as the graphics engine: it draws the objects in the environment
def render(self, mode='human', close=False):
if close:
if self.viewer is not None:
self.viewer.close()
self.viewer = None
return
screen_width = 450
screen_height = 450
if self.viewer is None:
# Use gym's rendering module for drawing
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
# Draw the maze: 12 line segments in total
self.line1 = rendering.Line((100, 350), (350, 350))
self.line2 = rendering.Line((100, 300), (350, 300))
self.line3 = rendering.Line((100, 250), (350, 250))
self.line4 = rendering.Line((100, 200), (350, 200))
self.line5 = rendering.Line((100, 150), (350, 150))
self.line6 = rendering.Line((100, 100), (350, 100))
self.line7 = rendering.Line((100, 350), (100, 100))
self.line8 = rendering.Line((150, 350), (150, 100))
self.line9 = rendering.Line((200, 350), (200, 100))
self.line10 = rendering.Line((250, 350), (250, 100))
self.line11 = rendering.Line((300, 350), (300, 100))
self.line12 = rendering.Line((350, 350), (350, 100))
# Walls are drawn as filled black squares
# First wall block
self.wall1 = rendering.make_polygon([(250,350),(300,350), (300,300),(250,300)],filled=True)
self.wall1.set_color(0, 0, 0)
# Second wall block
self.wall2 = rendering.make_polygon([(250,300),(300,300),(300,250),(250,250)],filled=True)
self.wall2.set_color(0, 0, 0)
# Third wall block
self.wall3 = rendering.make_polygon([(100, 250), (150, 250), (150,200),(100,200)], filled=True)
self.wall3.set_color(0, 0, 0)
# Fourth wall block
self.wall4 = rendering.make_polygon([(150, 250), (200, 250), (200,200),(150,200)], filled=True)
self.wall4.set_color(0, 0, 0)
# Fifth wall block
self.wall5 = rendering.make_polygon([(200, 150), (250, 150), (250, 100),(200, 100)], filled=True)
self.wall5.set_color(0, 0, 0)
# Sixth wall block
self.wall6 = rendering.make_polygon([(250, 150), (300, 150), (300, 100),(250, 100)], filled=True)
self.wall6.set_color(0, 0, 0)
# Seventh wall block
self.wall7 = rendering.make_polygon([(300, 150), (350, 150), (350, 100),(300, 100)], filled=True)
self.wall7.set_color(0, 0, 0)
# Exit (goal), drawn as a triangle
self.goal = rendering.make_polygon([(325,245),(300,205),(350,205)],
filled=True)
self.goal.set_color(1, 0.9, 0)
# Robot, drawn as a circle of a different color
self.robot = rendering.make_circle(15)
self.robotrans = rendering.Transform()
self.robot.add_attr(self.robotrans)
self.robot.set_color(0.8, 0.6, 0.4)
# Set the color of the 12 lines and add all created objects to the viewer
self.line1.set_color(0, 0, 0)
self.line2.set_color(0, 0, 0)
self.line3.set_color(0, 0, 0)
self.line4.set_color(0, 0, 0)
self.line5.set_color(0, 0, 0)
self.line6.set_color(0, 0, 0)
self.line7.set_color(0, 0, 0)
self.line8.set_color(0, 0, 0)
self.line9.set_color(0, 0, 0)
self.line10.set_color(0, 0, 0)
self.line11.set_color(0, 0, 0)
self.line12.set_color(0, 0, 0)
self.viewer.add_geom(self.line1)
self.viewer.add_geom(self.line2)
self.viewer.add_geom(self.line3)
self.viewer.add_geom(self.line4)
self.viewer.add_geom(self.line5)
self.viewer.add_geom(self.line6)
self.viewer.add_geom(self.line7)
self.viewer.add_geom(self.line8)
self.viewer.add_geom(self.line9)
self.viewer.add_geom(self.line10)
self.viewer.add_geom(self.line11)
self.viewer.add_geom(self.line12)
self.viewer.add_geom(self.wall1)
self.viewer.add_geom(self.wall2)
self.viewer.add_geom(self.wall3)
self.viewer.add_geom(self.wall4)
self.viewer.add_geom(self.wall5)
self.viewer.add_geom(self.wall6)
self.viewer.add_geom(self.wall7)
self.viewer.add_geom(self.goal)
self.viewer.add_geom(self.robot)
# Position the robot using the coordinate lists and its current state
if self.state is None: return None
self.robotrans.set_translation(self.x[self.state - 1], self.y[self.state - 1])
return self.viewer.render(return_rgb_array=mode =='rgb_array')
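As with the grid world, 'MazeWorld-v0' must be registered before gym.make can find it. A minimal sketch, again assuming the class lives in an importable module named maze_mdp (id and entry_point are illustrative):
from gym.envs.registration import register

# Hypothetical registration for the maze environment
register(
    id='MazeWorld-v0',
    entry_point='maze_mdp:MazeEnv',
)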
import gym
import random
import time
env = gym.make('MazeWorld-v0')
class Learn:
def __init__(self,maze_mdp):
# Initialize the state-value function
self.v = dict()  # the value function is stored as a dictionary
for state in maze_mdp.states:
self.v[state]=0
# Initialize the policy; for each state only actions with a valid transition are chosen
self.pi=dict()
# random.choice(seq): returns a random element of a list, tuple, or string
self.pi[1] = random.choice(['s', 'e'])
self.pi[2] = random.choice(['w', 'e', 's'])
self.pi[3] = random.choice(['w', 's'])
self.pi[4] = random.choice(['s'])
self.pi[5] = random.choice(['e', 'n'])
self.pi[6] = random.choice(['w', 'e', 'n'])
self.pi[7] = random.choice(['w', 'n', 's'])
self.pi[8] = random.choice(['e', 'n', 's'])
self.pi[9] = random.choice(['e', 'n', 's'])
self.pi[10] = random.choice(['w', 'e', 's'])
self.pi[11] = random.choice(['w', 'n', 's'])
self.pi[12] = random.choice(['e', 's'])
self.pi[13] = random.choice(['w', 'e', 's'])
self.pi[14] = random.choice(['w', 'e', 'n'])
self.pi[15] = random.choice(['w', 'e', 'n'])
self.pi[16] = random.choice(['w', 'e', 'n'])
self.pi[17] = random.choice(['e', 'n'])
self.pi[18] = random.choice(['w', 'n'])
# Value iteration
def value_iteration(self, grid_mdp):
# Outer loop: sweep until the value function converges
for i in range(1000):
delta = 0.0
# Sweep the whole state space
for state in grid_mdp.states:
if state in grid_mdp.terminate_states: continue
a1 = grid_mdp.actions[0]
t, s, r = grid_mdp.transform(state, a1)
# If the first action has no transition entry, start from -inf so that any valid action replaces it
v1 = r + grid_mdp.gamma * self.v[s] if s != -1 else -float('inf')
# Sweep the action space and keep the best action
for action in grid_mdp.actions:
t, s, r = grid_mdp.transform(state, action)
if s != -1:
if v1 < r + grid_mdp.gamma * self.v[s]:
a1 = action
v1 = r + grid_mdp.gamma * self.v[s]
delta += abs(v1 - self.v[state])
self.pi[state] = a1
self.v[state] = v1
if delta < 1e-6:
break
# Return the greedy action for a state
def action(self,state):
return self.pi[state]
gm = env.env  # unwrap env to access the underlying MazeEnv instance
# Reset the environment and get the agent's initial state
state = env.reset()
# Instantiate the agent; this initializes the value function and a random policy
learn=Learn(gm)
# Run value iteration
learn.value_iteration(gm)
total_reward = 0
# Take at most 100 steps to reach a terminal state
for i in range(100):
env.render()
# The learned policy gives the greedy action for every state
action = learn.action(state)
# Take one step following the greedy policy
state,reward,done,_ = env.step(action)
total_reward+=reward
time.sleep(1)
if done:
# Render the frame in which the agent has entered a terminal state
env.render()
break
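The rollout accumulates total_reward but never reports it, and the rendering window is left open. An optional closing snippet (env.close() is a standard gym call, though on some versions it is a no-op for custom viewers):
print('steps taken: %d  total reward: %.1f' % (i + 1, total_reward))
env.close()  # release the rendering window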