As shown in the figure below, the maze problem is a path-finding task on a 4x4 grid of 16 cells: starting from the top-left corner, the agent must reach the bottom-right corner in the shortest time (the black cells are obstacles and cannot be entered).
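The grid corresponds to gym's 4x4 FrozenLake map that is rendered in the test output at the end of this section. As an orientation sketch (assuming FrozenLake's usual row-major state numbering), the 16 cells map onto the integer states that index the Q-table as follows:
#  state index (row-major)    FrozenLake map (S=start, F=frozen/walkable, H=hole/obstacle, G=goal)
#   0  1  2  3                S F F F
#   4  5  6  7                F H F H
#   8  9 10 11                F F F H
#  12 13 14 15                H F F G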
# step1: import dependencies
import gym
import numpy as np
import time
# step2
# The Agent is the entity that interacts with the environment.
# predict(): takes an observation (i.e. the state) and returns the greedy action
# sample():  builds on predict() and adds epsilon-greedy exploration
# learn():   takes one transition of training data and performs one update of the Q-table
class SarsaAgent(object):
    def __init__(self, obs_n, act_n, lr=0.001, gamma=0.99, epsilon=0.1):
        """
        :param obs_n: size of the observation (state) space
        :param act_n: size of the action space
        :param lr: learning rate (step size)
        :param gamma: discount factor
        :param epsilon: probability of picking a random action (exploration)
        """
        self.lr = lr
        self.obs_n = obs_n
        self.act_n = act_n
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q_table = np.zeros((obs_n, act_n))
    # Sample an action for the given observation, with exploration
    def sample(self, obs):
        if np.random.uniform(0, 1) < (1.0 - self.epsilon):
            action = self.predict(obs)  # exploit: act greedily w.r.t. the current Q-table
        else:
            action = np.random.choice(self.act_n)  # explore: pick a random action
        return action
    # Predict the greedy action for the given observation (no exploration)
    def predict(self, obs):
        Q_list = self.Q_table[obs, :]
        maxQ = np.max(Q_list)
        act_list = np.where(Q_list == maxQ)[0]  # several actions may share the maximal Q value
        action = np.random.choice(act_list)     # break ties randomly
        return action
    # Learning method, i.e. the Q-table update
    def learn(self, obs, act, reward, next_obs, next_act, done):
        """
        On-policy Sarsa update
        :param obs: observation before the interaction
        :param act: action chosen in this interaction
        :param reward: reward received for this action
        :param next_obs: observation after the interaction
        :param next_act: action chosen for the next step by the same behaviour policy
        :param done: whether the episode has ended
        :return: None
        """
        predict_Q = self.Q_table[obs, act]
        if done:
            target_Q = reward  # terminal state: the target is just the reward
        else:
            target_Q = reward + self.gamma * self.Q_table[next_obs, next_act]  # Sarsa target
        self.Q_table[obs, act] = predict_Q + self.lr * (target_Q - predict_Q)
    # Save the Q-table to a file
    def save(self):
        npy_file = './q_table.npy'
        np.save(npy_file, self.Q_table)
        print(npy_file + ' saved.')
    # Load the Q-table from a file
    def restore(self, npy_file='./q_table.npy'):
        self.Q_table = np.load(npy_file)
        print(npy_file + ' loaded.')
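As a quick illustration of the agent interface before wiring it to a real environment, the snippet below feeds one made-up transition through sample() and learn(); the sizes obs_n=16 and act_n=4 and the transition values are assumptions chosen to match the 4x4 grid used later.
# Illustrative smoke test of SarsaAgent with made-up values (separate from the maze training below)
toy_agent = SarsaAgent(obs_n=16, act_n=4, lr=0.1, gamma=0.9, epsilon=0.1)
toy_action = toy_agent.sample(0)  # epsilon-greedy action for state 0
toy_agent.learn(obs=0, act=toy_action, reward=0.0,
                next_obs=4, next_act=toy_agent.sample(4), done=False)
print('toy Q value after one update:', toy_agent.Q_table[0, toy_action])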
# step3
# Training & testing
def train_episode(env, agent, render=False):
    total_steps = 0
    total_reward = 0
    obs = env.reset()
    action = agent.sample(obs)
    while True:
        next_obs, reward, done, _ = env.step(action)
        next_action = agent.sample(next_obs)
        # Sarsa update: uses the action actually chosen for the next step
        agent.learn(obs, action, reward, next_obs, next_action, done)
        action = next_action
        obs = next_obs
        total_reward += reward
        total_steps += 1
        if render:
            env.render()  # render a new frame
        if done:
            break
    return total_reward, total_steps
def test_episode(env, agent):
    total_reward = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)  # greedy policy, no exploration during testing
        next_obs, reward, done, _ = env.step(action)
        total_reward += reward
        obs = next_obs
        # time.sleep(0.5)
        env.render()
        if done:
            break
    return total_reward
# step4: create the environment and the agent, then start training
# Use gym to create the FrozenLake maze environment; is_slippery=False makes it deterministic and easier
env = gym.make('FrozenLake-v0', is_slippery=False)  # actions: 0 left, 1 down, 2 right, 3 up
# Create an agent instance and pass in its hyperparameters
agent = SarsaAgent(
    obs_n=env.observation_space.n,
    act_n=env.action_space.n,
    lr=0.001,
    gamma=0.99,
    epsilon=0.1
)
# Train for 200 episodes, printing the steps and reward every 10 episodes
for episode in range(200):
    ep_reward, ep_steps = train_episode(env, agent, False)
    if episode % 10 == 0:
        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))
# After training, check how well the learned policy performs
test_reward = test_episode(env, agent)
print('test reward = %.1f' % (test_reward))
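The learned Q-table could also be persisted and reloaded with the save() and restore() helpers defined on the agent; a minimal, optional sketch (the path './q_table.npy' is the default used by those methods):
agent.save()                    # writes ./q_table.npy
agent.restore('./q_table.npy')  # loads the Q-table back into the agent
Running the script produces output like the following: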
Episode 0: steps = 8 , reward = 0.0
Episode 10: steps = 15 , reward = 0.0
Episode 20: steps = 7 , reward = 0.0
Episode 30: steps = 18 , reward = 0.0
Episode 40: steps = 10 , reward = 0.0
Episode 50: steps = 37 , reward = 0.0
Episode 60: steps = 13 , reward = 0.0
Episode 70: steps = 5 , reward = 0.0
Episode 80: steps = 6 , reward = 1.0
Episode 90: steps = 6 , reward = 1.0
Episode 100: steps = 7 , reward = 1.0
Episode 110: steps = 6 , reward = 1.0
Episode 120: steps = 6 , reward = 1.0
Episode 130: steps = 7 , reward = 1.0
Episode 140: steps = 6 , reward = 1.0
Episode 150: steps = 6 , reward = 1.0
Episode 160: steps = 6 , reward = 1.0
Episode 170: steps = 6 , reward = 1.0
Episode 180: steps = 6 , reward = 1.0
Episode 190: steps = 7 , reward = 1.0
(Right)
SFFF
FHFH
FFFH
HFFG
(Right)
SFFF
FHFH
FFFH
HFFG
(Down)
SFFF
FHFH
FFFH
HFFG
(Down)
SFFF
FHFH
FFFH
HFFG
(Down)
SFFF
FHFH
FFFH
HFFG
(Right)
SFFF
FHFH
FFFH
HFFG
test reward = 1.0
Process finished with exit code 0