Reaching the goal in the shortest distance
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # fix the random seed so runs are reproducible

N_STATES = 6                  # length of the 1-D world (distance from start to the goal)
ACTIONS = ['left', 'right']   # available actions
EPSILON = 0.9                 # probability of choosing the greedy (best-known) action
ALPHA = 0.1                   # learning rate
GAMMA = 0.9                   # reward discount factor
MAX_EPISODES = 13             # maximum number of episodes
FRESH_TIME = 0.3              # pause between moves when rendering (seconds)
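# Note (added for exposition): ALPHA, GAMMA and EPSILON enter the standard tabular
# Q-learning update that rl() below implements,
#     Q(S, A) <- Q(S, A) + ALPHA * (R + GAMMA * max_a Q(S', a) - Q(S, A)),
# with EPSILON the probability of acting greedily instead of exploring at random.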
def build_q_table(n_states, actions):
    # Build the Q-table as a DataFrame of zeros
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),  # initial Q values
        columns=actions,                     # actions as column labels; rows keep the default index
    )
    return table
def choose_action(state, q_table):
    # Epsilon-greedy action selection
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # act randomly 10% of the time, and also while this state's values are still all zero
        action_name = np.random.choice(ACTIONS)
    else:
        # act greedily (pick the best-known action) 90% of the time
        action_name = state_actions.idxmax()
    return action_name
def get_env_feedback(S, A):
    # Environment dynamics: how the agent moves and what reward it receives
    if A == 'right':  # move right
        if S == N_STATES - 2:  # the next step reaches the terminal
            S_ = 'terminal'
            R = 1  # the only non-zero reward
        else:
            S_ = S + 1
            R = 0
    else:  # move left
        R = 0
        if S == 0:
            S_ = S  # already at the left wall
        else:
            S_ = S - 1
    return S_, R
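# Worked example (added for illustration, not part of the original script):
# with N_STATES = 6 the world is '-----T', so
#   get_env_feedback(4, 'right') -> ('terminal', 1)   stepping onto T earns the only reward
#   get_env_feedback(0, 'left')  -> (0, 0)            bumping the left wall keeps S at 0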
def update_env(S, episode, step_counter):
    # Render the environment as a one-line string, e.g. '--o--T'
    env_list = ['-'] * (N_STATES - 1) + ['T']  # '-----T': our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
def rl():
    # Main training loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # start position
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take the action, observe the next state and reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()  # next state is not terminal
            else:
                q_target = R  # next state is terminal
                is_terminated = True  # end this episode
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update Q(S, A)
            S = S_  # move to the next state
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
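To see what a single update does, here is one step worked by hand with the script's constants (illustrative only, not part of the program above): starting from Q(4, 'right') = 0, moving right reaches the terminal and returns R = 1, so the target is just R and GAMMA plays no role in this step; over later episodes the discounted value propagates leftwards along the corridor.

# Illustration only: one hand-worked Q-learning update with ALPHA = 0.1
q_old = 0.0                        # current estimate of Q(S=4, A='right')
q_target = 1                       # next state is terminal, so the target is just the reward
q_new = q_old + 0.1 * (q_target - q_old)
print(q_new)                       # 0.1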
The maze example has three parts in total: the environment, the decision-making brain, and the interaction/update loop.
The environment part does no learning of its own.
Below is the update (interaction) part.
from maze_env import Maze             # environment
from RL_brain import QLearningTable   # the learning "brain"


def update():
    for episode in range(100):
        # initial observation (starting position)
        observation = env.reset()
        while True:
            # refresh the environment
            env.render()
            # choose an action based on the observation
            # (stringified so the coordinate list can be used as a Q-table row label)
            action = RL.choose_action(str(observation))
            # take the action and get the next observation, reward and done flag
            observation_, reward, done = env.step(action)
            # learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))
            # swap observations and continue
            observation = observation_
            # end the episode once the goal or a hole is reached
            if done:
                break
    # end of training
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
The decision-making (brain) part
import numpy as np
import pandas as pd


class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions        # list of available actions
        self.lr = learning_rate       # learning rate
        self.gamma = reward_decay     # reward discount factor
        self.epsilon = e_greedy       # probability of acting greedily
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        # epsilon-greedy action selection
        if np.random.uniform() < self.epsilon:
            # act greedily
            state_action = self.q_table.loc[observation, :]
            # several actions may share the maximum value; pick one of them at random
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # act randomly
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()  # next state is not terminal
        else:
            q_target = r  # next state is terminal
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update the Q-table

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # add a new, all-zero row for this state
            # (DataFrame.append was removed in pandas 2.0; enlarging via .loc works on old and new pandas)
            self.q_table.loc[state] = [0.0] * len(self.actions)
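The brain does not depend on the Tkinter maze at all: any environment that hands it string (or otherwise hashable) states and a scalar reward will do. A minimal sketch of driving it by hand, with the states 's0' and 's1' made up purely for illustration:

# Minimal sketch (illustration only): use QLearningTable without the maze
agent = QLearningTable(actions=[0, 1])
a = agent.choose_action('s0')          # rows are created lazily by check_state_exist
agent.learn('s0', a, 0, 's1')          # an ordinary transition with zero reward
agent.learn('s1', agent.choose_action('s1'), 1, 'terminal')  # episode ends with reward 1
print(agent.q_table)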
The environment part
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))  # width x height
        self._build_maze()
    def _build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)

        # create grids
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin
        origin = np.array([20, 20])

        # hell
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black')
        # hell
        hell2_center = origin + np.array([UNIT, UNIT * 2])
        self.hell2 = self.canvas.create_rectangle(
            hell2_center[0] - 15, hell2_center[1] - 15,
            hell2_center[0] + 15, hell2_center[1] + 15,
            fill='black')

        # create oval
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')

        # create red rect
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')

        # pack all
        self.canvas.pack()
    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')
        # return observation
        return self.canvas.coords(self.rect)
    def step(self, action):
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:    # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rect, base_action[0], base_action[1])  # move agent

        s_ = self.canvas.coords(self.rect)  # next state

        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 1
            done = True
            s_ = 'terminal'
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
            s_ = 'terminal'
        else:
            reward = 0
            done = False

        return s_, reward, done
    def render(self):
        time.sleep(0.1)
        self.update()


def update():
    for t in range(10):
        s = env.reset()
        while True:
            env.render()
            a = 1
            s, r, done = env.step(a)
            if done:
                break


if __name__ == '__main__':
    env = Maze()
    env.after(100, update)
    env.mainloop()