policy_gradient
import numpy as np
def get_action_and_next_s(pis, s):  # take one step: return the chosen action and the resulting state
    directions = ["up", "right", "down", "left"]
    next_directions = np.random.choice(directions, p=pis[s, :])  # pick a direction according to pis[s, :]
    if next_directions == "up":
        action = 0
        s_next = s - 3  # move up: -3
    elif next_directions == "right":
        action = 1
        s_next = s + 1  # move right: +1
    elif next_directions == "down":
        action = 2
        s_next = s + 3  # move down: +3
    elif next_directions == "left":
        action = 3
        s_next = s - 1  # move left: -1
    return [action, s_next]
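# For reference, the -3/+3/-1/+1 offsets encode this 3x3 state numbering
# (start at state 0, goal at state 8):
#   0 1 2
#   3 4 5
#   6 7 8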
def goal_maze_ret_s_a(pi):  # main episode loop: each call runs from start to goal once and returns the [state, action] history
    s = 0  # start state
    s_a_history = []
    while True:
        [action, next_s] = get_action_and_next_s(pi, s)
        s_a_history.append([s, action])
        s = next_s
        if s == 8:  # reached the goal
            s_a_history.append([8, np.nan])  # no action is taken at the goal
            break
    return s_a_history
def update_theta(theta, pi, s_a_history):  # theta update (the core step; pi is recomputed from the new theta afterwards)
    eta = 0.1  # learning rate
    T = len(s_a_history) - 1  # total number of steps in the episode
    [m, n] = theta.shape  # size of the theta matrix
    delta_theta = theta.copy()  # build delta_theta in a copy so theta itself is not modified in place
    for i in range(0, m):
        for j in range(0, n):
            if not (np.isnan(theta[i, j])):  # skip walls (nan entries)
                SA_i = [SA for SA in s_a_history if SA[0] == i]  # all visits to state i
                SA_ij = [SA for SA in s_a_history if SA == [i, j]]  # visits to state i where action j was taken
                N_i = len(SA_i)  # number of times state i was visited
                N_ij = len(SA_ij)  # number of times action j was taken in state i
                delta_theta[i, j] = (N_ij - pi[i, j] * N_i) / T  # actual minus expected count; dividing by T means a long, inefficient episode contributes with a smaller weight even if it pushes in a bad direction
    new_theta = theta + eta * delta_theta  # raise theta for actions taken more often than the current policy expects
    return new_theta
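# Quick numeric check of the update rule above (a minimal sketch: the one-state
# theta_demo, pi_demo, and history_demo are made up for illustration). An action
# taken more often than the current policy predicts gets a positive delta:
pi_demo = np.array([[0.5, 0.5]])                 # hypothetical state with two equally likely actions
history_demo = [[0, 0], [0, 0], [0, 1], [0, 0]]  # state 0 visited 4 times, action 0 chosen 3 times
T_demo = len(history_demo)                       # 4 steps in this toy episode
N_0 = len([sa for sa in history_demo if sa[0] == 0])     # visits to state 0 -> 4
N_00 = len([sa for sa in history_demo if sa == [0, 0]])  # action 0 taken in state 0 -> 3
print((N_00 - pi_demo[0, 0] * N_0) / T_demo)  # (3 - 0.5*4)/4 = 0.25 > 0, so theta[0, 0] would increase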
def softmax_convert_into_pi_from_theta(theta):  # convert the policy parameters theta into the action policy pi
    [m, n] = theta.shape  # size of the theta matrix
    pi = np.zeros((m, n))
    exp_theta = np.exp(theta)  # elementwise exponential of theta
    for i in range(0, m):
        pi[i, :] = exp_theta[i, :] / np.nansum(exp_theta[i, :])  # row-wise softmax; works even when theta has negative entries
    pi = np.nan_to_num(pi)  # walls (nan) become probability 0
    return pi
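# Small check of the conversion above (a sketch; the single-row theta is made up):
# wall entries (nan) drop out of the normalization and end up with probability 0,
# while the remaining entries still sum to 1.
row_demo = np.exp(np.array([[np.nan, 1.0, 2.0, 1.0]]))
p_demo = np.nan_to_num(row_demo / np.nansum(row_demo))
print(p_demo, p_demo.sum())  # roughly [[0. 0.212 0.576 0.212]], sum 1.0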
theta = np.array([[np.nan, 1, 1, np.nan],      # s0
                  [np.nan, 1, np.nan, 1],      # s1
                  [np.nan, np.nan, 1, 1],      # s2
                  [1, 1, 1, np.nan],           # s3
                  [np.nan, np.nan, 1, 1],      # s4
                  [1, np.nan, np.nan, np.nan], # s5
                  [1, np.nan, np.nan, np.nan], # s6
                  [1, 1, np.nan, np.nan],      # s7; no row is needed for s8 (the goal)
                  ])  # rows are states 0-7 (the 8 non-goal cells); columns are up, right, down, left; 1 = move allowed, np.nan = wall
pi = softmax_convert_into_pi_from_theta(theta)
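# Optional check (not part of the learning loop): one episode under the untrained
# policy is a random walk, so it usually needs far more than the 4 moves of the
# shortest route.
print("random-policy episode length:", len(goal_maze_ret_s_a(pi)) - 1)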
stop_epsilon = 0.00001  # stop learning once the policy change is smaller than 1e-5
is_continue = True
while is_continue:  # repeat until the flag is cleared
    s_a_history = goal_maze_ret_s_a(pi)  # run one episode through the maze under the current policy pi
    new_theta = update_theta(theta, pi, s_a_history)  # update theta
    new_pi = softmax_convert_into_pi_from_theta(new_theta)  # update pi
    if np.sum(np.abs(new_pi - pi)) < stop_epsilon:
        is_continue = False
    else:
        theta = new_theta
        pi = new_pi
np.set_printoptions(precision=3, suppress=True)  # 3 significant digits, no scientific notation
print(pi)
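# With the converged pi still in scope, one more episode shows what was learned;
# it should normally follow the shortest 4-step route 0 -> 3 -> 4 -> 7 -> 8.
final_history = goal_maze_ret_s_a(pi)
print("trajectory:", [sa[0] for sa in final_history])
print("steps to goal:", len(final_history) - 1)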
sarsa_qlearning
import numpy as np
def get_action(s, Q, epsilon, pi_0):
    direction = ["up", "right", "down", "left"]
    if np.random.rand() < epsilon:
        next_direction = np.random.choice(direction, p=pi_0[s, :])  # with probability epsilon: explore using the random policy pi_0
    else:
        next_direction = direction[np.nanargmax(Q[s, :])]  # with probability 1 - epsilon: act greedily from the Q table (pi_0 is not used)
    if next_direction == "up":
        action = 0
    elif next_direction == "right":
        action = 1
    elif next_direction == "down":
        action = 2
    elif next_direction == "left":
        action = 3
    return action
def get_s_next(s, a):
    direction = ["up", "right", "down", "left"]
    next_direction = direction[a]
    if next_direction == "up":
        s_next = s - 3
    elif next_direction == "right":
        s_next = s + 1
    elif next_direction == "down":
        s_next = s + 3
    elif next_direction == "left":
        s_next = s - 1
    return s_next
def Sarsa_or_Qlearning(s, a, r, s_next, a_next, Q, eta, gamma):  # the key step: update the Q table
    if s_next == 8:  # the goal has no successor value
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        Q[s, a] = Q[s, a] + eta * (r + gamma * Q[s_next, a_next] - Q[s, a])  # Sarsa
        # Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])  # Q-learning
    return Q
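# If you would rather switch between the two targets without editing comments inside
# the function above, the update can be split into two functions with the same
# signature (a sketch; the function names are my own):
def sarsa_update(s, a, r, s_next, a_next, Q, eta, gamma):
    if s_next == 8:
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        # on-policy target: uses the action a_next that was actually chosen at s_next
        Q[s, a] = Q[s, a] + eta * (r + gamma * Q[s_next, a_next] - Q[s, a])
    return Q

def q_learning_update(s, a, r, s_next, a_next, Q, eta, gamma):
    if s_next == 8:
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        # off-policy target: uses the greedy value at s_next and ignores a_next
        Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])
    return Q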
def goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi):  # Q table, exploration probability, learning rate, discount factor, random policy
    s = 0  # start state
    a = a_next = get_action(s, Q, epsilon, pi)
    while True:
        a = a_next
        s_next = get_s_next(s, a)
        if s_next == 8:  # reached the goal
            r = 1
            a_next = np.nan  # no action is taken at the goal
        else:
            r = 0
            a_next = get_action(s_next, Q, epsilon, pi)
        Q = Sarsa_or_Qlearning(s, a, r, s_next, a_next, Q, eta, gamma)
        if s_next == 8:
            break
        else:
            s = s_next
    return Q
theta = np.array([[np.nan, 1, 1, np.nan],      # s0
                  [np.nan, 1, np.nan, 1],      # s1
                  [np.nan, np.nan, 1, 1],      # s2
                  [1, 1, 1, np.nan],           # s3
                  [np.nan, np.nan, 1, 1],      # s4
                  [1, np.nan, np.nan, np.nan], # s5
                  [1, np.nan, np.nan, np.nan], # s6
                  [1, 1, np.nan, np.nan],      # s7
                  ])
[m, n] = theta.shape
pi = np.zeros((m, n))
for i in range(0, m):
    pi[i, :] = theta[i, :] / np.nansum(theta[i, :])  # uniform random policy over the allowed moves
pi = np.nan_to_num(pi)  # walls (nan) become probability 0
Q = np.zeros((m, n)) * theta  # Q is 0 for allowed moves and nan at walls (0 * nan = nan)
epsilon = 1  # start fully exploratory
# Method 1: explore at random first, then act fully greedily
# for i in range(1000):
#     Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=1, eta=0.1, gamma=0.9, pi=pi)
# for i in range(1000):
#     Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=0, eta=0.1, gamma=0.9, pi=pi)
# print(Q)
# Method 2: epsilon-greedy with a decaying epsilon
for i in range(1000):
    epsilon = epsilon / 2  # halve epsilon every episode
    Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=epsilon, eta=0.1, gamma=0.9, pi=pi)
print(Q)
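# With the learned Q table still in scope, the greedy route can be read off directly;
# it should normally be the shortest path 0 -> 3 -> 4 -> 7 -> 8.
s = 0
greedy_path = [s]
while s != 8 and len(greedy_path) < 20:       # length cap as a safety guard
    s = get_s_next(s, np.nanargmax(Q[s, :]))  # best non-wall action at s
    greedy_path.append(s)
print("greedy path:", greedy_path)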