reinforce_learning study notes

 

policy_gradient

import numpy as np


def get_action_and_next_s(pi, s):  # take one step under policy pi and return the action and the next state
    directions = ["up", "right", "down", "left"]
    next_direction = np.random.choice(directions, p=pi[s, :])  # sample a direction according to pi[s, :]
    if next_direction == "up":
        action = 0
        s_next = s - 3  # move up: -3
    elif next_direction == "right":
        action = 1
        s_next = s + 1  # move right: +1
    elif next_direction == "down":
        action = 2
        s_next = s + 3  # move down: +3
    elif next_direction == "left":
        action = 3
        s_next = s - 1  # move left: -1
    return [action, s_next]


def goal_maze_ret_s_a(pi):  # main loop: each call runs one episode to the goal and returns the state-action history
    s = 0  # start state
    s_a_history = []
    while True:
        [action, next_s] = get_action_and_next_s(pi, s)
        s_a_history.append([s, action])
        s = next_s
        if s == 8:  # reached the goal
            s_a_history.append([8, np.nan])  # no action is taken at the goal
            break
    return s_a_history


def update_theta(theta, pi, s_a_history):  # update theta (the key step: pi is recomputed from the new theta afterwards)
    eta = 0.1  # learning rate
    T = len(s_a_history) - 1  # total number of steps (the goal entry carries no action)
    [m, n] = theta.shape  # size of the theta matrix
    delta_theta = theta.copy()  # Δtheta; use copy() so it does not share memory with theta
    for i in range(0, m):
        for j in range(0, n):
            if not (np.isnan(theta[i, j])):  # only where theta is not nan
                SA_i = [SA for SA in s_a_history if SA[0] == i]  # all visits to state i
                SA_ij = [SA for SA in s_a_history if SA == [i, j]]  # visits to state i where action j was taken
                N_i = len(SA_i)  # number of times state i was visited
                N_ij = len(SA_ij)  # number of times action j was taken in state i
                delta_theta[i, j] = (N_ij - pi[i, j] * N_i) / T  # dividing by T keeps each episode's weight small (a bad episode may push theta the wrong way, but only slightly)
    new_theta = theta + eta * delta_theta  # nudges theta toward actions taken more often than the current policy expects
    return new_theta
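
Written out as a formula (my own notation, matching N_i, N_ij, eta and T in the code above), the update implemented by update_theta is

\Delta\theta_{s_i, a_j} = \frac{N(s_i, a_j) - \pi_\theta(s_i, a_j)\, N(s_i)}{T}, \qquad \theta \leftarrow \theta + \eta\, \Delta\theta

so theta grows for actions that were taken more often than the current policy predicts, and shrinks otherwise.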


def softmax_convert_into_pi_from_theta(theta):  # convert the policy parameters theta into the action policy pi
    [m, n] = theta.shape  # size of the theta matrix
    pi = np.zeros((m, n))
    exp_theta = np.exp(theta)  # element-wise exponential of theta
    for i in range(0, m):
        pi[i, :] = exp_theta[i, :] / np.nansum(exp_theta[i, :])  # softmax also handles negative theta values
    pi = np.nan_to_num(pi)  # replace nan with 0
    return pi
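
For reference, the row-wise softmax computed above (np.nan marks blocked moves; np.nansum leaves them out of the denominator and np.nan_to_num then sets them to 0):

\pi_\theta(s, a) = \frac{\exp(\theta_{s, a})}{\sum_{a'} \exp(\theta_{s, a'})}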


theta = np.array([[np.nan, 1, 1, np.nan],  # s0
                  [np.nan, 1, np.nan, 1],  # s1
                  [np.nan, np.nan, 1, 1],  # s2
                  [1, 1, 1, np.nan],  # s3
                  [np.nan, np.nan, 1, 1],  # s4
                  [1, np.nan, np.nan, np.nan],  # s5
                  [1, np.nan, np.nan, np.nan],  # s6
                  [1, 1, np.nan, np.nan],  # s7; the goal s8 needs no row
                  ])  # rows are states s0-s7 of the maze; columns ↑, →, ↓, ← hold 1 if the move is allowed and np.nan if it is blocked
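# Sanity check (my own reading of the 1/np.nan pattern above): theta describes the 3x3 maze
#   s0 s1 s2
#   s3 s4 s5
#   s6 s7 s8
# with internal walls between s1-s4, s4-s5, s5-s8 and s6-s7; the start is s0 and the goal is s8.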
pi = softmax_convert_into_pi_from_theta(theta)
stop_epsilon = 0.00001  # stop learning when the policy change is smaller than 1e-5
is_continue = True
while is_continue:  # repeat until the flag becomes False
    s_a_history = goal_maze_ret_s_a(pi)  # explore the maze once with the current policy pi
    new_theta = update_theta(theta, pi, s_a_history)  # update theta
    new_pi = softmax_convert_into_pi_from_theta(new_theta)  # update pi
    if np.sum(np.abs(new_pi - pi)) < stop_epsilon:
        is_continue = False
    else:
        theta = new_theta
        pi = new_pi
np.set_printoptions(precision=3, suppress=True)  # 3 significant digits, suppress scientific notation
print(pi)
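
As a quick check of the learned policy (a minimal sketch reusing the pi computed above), the most likely route can be traced greedily:

s = 0
route = [s]
moves = [-3, +1, +3, -1]  # state change for up, right, down, left
for _ in range(20):  # step cap in case pi has not fully converged
    if s == 8:
        break
    a = int(np.argmax(pi[s, :]))  # pi holds 0 for blocked moves, so argmax is safe
    s = s + moves[a]
    route.append(s)
print(route)  # should settle on the direct path [0, 3, 4, 7, 8]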

sarsa_qlearning

import numpy as np


def get_action(s, Q, epsilon, pi_0):
    direction = ["up", "right", "down", "left"]
    if np.random.rand() < epsilon:
        next_direction = np.random.choice(direction, p=pi_0[s, :])  # with probability epsilon, pick a random direction according to pi_0
    else:
        next_direction = direction[np.nanargmax(Q[s, :])]  # with probability 1-epsilon, act greedily on the Q table (pi_0 plays no role here)
    if next_direction == "up":
        action = 0
    elif next_direction == "right":
        action = 1
    elif next_direction == "down":
        action = 2
    elif next_direction == "left":
        action = 3
    return action
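
In formula form (my notation), get_action is plain epsilon-greedy:

a = \begin{cases} a \sim \pi_0(s, \cdot) & \text{with probability } \varepsilon \\ \arg\max_{a'} Q(s, a') & \text{with probability } 1 - \varepsilon \end{cases}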


def get_s_next(s, a):
    direction = ["up", "right", "down", "left"]
    next_direction = direction[a]
    if next_direction == "up":
        s_next = s - 3
    elif next_direction == "right":
        s_next = s + 1
    elif next_direction == "down":
        s_next = s + 3
    elif next_direction == "left":
        s_next = s - 1
    return s_next


def Sarsa_or_Qlearning(s, a, r, s_next, a_next, Q, eta, gamma):  # the core of Sarsa/Q-learning: update the Q table
    if s_next == 8:  # the goal is terminal, so there is no bootstrap term
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        Q[s, a] = Q[s, a] + eta * (r + gamma * Q[s_next, a_next] - Q[s, a])  # Sarsa
        # Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])  # Q-learning
    return Q
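
The two updates above written out (eta is the learning rate, gamma the discount factor, s' and a' the next state and next action):

Sarsa:      Q(s, a) \leftarrow Q(s, a) + \eta \left[ r + \gamma Q(s', a') - Q(s, a) \right]
Q-learning: Q(s, a) \leftarrow Q(s, a) + \eta \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]

Sarsa bootstraps with the action a' that will actually be taken next (on-policy), while Q-learning bootstraps with the best action at s' (off-policy); swapping the two lines marked Sarsa / Q-learning switches between them.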


def goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi):  # Q table, exploration probability epsilon, learning rate eta, discount factor gamma, random policy pi
    s = 0  # start state
    a = a_next = get_action(s, Q, epsilon, pi)  # initial action
    while True:
        a = a_next
        s_next = get_s_next(s, a)
        if s_next == 8:
            r = 1  # reward 1 for reaching the goal
            a_next = np.nan  # no next action at the terminal state
        else:
            r = 0  # no reward otherwise
            a_next = get_action(s_next, Q, epsilon, pi)  # Sarsa needs the next action before the update
        Q = Sarsa_or_Qlearning(s, a, r, s_next, a_next, Q, eta, gamma)
        if s_next == 8:
            break
        else:
            s = s_next
    return Q


theta = np.array([[np.nan, 1, 1, np.nan],  # s0
                  [np.nan, 1, np.nan, 1],  # s1
                  [np.nan, np.nan, 1, 1],  # s2
                  [1, 1, 1, np.nan],  # s3
                  [np.nan, np.nan, 1, 1],  # s4
                  [1, np.nan, np.nan, np.nan],  # s5
                  [1, np.nan, np.nan, np.nan],  # s6
                  [1, 1, np.nan, np.nan],  # s7
                  ])
[m, n] = theta.shape
pi = np.zeros((m, n))
for i in range(0, m):
    pi[i, :] = theta[i, :] / np.nansum(theta[i, :])  # uniform random policy over the allowed moves
pi = np.nan_to_num(pi)  # replace nan with 0
Q = np.zeros((m, n)) * theta  # 0 where a move is allowed, nan where it is blocked
epsilon = 1  # start fully exploratory
# Option 1: explore fully at random first, then act greedily
# for i in range(1000):
#     Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=1, eta=0.1, gamma=0.9, pi=pi)
# for i in range(1000):
#     Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=0, eta=0.1, gamma=0.9, pi=pi)
# print(Q)
# Option 2: epsilon-greedy with decaying epsilon
for i in range(1000):
    epsilon = epsilon / 2  # halve epsilon every episode, so exploration dies off quickly
    Q = goal_maze_ret_s_a_Q(Q=Q, epsilon=epsilon, eta=0.1, gamma=0.9, pi=pi)
print(Q)
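
To see what the learned Q table means as behaviour (a minimal sketch reusing Q and get_s_next from above), trace the greedy route from the start:

s = 0
route = [s]
for _ in range(20):  # step cap in case Q has not converged
    if s == 8:
        break
    a = int(np.nanargmax(Q[s, :]))  # best action by Q; nan entries (blocked moves) are ignored
    s = get_s_next(s, a)
    route.append(s)
print(route)  # should again come out as [0, 3, 4, 7, 8]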

 
