Reinforcement Learning in Practice (2): Solving FrozenLake with MDP Value Iteration

1. Solving the FrozenLake problem with value iteration on an MDP
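Value iteration repeatedly sweeps the state space, applying the Bellman optimality backup until the value function stops changing, and the optimal policy is then read off greedily. The two updates implemented by the code below are

$$ v_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a)\,\bigl[ r(s, a, s') + \gamma\, v_k(s') \bigr] $$

$$ \pi^*(s) = \arg\max_{a} \sum_{s'} P(s' \mid s, a)\,\bigl[ r(s, a, s') + \gamma\, v^*(s') \bigr] $$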

import numpy as np
import gym

def run_episode(env, policy, gamma = 1.0, render = False):
    """ Evaluates policy by using it to run an episode and finding its
    total reward.
    args:
    env: gym environment.
    policy: the policy to be used.
    gamma: discount factor.
    render: boolean to turn rendering on/off.

    returns:
    total reward: real value of the total reward received by the agent under the policy.
    """
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done , _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx * reward)
        step_idx += 1
        if done:
            break
    return total_reward


def evaluate_policy(env, policy, gamma = 1.0,  n = 100):
    """ Evaluates a policy by running it n times.
    returns:
    average total reward
    """
    scores = [
            run_episode(env, policy, gamma = gamma, render = False)
            for _ in range(n)]
    return np.mean(scores)


"""function state:value iteration algrorithm """
def Value_Iteration(env,gamma):
    # "the interation step "
    max_iteration = 20000
    theta = 1e-20
    v=np.zeros(env.observation_space.n)
    for i in range(max_iteration):
        pre_v = np.copy(v)
        for state in range(env.observation_space.n):
            # 储存状态s下的,不同的(s-a)价值
            q_s = np.zeros(env.action_space.n)
            for action in range(env.action_space.n):
            # 计算最大的状态价值函数v(v = max(q(s,a)))
              q_s[action] = sum(p * (r + gamma * (pre_v[s_next])) for p, s_next, r, _ in env.env.P[state][action])
            v[state] = max(q_s)
        if (np.sum(np.fabs(pre_v - v)) <= theta):
            print("the value iteration congverged at iteration:",i+1)
            break
    return v


def extract_policy(env, gamma, v):
    """Extract the greedy (optimal) policy from the converged value function."""
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        q_s = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for next_sr in env.env.P[state][action]:
                p, s, r, _ = next_sr
                # accumulate the optimal action value q*(s, a) = sum_s' p * (r + gamma * v(s'))
                q_s[action] += p * (r + gamma * v[s])
        # act greedily with respect to q*(s, a)
        policy[state] = np.argmax(q_s)
    return policy


if __name__ == '__main__':
    env_name = 'FrozenLake-v0'
    env = gym.make(env_name)
    gamma = 1.0
    optimal_value = Value_Iteration(env=env, gamma=gamma)
    optimal_policy = extract_policy(env=env, gamma=gamma, v=optimal_value)
    print(optimal_policy)
    policy_score = evaluate_policy(env, optimal_policy, gamma, n=1000)
    print('Policy average score = ', policy_score)
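Both Value_Iteration and extract_policy rely on the transition table that the classic (pre-0.26) gym toy-text environments expose as env.env.P: P[state][action] is a list of (probability, next_state, reward, done) tuples. A minimal inspection sketch, assuming the same FrozenLake-v0 environment as above (state 0 and action 0 are arbitrary indices chosen for illustration):

import gym

env = gym.make('FrozenLake-v0')
# Each tuple describes one possible outcome of taking action 0 in state 0
# on the slippery 4x4 map: (transition probability, next state, reward, done).
for prob, next_state, reward, done in env.env.P[0][0]:
    print(prob, next_state, reward, done)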
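The printed policy is a flat array of 16 action indices, which is hard to read at a glance. As an optional sketch, a hypothetical helper (print_policy, not part of the original script) can lay it out on the default 4x4 map using gym's FrozenLake action encoding (0=LEFT, 1=DOWN, 2=RIGHT, 3=UP):

import numpy as np

ARROWS = ['←', '↓', '→', '↑']  # FrozenLake actions 0..3: LEFT, DOWN, RIGHT, UP

def print_policy(policy, shape=(4, 4)):
    # Map each action index to an arrow and print it row by row on the grid.
    grid = np.array([ARROWS[int(a)] for a in policy]).reshape(shape)
    for row in grid:
        print(' '.join(row))

# usage, e.g. at the end of the __main__ block above:
# print_policy(optimal_policy)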
