Reinforcement Learning in Practice (1): Solving the FrozenLake Problem with MDP Policy Iteration

1. Policy Iteration Code
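For context, here is a brief sketch of the update rules the code below implements (standard policy iteration; the symbols P, r, gamma, V, and pi follow the usual MDP notation rather than anything specific to this post). Policy evaluation repeatedly applies the Bellman expectation backup for the current policy, and policy improvement then acts greedily with respect to the resulting value function:

$$V(s) \leftarrow \sum_{s'} P(s' \mid s, \pi(s))\,\bigl[r(s, \pi(s), s') + \gamma V(s')\bigr]$$

$$\pi'(s) \leftarrow \arg\max_a \sum_{s'} P(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma V(s')\bigr]$$

The outer loop stops once the improved policy no longer changes.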

# Use policy iteration to solve the FrozenLake problem
import numpy as np
import gym

# Run one episode and return the total discounted reward
def Run_episode(env, policy, gamma = 1.0, render = False):
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done, _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx * reward)
        step_idx += 1
        if done:
            break
    return total_reward


# Average the episode return of a policy over n episodes
def evaluate_policy(env, policy, gamma = 1.0, n = 100):
    scores = [Run_episode(env, policy, gamma, False) for _ in range(n)]
    return np.mean(scores)


# Policy evaluation: compute the state-value function V for a fixed policy
def Policy_evaluation(env, gamma, policy, v):
    # threshold for convergence
    theta = 1e-10
    while True:
        delta = 0
        previous_v = np.copy(v)
        for state in range(env.observation_space.n):
            action = policy[state]
            present_v = 0
            # P[state][action] is a list of (probability, next state, reward, done) tuples
            for p, s_next, r, _ in env.env.P[state][action]:
                # Bellman expectation backup for the state value
                present_v = present_v + p * (r + gamma * previous_v[s_next])
            v[state] = present_v
            # track the largest change over all states
            if delta < abs(v[state] - previous_v[state]):
                delta = abs(v[state] - previous_v[state])
        # stop once the largest update falls below the threshold
        if delta < theta:
            return v


# Policy improvement: act greedily with respect to the current value function
def Policy_improvement(env, v, gamma):
    Policy = np.zeros(env.observation_space.n, dtype=int)
    for state in range(env.observation_space.n):
        q_value = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            # p: transition probability, s_next: next state s', r: reward for s -> s', _: done flag (True/False)
            # compute the state-action value Q(s, a)
            q_value[action] = sum([p * (r + gamma * v[s_next]) for p, s_next, r, _ in env.env.P[state][action]])
        Policy[state] = np.argmax(q_value)
    return Policy

# Policy iteration: alternate evaluation and improvement until the policy is stable
def Policy_Iteration(env, gamma):
    # np.random.choice(a, size=b) draws b values uniformly from range(a), giving a random initial policy
    policy = np.random.choice(env.action_space.n, size=env.observation_space.n)
    v = np.zeros(env.observation_space.n)
    step = 0
    while True:
        step += 1
        # first step: policy evaluation
        v = Policy_evaluation(env=env, gamma=gamma, policy=policy, v=v)
        # second step: policy improvement
        new_Policy = Policy_improvement(env=env, v=v, gamma=gamma)
        if np.all(new_Policy == policy):
            print('Policy-Iteration converged at step:', step)
            print("the best policy is:", new_Policy)
            return new_Policy
        policy = new_Policy


if __name__ == '__main__':
    env_name = 'FrozenLake-v0'
    # the environment is a 4*4 grid of 16 cells, numbered 0-15
    env = gym.make(env_name)
    optimal_policy = Policy_Iteration(env, gamma=1.0)
    score = evaluate_policy(env, optimal_policy, gamma=1.0)
    print("Average score =", score)

