1. Policy Iteration Code
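The script below implements tabular policy iteration for Gym's FrozenLake environment: policy evaluation repeatedly applies the Bellman expectation backup to estimate the value of the current policy, policy improvement then acts greedily with respect to that value function, and the two alternate until the policy stops changing. The resulting policy is finally scored over 100 evaluation episodes.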
import numpy as np
import gym

def Run_episode(env, policy, gamma=1.0, render=False):
    """Roll out one episode under the given policy and return its discounted return."""
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        # policy[obs] is the action index chosen for the current state
        obs, reward, done, _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx) * reward
        step_idx += 1
        if done:
            break
    return total_reward

def evaluate_policy(env, policy, gamma=1.0, n=100):
    """Average the discounted return of the policy over n episodes."""
    scores = [Run_episode(env, policy, gamma, False) for _ in range(n)]
    return np.mean(scores)

def Policy_evaluation(env, gamma, policy, v):
    """Iterative policy evaluation: sweep the Bellman expectation backup until v converges."""
    theta = 1e-10  # convergence threshold
    while True:
        delta = 0
        previous_v = np.copy(v)
        for state in range(env.observation_space.n):
            action = policy[state]
            present_v = 0
            # env.env.P[state][action] is a list of (prob, next_state, reward, done) transitions
            for p, s_next, r, _ in env.env.P[state][action]:
                present_v += p * (r + gamma * previous_v[s_next])
            v[state] = present_v
            delta = max(delta, abs(v[state] - previous_v[state]))
        if delta < theta:
            return v

def Policy_improvement(env, v, gamma):
    """Greedy policy improvement: pick the action with the highest one-step lookahead value."""
    Policy = np.zeros(env.observation_space.n, dtype=int)
    for state in range(env.observation_space.n):
        q_value = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            q_value[action] = sum(p * (r + gamma * v[s_next])
                                  for p, s_next, r, _ in env.env.P[state][action])
        Policy[state] = np.argmax(q_value)
    return Policy

def Policy_Iteration(env, gamma):
    """Alternate policy evaluation and greedy improvement until the policy stops changing."""
    policy = np.random.choice(env.action_space.n, size=env.observation_space.n)  # random initial policy
    v = np.zeros(env.observation_space.n)
    step = 0
    while True:
        step += 1
        v = Policy_evaluation(env=env, gamma=gamma, policy=policy, v=v)
        new_Policy = Policy_improvement(env=env, v=v, gamma=gamma)
        if np.all(new_Policy == policy):
            print('Policy-Iteration converged at step:', step)
            print('The best policy is:', new_Policy)
            return new_Policy
        policy = new_Policy

if __name__ == '__main__':
    env_name = 'FrozenLake-v0'  # registered id in older gym releases; newer releases use 'FrozenLake-v1'
    env = gym.make(env_name)
    optimal_policy = Policy_Iteration(env, gamma=1.0)
    score = evaluate_policy(env, optimal_policy, gamma=1.0)  # already an average over 100 episodes
    print('Average score =', score)