价值网络和策略网络的简单融合

最近 AlphaZero 已经发布,似乎比 AlphaGo Zero 更厉害。AlphaZero 和 AlphaGo Zero 都使用了一种比较
新的策略:将价值网络和策略网络进行融合,即由同一个网络产生两个不同的输出,让两个网络共享权重
并同时进行更新。为了加深理解,我在最简单的游戏 CartPole 上进行了尝试。实际上,将价值网络和策略网络
进行融合,实现起来是比较简单的;需要注意的一个小问题是,之前的价值网络和策略网络的学习率
并不一致,因此将两者融合后需要采用较小的学习率。下面直接给出代码:

https://github.com/zhly0/policy_value.py

import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating so that the
    largest exponent is ``exp(0)``; this does not change the result but
    avoids overflow for large inputs.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / np.sum(weights)


def policy_value():
    """Build the combined policy/value network graph.

    A single hidden ReLU layer (4 inputs -> 10 units) is shared by two heads:
    a 2-way softmax policy head and a scalar value head.  The policy loss and
    the value loss are added and minimised by one Adam optimiser (learning
    rate 0.01), so the shared layer is updated by both objectives.

    Returns:
        A 9-tuple ``(probabilities, calculated1, actions, state, advantages,
        newvals, optimizer, loss1, loss2)``:

        * ``probabilities`` -- softmax action probabilities, one row per state.
        * ``calculated1``   -- value-head output, shape ``[None, 1]``.
        * ``actions``       -- placeholder ``[None, 2]`` for the taken actions
          (fed as one-hot rows by ``run_episode``).
        * ``state``         -- placeholder ``[None, 4]`` for observations.
        * ``advantages``    -- placeholder ``[None, 1]`` for advantages.
        * ``newvals``       -- placeholder ``[None, 1]`` for target returns.
        * ``optimizer``     -- the training op minimising ``loss1 + loss2``.
        * ``loss1``/``loss2`` -- policy loss and value loss tensors.
    """
    with tf.variable_scope("policy_value"):
        state = tf.placeholder("float", [None, 4])

        # newvals is the (discounted) future reward used as the value target
        newvals = tf.placeholder("float", [None, 1])

        # shared hidden layer
        w1 = tf.get_variable("w1", [4, 10])
        b1 = tf.get_variable("b1", [10])
        h1 = tf.nn.relu(tf.matmul(state, w1) + b1)

        # policy head parameters
        w2 = tf.get_variable("w2", [10, 2])
        b2 = tf.get_variable("b2", [2])

        # value head parameters
        w3 = tf.get_variable("w3", [10, 1])
        b3 = tf.get_variable("b3", [1])

        # policy gradient
        calculated = tf.matmul(h1, w2) + b2
        probabilities = tf.nn.softmax(calculated)

        actions = tf.placeholder("float", [None, 2])
        advantages = tf.placeholder("float", [None, 1])

        # Log-probability of the action actually taken.  log_softmax is used
        # instead of log(softmax(.)) so a vanishing probability cannot
        # produce log(0) = -inf.
        log_probabilities = tf.nn.log_softmax(calculated)
        good_log_probabilities = tf.reduce_sum(
            tf.multiply(log_probabilities, actions), axis=1)
        # Reshape [N] -> [N, 1] so the product with `advantages` ([N, 1]) is
        # element-wise.  Multiplying a [N] tensor by a [N, 1] tensor would
        # broadcast to [N, N] and pair every log-probability with every
        # advantage.
        good_log_probabilities = tf.reshape(good_log_probabilities, [-1, 1])
        eligibility = good_log_probabilities * advantages
        loss1 = -tf.reduce_sum(eligibility)

        # value gradient
        calculated1 = tf.matmul(h1, w3) + b3
        diffs = calculated1 - newvals
        loss2 = tf.nn.l2_loss(diffs)

        # policy loss + value loss
        loss = loss1 + loss2

        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

        return probabilities, calculated1, actions, state, advantages, newvals, optimizer, loss1, loss2

def run_episode(env, policy_value, sess, is_train=True, gamma=0.97, max_steps=200):
    """Play one episode and, when training, apply one network update.

    Args:
        env: environment object; ``env.reset()`` must return the first
            observation and ``env.step(action)`` a 4-tuple
            ``(observation, reward, done, info)``.
        policy_value: the 9-tuple returned by ``policy_value()``.
        sess: session object used to evaluate the graph via
            ``sess.run(fetches, feed_dict=...)``.
        is_train: when true, actions are sampled from the policy and the
            network is trained on the episode; when false, the more probable
            action is always taken and no training happens.
        gamma: discount factor applied to future rewards.
        max_steps: maximum number of environment steps per episode.

    Returns:
        The undiscounted sum of rewards collected in the episode.
    """
    (p_probabilities, v_calculated, p_actions, pv_state, p_advantages,
     v_newvals, pv_optimizer, loss1, loss2) = policy_value

    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    rewards = []

    for _ in range(max_steps):
        # action probabilities for the current state
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(p_probabilities, feed_dict={pv_state: obs_vector})

        if is_train:
            # sample from the policy while training so the agent explores
            action = 0 if random.uniform(0, 1) < probs[0][0] else 1
        else:
            # act greedily when evaluating
            action = 1 if probs[0][0] < probs[0][1] else 0

        # record the state and the one-hot encoded action
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)

        # take the action in the environment
        observation, reward, done, _info = env.step(action)
        rewards.append(reward)
        totalreward += reward

        if done:
            break

    # when testing, only the total reward is needed
    if not is_train:
        return totalreward

    # Discounted Monte-Carlo return for every step, accumulated backwards:
    # G_t = r_t + gamma * G_{t+1}.
    returns = np.zeros(len(rewards))
    running = 0.0
    for index in range(len(rewards) - 1, -1, -1):
        running = rewards[index] + gamma * running
        returns[index] = running
    update_vals_vector = returns.reshape(-1, 1)

    # Value-head estimate for every visited state, in one batched call.
    states_matrix = np.array(states)
    currentvals = sess.run(v_calculated, feed_dict={pv_state: states_matrix})

    # Advantage: how much better the observed return was than the value
    # head's estimate for that state.  As training progresses the estimate
    # approaches the return typically obtained from the state, so the
    # difference scores the chosen action: a positive advantage reinforces
    # the action, a negative one pushes the policy away from it the next
    # time a similar state is seen.
    advantages_vector = update_vals_vector - np.reshape(currentvals, (-1, 1))

    # train the network (policy and value heads together)
    _, print_loss1, print_loss2 = sess.run(
        [pv_optimizer, loss1, loss2],
        feed_dict={
            pv_state: states_matrix,
            v_newvals: update_vals_vector,
            p_advantages: advantages_vector,
            p_actions: np.array(actions),
        })

    print("policy loss ",print_loss1)
    print("value loss ",print_loss2)
    return totalreward


# Number of training episodes and of evaluation episodes.
TRAIN_EPISODES = 1500
EVAL_EPISODES = 1000

# Create the environment and build the shared policy/value graph.
env = gym.make('CartPole-v0')
PolicyValue = policy_value()

# Open a session and initialise all network variables.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# Training phase: every episode ends with one update of the network.
for i in range(TRAIN_EPISODES):
    reward = run_episode(env, PolicyValue, sess)

# Evaluation phase: run with is_train=False (no updates) and report the
# mean total reward per episode.
t = 0
for _ in range(EVAL_EPISODES):
    # env.render()
    reward = run_episode(env, PolicyValue, sess, is_train=False)
    t = t + reward
print(t / EVAL_EPISODES)

你可能感兴趣的:(深度学习(deep,learning),learning)