【Reinforcement Learning】Building SoftActorCritic (SAC) with TensorFlow 2.x to Train LunarLanderContinuous-v2


  • Paper link.
  • The policy network's loss function was not implemented as written in the original paper. (X)
  • Update: the policy loss has been revised to follow the original paper.
  • Update: an adaptive temperature parameter alpha has been added to control the policy entropy (a reference sketch of the standard objectives follows this list).

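For reference, the policy and temperature objectives from the original SAC papers are J_pi = E[alpha * log pi(a|s) - min(Q1, Q2)] and J_alpha = E[-alpha * (log pi(a|s) + target_entropy)]. Below is a minimal, purely illustrative sketch of both; logprob, min_q, log_alpha, and target_entropy are placeholder tensors, and the full script later keeps its own variable names and its own temperature loss.

import tensorflow as tf

def sac_reference_losses(logprob, min_q, log_alpha, target_entropy):
    # policy loss: E[ alpha * log pi(a|s) - min(Q1, Q2) ], with alpha treated as a constant
    alpha = tf.stop_gradient(tf.exp(log_alpha))
    policy_loss = tf.reduce_mean(alpha * logprob - min_q)
    # temperature loss: E[ -alpha * (log pi(a|s) + target_entropy) ], with the log-prob treated as a constant
    alpha_loss = -tf.reduce_mean(tf.exp(log_alpha) * tf.stop_gradient(logprob + target_entropy))
    return policy_loss, alpha_loss
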
requirements.txt:
tensorflow-gpu==2.4.0
gym[all]==0.21.0
tensorflow_probability==0.14.0
keras==2.6.0
matplotlib==3.5.1

Full code:
from tensorflow.keras import layers, models, Input, optimizers, losses
from tensorflow_probability.python.distributions import Normal
from collections import deque

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import copy
import gym

class SoftActorCritic:
    def __init__(self, state_shape, action_dim):
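        # EMA tracker for soft-updating the target value network, plus replay buffer and discount factor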
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)
        self.replay_buffer = deque(maxlen=10000)
        self.gamma = 0.997

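        # learnable log-temperature (alpha) and the minimum entropy target used in the alpha loss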
        self.log_alpha = tf.Variable(np.random.normal(), trainable=True, name="EntropyTemperature")
        self.mini_entropy = 0.1

        self.policy_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q1_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q2_OPT = optimizers.RMSprop(learning_rate=1e-3)
        self.value_OPT = optimizers.Adam(learning_rate=1e-3)
        self.alpha_OPT = optimizers.SGD(learning_rate=1e-3)

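        # policy (actor) network: maps a state to the mean and clipped log-std of a Gaussian over actions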
        policy_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(policy_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        policy_mean = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std_clipped = tf.clip_by_value(log_policy_std, -10, 2)
        self.policy_network = models.Model(inputs=policy_input, outputs=[policy_mean, log_policy_std_clipped])

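        # state-value network V(s) and its target copy, soft-updated via the EMA above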
        value_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(value_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        value_output = layers.Dense(units=1, activation='linear')(x)
        self.value_network = models.Model(inputs=value_input, outputs=value_output)
        self.target_value_network = models.clone_model(self.value_network)
        self._update_target_value_network()

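        # twin Q-networks Q1(s, a) and Q2(s, a); their minimum is used to reduce overestimation bias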
        Q_state_input = Input(shape=state_shape)
        Q_action_input = Input(shape=(action_dim,))
        x = layers.concatenate([Q_state_input, Q_action_input])
        x = layers.Dense(units=1024, activation='relu')(x)
        x = layers.Dense(units=1024, activation='relu')(x)
        Q_output = layers.Dense(units=1, activation='linear')(x)
        self.Q_network_1 = models.Model(inputs=[Q_state_input, Q_action_input], outputs=Q_output)
        self.Q_network_2 = models.clone_model(self.Q_network_1)

    def _update_target_value_network(self):
        self.ema.apply(self.value_network.trainable_variables)
        for target_value_network_para, value_network_para in zip(self.target_value_network.trainable_variables, self.value_network.trainable_variables):
            target_value_network_para.assign(self.ema.average(value_network_para))

    def save_memory(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

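    # sample an action from the current policy: draw from the Gaussian and squash with tanh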
    def select_action(self, state):
        state = np.array([state])
        policy_mean, log_policy_std = self.policy_network(state)
        policy_mean = np.array(policy_mean[0])
        log_policy_std = np.array(log_policy_std[0])
        policy_std = np.exp(log_policy_std)
        gaussian_distribution = Normal(policy_mean, policy_std)
        action = np.tanh(gaussian_distribution.sample())
        return action

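    # one gradient step for every network, using a random minibatch from the replay buffer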
    def update_weights(self, batch_size):
        batch_size = min(batch_size, len(self.replay_buffer))
        training_data = random.sample(self.replay_buffer, batch_size)
        state, action, reward, next_state, done = [], [], [], [], []
        for data in training_data:
            s, a, r, n_s, d = data
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(n_s)
            done.append(d)
        state = np.array(state, dtype=np.float32)
        action = np.array(action, dtype=np.float32)
        reward = np.reshape(reward, newshape=(-1, 1)).astype(np.float32)
        next_state = np.array(next_state, dtype=np.float32)
        done = np.reshape(done, newshape=(-1, 1)).astype(np.float32)

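        # ----- policy (actor) update: reparameterized Gaussian sample, tanh squashing with log-prob correction -----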
        with tf.GradientTape() as tape:
            policy_mean, log_policy_std = self.policy_network(state)
            policy_std = tf.exp(log_policy_std)

            gaussian_distribution = Normal(policy_mean, policy_std)
            gaussian_sampling = gaussian_distribution.sample()

            sample_action = tf.tanh(gaussian_sampling)
            logprob = gaussian_distribution.log_prob(gaussian_sampling) - tf.math.log(
                1.0 - tf.pow(sample_action, 2) + 1e-6)

            logprob = tf.reduce_mean(logprob, axis=-1, keepdims=True)
            new_Q_value = tf.math.minimum(self.Q_network_1([state, sample_action]), self.Q_network_2([state, sample_action]))

            policy_loss = tf.reduce_mean(tf.exp(self.log_alpha) * logprob - new_Q_value)

        policy_network_grad = tape.gradient(policy_loss, self.policy_network.trainable_variables)
        self.policy_OPT.apply_gradients(zip(policy_network_grad, self.policy_network.trainable_variables))

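        # ----- adaptive temperature (alpha) update -----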
        with tf.GradientTape() as tape:
            alpha_loss = - tf.exp(self.log_alpha) * (tf.reduce_mean(tf.exp(logprob) * logprob) + self.mini_entropy)
        alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_OPT.apply_gradients(zip(alpha_grad, [self.log_alpha]))

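        # ----- state-value update: regress V(s) towards min(Q1, Q2) - alpha * logprob -----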
        with tf.GradientTape() as tape:
            value = self.value_network(state)
            value_ = tf.stop_gradient(new_Q_value - tf.exp(self.log_alpha) * logprob)
            value_loss = tf.reduce_mean(losses.mean_squared_error(value_, value))
        value_network_grad = tape.gradient(value_loss, self.value_network.trainable_variables)
        self.value_OPT.apply_gradients(zip(value_network_grad, self.value_network.trainable_variables))

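        # ----- twin Q updates: regress both critics towards the soft Bellman target built from the target value network -----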
        target_value = tf.stop_gradient(self.target_value_network(next_state))
        Q_ = reward + self.gamma * (1 - done) * target_value

        with tf.GradientTape() as tape:
            Q_1 = self.Q_network_1([state, action])
            Q_1_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_1))
        Q_network_1_grad = tape.gradient(Q_1_loss, self.Q_network_1.trainable_variables)
        self.Q1_OPT.apply_gradients(zip(Q_network_1_grad, self.Q_network_1.trainable_variables))

        with tf.GradientTape() as tape:
            Q_2 = self.Q_network_2([state, action])
            Q_2_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_2))
        Q_network_2_grad = tape.gradient(Q_2_loss, self.Q_network_2.trainable_variables)
        self.Q2_OPT.apply_gradients(zip(Q_network_2_grad, self.Q_network_2.trainable_variables))

        self._update_target_value_network()
        return (
            np.array(Q_1_loss, dtype=np.float64),
            np.array(Q_2_loss, dtype=np.float64),
            np.array(policy_loss, dtype=np.float64),
            np.array(value_loss, dtype=np.float64),
            np.array(alpha_loss, dtype=np.float64),
            np.exp(self.log_alpha)
        )

    def save_weights(self, path):
        self.policy_network.save_weights(path + '-policy_network.h5')
        self.value_network.save_weights(path + '-value_network.h5')
        self.Q_network_1.save_weights(path + '-Q_network_1.h5')
        self.Q_network_2.save_weights(path + '-Q_network_2.h5')

    def load_weights(self, path):
        self.policy_network.load_weights(path + '-policy_network.h5')
        self.value_network.load_weights(path + '-value_network.h5')
        self.Q_network_1.load_weights(path + '-Q_network_1.h5')
        self.Q_network_2.load_weights(path + '-Q_network_2.h5')

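# training loop: one episode of interaction, then a single update on a BATCH_SIZE minibatch, saving weights every episode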
if __name__ == '__main__':
    RENDER = False
    EPISODES = 2000
    BATCH_SIZE = 256
    env = gym.make('LunarLanderContinuous-v2')
    agent = SoftActorCritic((8,), 2)
    # agent.load_weights('./LunarLanderContinuous-v2')
    loss_list = []
    reward_list = []
    _100_window_reward_list = []
    f = open('log.txt', 'w')
    for e in range(EPISODES):
        state = env.reset()
        rewards = 0
        while True:
            if RENDER:
                env.render()
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)

            if done: break

        Q1_loss, Q2_loss, policy_loss, value_loss, alpha_loss, alpha = agent.update_weights(BATCH_SIZE)
        loss_list.append(np.sum([Q1_loss, Q2_loss, policy_loss, value_loss]))
        reward_list.append(rewards)
        _100_window_reward = sum(reward_list[-100:]) / len(reward_list[-100:])
        _100_window_reward_list.append(_100_window_reward)
        log = """
        ==============================================================================
        |>episode: {}/{}
        |>memory length: {}
        |>losses 
        |    >>
        |        Q1_loss: {}, Q2_loss: {}, 
        |        policy_loss: {}, value_loss: {}
        |        alpha: {}, alpha_loss: {}
        |    << 
        |>score: {}, avg score: {}
        ==============================================================================
        """.format(
            e + 1, EPISODES, len(agent.replay_buffer),
            Q1_loss, Q2_loss, policy_loss, value_loss, alpha, alpha_loss,
            rewards, _100_window_reward
        )
        f.write(log)
        print("episode: {}/{}, score: {}, avg_score: {}".format(e+1, EPISODES, rewards, _100_window_reward))
        agent.save_weights('./LunarLanderContinuous-v2')
    f.close()
    plt.plot(reward_list)
    plt.plot(_100_window_reward_list)
    plt.show()
    plt.plot(loss_list)
    plt.show()

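A minimal evaluation sketch (assuming the weight files saved by the training loop above; it acts with the deterministic tanh-squashed mean instead of sampling):

# evaluation: greedy rollout with the saved policy
env = gym.make('LunarLanderContinuous-v2')
agent = SoftActorCritic((8,), 2)
agent.load_weights('./LunarLanderContinuous-v2')
state, done, total_reward = env.reset(), False, 0.0
while not done:
    env.render()
    # use the tanh-squashed mean of the Gaussian policy as a deterministic action
    mean, _ = agent.policy_network(np.array([state]))
    action = np.tanh(np.array(mean[0]))
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('evaluation return:', total_reward)
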
Reward curve:

(figure: per-episode reward and its 100-episode moving average)
