【Reinforcement Learning】Building DDPG with TensorFlow 2.0 to train LunarLanderContinuous-v2

from tensorflow.keras import optimizers, layers, models, losses
from collections import deque
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import gym
import copy

# Allow GPU memory to grow on demand (skipped if no GPU is available)
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

def build_model():
    # Actor: 8-dim observation -> 2 continuous actions, squashed to [-1, 1] by tanh
    actor_input = [layers.Input(shape=[8])]
    A_H_1 = layers.Dense(units=1024, activation='relu')(actor_input[0])
    A_H_2 = layers.Dense(units=1024, activation='relu')(A_H_1)
    A_out = layers.Dense(units=2, activation='tanh')(A_H_2)
    actor_model = models.Model(inputs=actor_input, outputs=A_out)

    # Critic: (state, action) pair -> scalar Q-value
    critic_input = [layers.Input(shape=[8]), layers.Input(shape=[2])]
    C_concat = layers.concatenate(critic_input)
    C_H_1 = layers.Dense(units=1024, activation='relu')(C_concat)
    C_H_2 = layers.Dense(units=1024, activation='relu')(C_H_1)
    C_out = layers.Dense(units=1)(C_H_2)
    critic_model = models.Model(inputs=critic_input, outputs=C_out)

    return actor_model, critic_model
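
The actor maps the 8-dimensional LunarLander observation to two tanh-bounded actions, and the critic maps a (state, action) pair to a single Q-value. A quick way to confirm the shapes is the minimal sketch below; the dummy state_batch and action_batch arrays are only for illustration and are not part of the training script.

# Illustrative shape check, not part of the training script
actor, critic = build_model()
state_batch = np.zeros((1, 8), dtype=np.float32)
action_batch = np.zeros((1, 2), dtype=np.float32)
print(actor(state_batch).shape)                   # (1, 2): two tanh-bounded actions
print(critic([state_batch, action_batch]).shape)  # (1, 1): one Q-value per (state, action) pair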

class DDPG(object):
    def __init__(self, actor, critic):
        actor_LR = 1e-3
        critic_LR = 2e-3

        self.action_range = (-1, 1)

        self.actor = actor
        self.critic = critic


        # Target networks: clones of the online networks, synced below and soft-updated during training
        self.actor_target = models.clone_model(actor)
        self.critic_target = models.clone_model(critic)

        self.actor_OPT = optimizers.Adam(actor_LR)
        self.critic_OPT = optimizers.Adam(critic_LR)

        self.memory = deque(maxlen=65536)    # replay buffer
        self.simple_size = 16384             # upper bound on the number of transitions sampled per update
        self.var = 1                         # std of the Gaussian exploration noise
        self.var_min = 0.3                   # noise floor
        self.target_update_episodes = 30     # update the target networks every N episodes
        self.var_decay = 0.9995              # multiplicative noise decay per training call
        self.greedy = False                  # True -> act deterministically (no exploration noise)
        self.gamma = 0.997                   # discount factor
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)  # tracks weights for soft target updates

        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())

    def save_memory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choice_action(self, state):
        # Deterministic action from the actor; add clipped Gaussian exploration noise unless greedy
        a = np.array(self.actor(np.array([state]))[0])
        return a if self.greedy else np.clip(
            np.random.normal(a, self.var),
            self.action_range[0],
            self.action_range[1]
        )

    def train(self, episode):
        # Sample a random batch from the replay buffer (capped at simple_size transitions)
        batch_size = min(self.simple_size, len(self.memory))
        training_data = random.sample(self.memory, batch_size)

        states, actions, rewards, next_states, dones = [], [], [], [], []

        for i in training_data:
            state, action, reward, next_state, done = i
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)

        states = np.array(states, dtype=np.float64)
        actions = np.array(actions, dtype=np.float64)

        next_states = np.array(next_states, dtype=np.float64)
        next_actions = self.actor_target.predict(next_states)

        rewards = np.reshape(rewards, newshape=(-1, 1))
        not_done = np.reshape(1 - np.array(dones), newshape=(-1, 1))

        # Bellman target: r + gamma * Q_target(s', pi_target(s')), zeroed out at terminal states
        target_Q = rewards + not_done * self.gamma * self.critic_target.predict([next_states, next_actions])

        # Critic update: minimize the mean-squared TD error against the Bellman target
        with tf.GradientTape() as tape:
            predict_Q = self.critic([states, actions])
            td_error = tf.reduce_mean(losses.mean_squared_error(target_Q, predict_Q))
        critic_grads = tape.gradient(td_error, self.critic.trainable_weights)
        self.critic_OPT.apply_gradients(zip(critic_grads, self.critic.trainable_weights))

        print('td_error: {}'.format(td_error.numpy()), end='\t')

        # Actor update: maximize Q(s, pi(s)) by minimizing its negative mean
        with tf.GradientTape() as tape:
            _actions = self.actor(states)
            Q = self.critic([states, _actions])
            actor_loss = tf.negative(tf.reduce_mean(Q))
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
        self.actor_OPT.apply_gradients(zip(actor_grads, self.actor.trainable_weights))

        # Periodically soft-update the target networks and decay the exploration noise
        if episode % self.target_update_episodes == 0:
            self.soft_update()

        if self.var >= self.var_min:
            self.var *= self.var_decay

        print('actor_loss: {}'.format(actor_loss.numpy()))

    def soft_update(self):
        # Track the online weights with an exponential moving average and copy the averages into the targets
        paras = self.actor.trainable_weights + self.critic.trainable_weights
        self.ema.apply(paras)
        for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
            i.assign(self.ema.average(j))
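
The soft_update method above drives the target networks with tf.train.ExponentialMovingAverage. The more common formulation in DDPG implementations is Polyak averaging with a fixed interpolation coefficient tau; a minimal sketch of that variant follows (the function name polyak_update and the value tau=0.005 are illustrative, not part of the original code).

# Alternative target update: Polyak averaging with a fixed coefficient (illustrative sketch)
def polyak_update(agent, tau=0.005):
    online = agent.actor.trainable_weights + agent.critic.trainable_weights
    target = agent.actor_target.trainable_weights + agent.critic_target.trainable_weights
    for t, o in zip(target, online):
        # theta_target <- tau * theta_online + (1 - tau) * theta_target
        t.assign(tau * o + (1 - tau) * t)

Called every training step with a small tau, this plays the same role as the EMA-based update used in the class above.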

if __name__ == '__main__':
    env = gym.make('LunarLanderContinuous-v2')

    actor, critic = build_model()

    DDPG_agent = DDPG(actor, critic)

    episodes = 5000

    all_rewards = []
    avg_rewards = []

    for e in range(episodes):
        state = env.reset()
        rewards = 0
        while True:
            env.render()
            action = DDPG_agent.choice_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            DDPG_agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)

            if done:
                all_rewards.append(rewards)
                avg_reward = sum(all_rewards)/len(all_rewards)
                print("episode: {}/{}, rewards: {}, memory length: {}, evg score: {}, var: {}"
                      .format(e + 1, episodes, rewards, len(DDPG_agent.memory), avg_reward, DDPG_agent.var))
                avg_rewards.append(avg_reward)
                break

        DDPG_agent.train(episode=e + 1)

    plt.plot(all_rewards)
    plt.plot(avg_rewards)
    plt.show()
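
Once training finishes, the agent can be run greedily (without exploration noise) by flipping the greedy flag, and the actor weights can be saved for reuse. A minimal sketch, assuming the script above has already run; the episode count and file name are arbitrary:

# Greedy evaluation after training (illustrative sketch)
DDPG_agent.greedy = True
for _ in range(10):
    state = env.reset()
    total_reward = 0
    while True:
        env.render()
        state, reward, done, _ = env.step(DDPG_agent.choice_action(state))
        total_reward += reward
        if done:
            print('greedy episode reward: {}'.format(total_reward))
            break

# Optionally persist the trained actor (file name is arbitrary)
DDPG_agent.actor.save_weights('ddpg_lunarlander_actor.h5')
env.close()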

[Figure 1]
