from tensorflow.keras import optimizers, layers, models, losses
from collections import deque
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import gym
import copy
# Enable GPU memory growth only when a GPU is actually available, so the
# script also runs on CPU-only machines.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

def build_model():
    # Actor network: maps an 8-dimensional observation to a 2-dimensional
    # action; the tanh output keeps each component in [-1, 1].
    actor_input = [layers.Input(shape=[8])]
    A_H_1 = layers.Dense(units=1024, activation='relu')(actor_input[0])
    A_H_2 = layers.Dense(units=1024, activation='relu')(A_H_1)
    A_out = layers.Dense(units=2, activation='tanh')(A_H_2)
    actor_model = models.Model(inputs=actor_input, outputs=A_out)

    # Critic network: maps a (state, action) pair to a scalar Q-value.
    critic_input = [layers.Input(shape=[8]), layers.Input(shape=[2])]
    C_concat = layers.concatenate(critic_input)
    C_H_1 = layers.Dense(units=1024, activation='relu')(C_concat)
    C_H_2 = layers.Dense(units=1024, activation='relu')(C_H_1)
    C_out = layers.Dense(units=1)(C_H_2)
    critic_model = models.Model(inputs=critic_input, outputs=C_out)
    return actor_model, critic_model
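# Note: the hard-coded input shapes above assume LunarLanderContinuous-v2,
# which has an 8-dimensional observation and a 2-dimensional action in
# [-1, 1] (hence the tanh output). A quick, optional sanity check:
#
#   a, c = build_model()
#   print(a.output_shape)   # expected: (None, 2)
#   print(c.output_shape)   # expected: (None, 1)
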
class DDPG(object):
    def __init__(self, actor, critic):
        actor_LR = 1e-3
        critic_LR = 2e-3
        self.action_range = (-1, 1)
        self.actor = actor
        self.critic = critic
        self.actor_target = models.clone_model(actor)
        self.critic_target = models.clone_model(critic)
        self.actor_OPT = optimizers.Adam(actor_LR)
        self.critic_OPT = optimizers.Adam(critic_LR)
        self.memory = deque(maxlen=65536)       # replay buffer
        self.simple_size = 16384                # max transitions sampled per training call
        self.var = 1                            # std of the Gaussian exploration noise
        self.var_min = 0.3
        self.target_update_episodes = 30        # episodes between target-network updates
        self.var_decay = 0.9995
        self.greedy = False
        self.gamma = 0.997                      # discount factor
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)
        # Start the target networks from the same weights as the online networks.
        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())

    def save_memory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choice_action(self, state):
        a = np.array(self.actor(np.array([state]))[0])
        return a if self.greedy else np.clip(
            np.random.normal(a, self.var),
            self.action_range[0],
            self.action_range[1]
        )
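    # Exploration used above: during training the executed action is drawn from
    # a Gaussian centred on the actor output, a ~ N(mu(s), var), then clipped
    # back into the action range [-1, 1]; with `greedy = True` the deterministic
    # output mu(s) is used directly (e.g. for evaluation).
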
    def train(self, episode):
        batch_size = min(self.simple_size, len(self.memory))
        training_data = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for state, action, reward, next_state, done in training_data:
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
        states = np.array(states, dtype=np.float64)
        actions = np.array(actions, dtype=np.float64)
        next_states = np.array(next_states, dtype=np.float64)
        next_actions = self.actor_target.predict(next_states)
        rewards = np.reshape(rewards, newshape=(-1, 1))
        # Bellman target computed with the target networks; terminal
        # transitions (done == True) drop the bootstrap term.
        target_Q = rewards + np.reshape(1 - np.array(dones), newshape=(-1, 1)) \
            * self.gamma * self.critic_target.predict([next_states, next_actions])

        # Critic update: minimise the TD error between target_Q and Q(s, a).
        with tf.GradientTape() as tape:
            predict_Q = self.critic([states, actions])
            td_error = losses.mean_squared_error(target_Q, predict_Q)
        critic_grads = tape.gradient(td_error, self.critic.trainable_weights)
        self.critic_OPT.apply_gradients(zip(critic_grads, self.critic.trainable_weights))
        print('td_error: {}'.format(tf.reduce_mean(td_error)), end='\t')

        # Actor update: maximise Q(s, mu(s)) by descending on its negative.
        with tf.GradientTape() as tape:
            _actions = self.actor(states)
            Q = self.critic([states, _actions])
            actor_loss = tf.negative(tf.reduce_mean(Q))
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
        self.actor_OPT.apply_gradients(zip(actor_grads, self.actor.trainable_weights))

        if episode % self.target_update_episodes == 0:
            self.soft_update()
        if self.var >= self.var_min:
            self.var *= self.var_decay
        print('actor_loss: {}'.format(actor_loss))

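    # The two gradient steps above are the standard DDPG updates:
    #   critic: minimise (r + gamma * (1 - done) * Q'(s', mu'(s')) - Q(s, a))^2
    #   actor:  maximise Q(s, mu(s)), implemented as gradient descent on -Q.
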
    def soft_update(self):
        paras = self.actor.trainable_weights + self.critic.trainable_weights
        self.ema.apply(paras)
        for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
            i.assign(self.ema.average(j))

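    # Target update via tf.train.ExponentialMovingAverage: each call to apply()
    # advances a shadow copy of every online weight (after the first call, which
    # initialises the shadows to the current weights), roughly
    #   theta_target <- 0.995 * theta_target + 0.005 * theta,
    # and the target networks are then assigned these averaged weights.
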
if __name__ == '__main__':
    env = gym.make('LunarLanderContinuous-v2')
    actor, critic = build_model()
    DDPG_agent = DDPG(actor, critic)
    episodes = 5000
    all_rewards = []
    avg_rewards = []
    for e in range(episodes):
        state = env.reset()
        rewards = 0
        while True:
            env.render()
            action = DDPG_agent.choice_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            DDPG_agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)
            if done:
                all_rewards.append(rewards)
                avg_reward = sum(all_rewards) / len(all_rewards)
                print("episode: {}/{}, rewards: {}, memory length: {}, avg score: {}, var: {}"
                      .format(e + 1, episodes, rewards, len(DDPG_agent.memory), avg_reward, DDPG_agent.var))
                avg_rewards.append(avg_reward)
                break
        # One training pass per episode, sampling a large batch from the replay buffer.
        DDPG_agent.train(episode=e + 1)
    plt.plot(all_rewards)
    plt.plot(avg_rewards)
    plt.show()
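    # Optional follow-up (sketch, not part of the original script): run the
    # learned policy greedily for a final evaluation episode, e.g.
    #
    #   DDPG_agent.greedy = True
    #   state, done = env.reset(), False
    #   while not done:
    #       env.render()
    #       action = DDPG_agent.choice_action(state)
    #       state, reward, done, _ = env.step(action)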