import os
# Silence TensorFlow's C++ logging; must be set before TensorFlow is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

from collections import deque
import copy
import random

import cv2
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, layers

# Let the GPU allocate memory on demand instead of reserving it all up front.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
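
# Deep Q-Network agent for Atari SpaceInvaders (gym's classic 4-tuple step API).
# Frames are cropped, grayscaled, and binarised to 40x40 images; transitions with
# a non-zero reward are stored in a small replay buffer, and the network is trained
# on a sampled mini-batch at the end of every episode under an epsilon-greedy policy.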
class DQNAgent:
    def __init__(self, env):
        self._env = env
        self.memory = deque(maxlen=128)   # replay buffer of (s, a, r, s', done) tuples
        self.gamma = 0.5                  # discount factor
        self.epsilon = 1.0                # exploration rate, decayed after each training call
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.2
        self.sample_size = 16             # mini-batch size drawn from the replay buffer
        self.model = self._build_deep_q_network()
        self.opt = tf.keras.optimizers.Adam()
    def _build_deep_q_network(self):
        # Small convolutional network: one 40x40x1 preprocessed frame in, 6 Q-values out.
        model = Sequential()
        model.add(layers.Conv2D(filters=32, kernel_size=(3, 3), strides=1, padding='valid'))
        model.add(layers.Activation('relu'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same'))
        model.add(layers.Flatten())
        model.add(layers.Dense(units=128, activation='relu'))
        model.add(layers.Dense(units=32, activation='relu'))
        model.add(layers.Dense(units=6))  # linear output: one Q-value per action
        model.build(input_shape=(None, 40, 40, 1))
        model.summary()
        return model
    def save_memory(self, state, action, reward, next_state, done):
        # Normalise frames to [0, 1] and add a batch dimension before storing.
        _state = np.array([state], dtype=np.float64) / 255.0
        _next_state = np.array([next_state], dtype=np.float64) / 255.0
        self.memory.append((_state, action, reward, _next_state, done))
    def train(self):
        # Nothing to learn from until at least one transition has been stored.
        if not self.memory:
            return 0.0
        batch_size = min(self.sample_size, len(self.memory))
        training_data = random.sample(self.memory, batch_size)
        loss_sum = 0.0
        count = 0
        for state, action, reward, next_state, done in training_data:
            # Bellman target: r + gamma * max_a' Q(s', a') for non-terminal transitions.
            y_reward = reward
            if not done:
                y_reward = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            _y = self.model.predict(state)
            _y[0][action] = y_reward
            with tf.GradientTape() as tape:
                loss = tf.reduce_mean(
                    tf.keras.losses.mean_squared_error(y_true=_y, y_pred=self.model(state)))
            grad = tape.gradient(loss, self.model.trainable_variables)
            self.opt.apply_gradients(zip(grad, self.model.trainable_variables))
            loss_sum += float(loss)
            count += 1
        # Decay exploration once per training call until the floor is reached.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        print(" loss : {}".format(loss_sum / count))
        return loss_sum / count
    def choose_action(self, state):
        # Epsilon-greedy policy: random action with probability epsilon, otherwise argmax Q.
        _state = np.array([state], dtype=np.float64) / 255.0
        if np.random.rand() <= self.epsilon:
            return self._env.action_space.sample()
        action = self.model.predict(_state)
        return np.argmax(action[0])
def preprocess(state):
    # Crop the score bar, convert to grayscale, downscale to 40x40, and binarise.
    state = state[20:]
    state = np.mean(state, axis=2)
    state = cv2.resize(state, dsize=(40, 40))
    state = np.reshape(state, newshape=(40, 40, 1))
    state[state < 10] = 0
    state[state >= 10] = 255
    return state
if __name__ == '__main__':
    env = gym.make('SpaceInvaders-v0')
    agent = DQNAgent(env)
    episodes = 10000
    all_loss = []
    for e in range(episodes):
        state = env.reset()
        state = preprocess(state)
        all_rewards = 0
        for time_t in range(5000):
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            # Only store transitions that carry a reward signal.
            if reward != 0.0:
                agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)
            all_rewards += reward
            if done:
                print("episode: {}/{}, steps: {}, reward: {}, epsilon: {}"
                      .format(e + 1, episodes, time_t, all_rewards, agent.epsilon))
                break
        # One training pass on a sampled mini-batch at the end of each episode.
        loss = agent.train()
        all_loss.append(loss)
    plt.plot(all_loss)
    plt.show()