Reinforcement Learning: Playing SpaceInvaders-v0 with a DQN
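A DQN agent for gym's SpaceInvaders-v0: a small convolutional network estimates Q-values over the game's 6 discrete actions, transitions are kept in a replay deque, actions are chosen epsilon-greedily, and one training pass runs at the end of each episode. The script below targets the classic gym API (env.reset() returns the observation, env.step() returns a 4-tuple) and TensorFlow 2.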

from tensorflow.keras import Sequential, layers
from collections import deque

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

import random
import copy
import gym
import os
import cv2

# Reduce TensorFlow's C++ logging verbosity (suppress INFO and WARNING messages)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Let the GPU allocate memory on demand instead of reserving it all upfront
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.InteractiveSession(config=config)
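# TF2-native alternative (a sketch; assumes TF >= 2.1 and that no GPU has
# been initialized yet):
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)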

class DQNAgent:
    def __init__(self, env):
        self._env = env
        self.memory = deque(maxlen=128)    # replay buffer; oldest transitions are evicted first
        self.gamma = 0.5                   # discount factor for future rewards
        self.epsilon = 1.0                 # exploration rate, starts fully random
        self.epsilon_decay = 0.999         # multiplicative decay applied after each training pass
        self.epsilon_min = 0.2             # exploration floor

        self.sample_size = 16              # minibatch size drawn from the replay buffer

        self.model = self._Build_Deep_Q_Network()
        self.opt = tf.keras.optimizers.Adam()

    def _Build_Deep_Q_Network(self):
        model = Sequential()

        # Small convolutional feature extractor over the 40x40x1 binary frame
        model.add(layers.Conv2D(filters=32, kernel_size=(3, 3), strides=1, padding='valid'))
        model.add(layers.Activation('relu'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same'))

        model.add(layers.Flatten())
        model.add(layers.Dense(units=128, activation='relu'))
        model.add(layers.Dense(units=32, activation='relu'))
        model.add(layers.Dense(units=6))   # one Q-value per action; SpaceInvaders-v0 has Discrete(6)

        model.build(input_shape=(None, 40, 40, 1))
        model.summary()
        return model
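    # Sanity check (a sketch): a batch of 40x40x1 frames maps to one Q-value
    # per action, e.g. model(np.zeros((1, 40, 40, 1))).shape == (1, 6).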

    def save_memory(self, state, action, reward, next_state, done):
        # Scale pixels to [0, 1] and add a batch dimension before storing
        _state = np.array([state], dtype=np.float64) / 255.0
        _next_state = np.array([next_state], dtype=np.float64) / 255.0

        self.memory.append((_state, action, reward, _next_state, done))


    def train(self):
        if not self.memory:                # nothing to learn from yet
            return 0.0
        batch_size = min(self.sample_size, len(self.memory))
        training_data = random.sample(self.memory, batch_size)

        loss_sum = 0.
        count = 0
        for state, action, reward, next_state, done in training_data:

            # Bellman target: r + gamma * max_a' Q(s', a') for non-terminal
            # transitions; terminal transitions use the raw reward
            y_reward = reward
            if not done:
                y_reward = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

            # Regress only the taken action's Q-value towards the target,
            # leaving the other actions' predictions untouched
            _y = self.model.predict(state)
            _y[0][action] = y_reward

            with tf.GradientTape() as tape:
                loss = tf.reduce_mean(
                    tf.losses.mean_squared_error(y_true=_y, y_pred=self.model(state)))
            # Apply the gradient step outside the tape context; the tape's
            # resources are released once gradient() is called
            grad = tape.gradient(loss, self.model.trainable_variables)
            self.opt.apply_gradients(zip(grad, self.model.trainable_variables))

            loss_sum += float(loss)
            count += 1

        # Decay exploration after each training pass
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        mean_loss = loss_sum / count
        print(" loss : {:.6f}".format(mean_loss))
        return mean_loss
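    # A batched variant (a sketch, not the original code): one forward pass and
    # one gradient step per minibatch is much faster than the per-sample loop
    # above. Assuming the same replay-tuple layout (state, action, reward,
    # next_state, done):
    #
    #   states      = np.concatenate([t[0] for t in training_data])
    #   next_states = np.concatenate([t[3] for t in training_data])
    #   targets     = self.model.predict(states)
    #   next_q      = self.model.predict(next_states).max(axis=1)
    #   for j, (_, a, r, _, d) in enumerate(training_data):
    #       targets[j][a] = r if d else r + self.gamma * next_q[j]
    #   with tf.GradientTape() as tape:
    #       loss = tf.reduce_mean(
    #           tf.losses.mean_squared_error(targets, self.model(states)))
    #   grads = tape.gradient(loss, self.model.trainable_variables)
    #   self.opt.apply_gradients(zip(grads, self.model.trainable_variables))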


    def choice_action(self, state):
        _state = np.array([state], dtype=np.float64) / 255.0
        # Epsilon-greedy: explore with probability epsilon, exploit otherwise
        if np.random.rand() <= self.epsilon:
            return self._env.action_space.sample()
        else:
            q_values = self.model.predict(_state)
            return np.argmax(q_values[0])
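    # With epsilon_decay = 0.999 and one decay per episode, epsilon falls from
    # 1.0 to the 0.2 floor after about ln(0.2) / ln(0.999) ~= 1609 episodes.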

def preprocess(state):
    state = state[20:]                         # crop the score bar at the top of the frame
    state = np.mean(state, axis=2)             # RGB -> grayscale
    state = cv2.resize(state, dsize=(40, 40))  # downsample to 40x40
    state = np.reshape(state, newshape=(40, 40, 1))

    # Binarize: background -> 0, sprites -> 255
    state[state < 10] = 0
    state[state >= 10] = 255

    return state
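# Quick check (a sketch): preprocess(env.reset()) returns a (40, 40, 1) array
# with pixel values binarized to {0, 255}.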

if __name__ == '__main__':
    env = gym.make('SpaceInvaders-v0')
    agent = DQNAgent(env)

    episodes = 10000

    all_loss = []
    for e in range(episodes):
        state = env.reset()                # classic gym API: reset() returns the observation only
        state = preprocess(state)

        all_rewards = 0

        for time_t in range(5000):         # cap each episode at 5000 steps

            env.render()                   # rendering every frame slows training considerably

            action = agent.choice_action(state)

            # Classic gym step API: (observation, reward, done, info)
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)

            # Only store transitions that carry a reward signal
            if reward != 0.0:
                agent.save_memory(state, action, reward, next_state, done)

            state = copy.deepcopy(next_state)

            all_rewards += reward

            if done:
                print("episode: {}/{}, steps: {}, epsilon: {}"
                      .format(e + 1, episodes, time_t, agent.epsilon))
                break

        loss = agent.train()
        all_loss.append(loss)              # mean loss for this episode

    plt.plot(all_loss)
    plt.xlabel('episode')
    plt.ylabel('mean loss')
    plt.show()
