DQN-keras

DQN NIPS 2015

[Image 1: DQN-keras]

import gym
import random
import numpy as np

import skimage
from skimage import transform, color, exposure
from collections import deque

import tensorflow as tf
from keras import backend as K

from keras.models import Sequential
from keras.layers.convolutional import Convolution2D
from keras.layers.core import Dense, Activation, Flatten
from keras.optimizers import Adam

# Number of discrete actions the agent chooses among; must match the width of
# the Q-network's output layer.
ACTIONS = 3
# Discount factor applied to the best next-state Q-value in the training target.
GAMMA = 0.99
# Number of environment steps to collect before training (and epsilon
# annealing) starts.
OBSERVE = 320
# Number of steps over which epsilon is linearly annealed from
# INITIAL_EPSILON down to FINAL_EPSILON.
EXPLORE = 200000
# Lower bound at which epsilon annealing stops.
FINAL_EPSILON = 0.0001
# Starting probability of picking a random action.
INITIAL_EPSILON = 0.005
# Maximum number of transitions kept in the replay buffer (deque maxlen).
REPLAY_MEMORY = 50000
# Number of transitions sampled from the replay buffer per training step.
BATCH_SIZE = 32
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-4

def process_image(obs):
    """Turn a raw observation frame into an 80x80 grayscale image in [0, 1].

    The frame is converted to grayscale, resized to 80x80, contrast-stretched
    to the 0..255 range and finally divided by 255.

    Args:
        obs: Image array accepted by ``skimage.color.rgb2gray``
            (presumably an H x W x 3 RGB frame from the environment --
            TODO confirm against the chosen gym environment).

    Returns:
        The processed 80x80 array.
    """
    gray = skimage.color.rgb2gray(obs)
    small = skimage.transform.resize(gray, (80, 80))
    stretched = skimage.exposure.rescale_intensity(small, out_range=(0, 255))
    return stretched / 255.0

def build_model():
    """Build and compile the convolutional Q-network.

    The network takes a stack of four 80x80 frames (input shape
    ``(80, 80, 4)``), applies three ReLU convolution layers followed by a
    512-unit ReLU dense layer, and outputs one linear Q-value per action.

    Returns:
        A compiled Keras ``Sequential`` model using mean-squared-error loss
        and the Adam optimizer with ``LEARNING_RATE``.
    """
    print("Now we build the model")
    model = Sequential()
    model.add(Convolution2D(32, (8, 8), strides=(4, 4), padding='same',input_shape=(80,80,4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2), padding='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    # One output per action: tie the head width to ACTIONS so the network and
    # the action-selection code cannot drift apart.
    model.add(Dense(ACTIONS))

    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    return model

if __name__ == "__main__":
    # Training entry point: runs epsilon-greedy episodes, stores transitions in
    # a replay buffer and fits the Q-network on sampled minibatches.

    # Let TensorFlow allocate GPU memory on demand instead of all at once.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    # NOTE(review): "xxx" is a placeholder environment id; replace it with a
    # real environment whose action count equals ACTIONS.
    env = gym.make("xxx")
    buff = deque(maxlen=REPLAY_MEMORY)
    model = build_model()
    epsilon = INITIAL_EPSILON

    print("Now we load weight")
    try:
        model.load_weights('model.h5')
    except Exception:
        # Best-effort warm start: any load failure falls back to fresh weights,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Cannot find the weight")
    else:
        print("Weight load successfully")

    step = 0
    for i in range(2000):
        obs = env.reset()

        # Bootstrap the 4-frame state by repeating the first frame.
        x_t = process_image(obs)
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

        for t in range(10000):
            loss = 0
            Q_sa = 0

            # Epsilon-greedy action selection.
            if random.random() <= epsilon:
                print("--------------Random Action-----------------")
                action_index = random.randrange(ACTIONS)
            else:
                q = model.predict(s_t)
                action_index = int(np.argmax(q))

            # Linearly anneal epsilon once the observation phase is over.
            if epsilon > FINAL_EPSILON and step > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # Send the chosen action to the environment (the original called
            # step() with no action and misspelled the `terminal` result).
            next_obs, r_t, terminal, info = env.step(action_index)

            # Push the newest frame to the front of the stack, dropping the oldest.
            x_t1 = process_image(next_obs)
            x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            buff.append((s_t, action_index, r_t, s_t1, terminal))

            if step > OBSERVE:
                batch_size = min(BATCH_SIZE, len(buff))
                minibatch = random.sample(buff, batch_size)
                state_t, action_t, reward_t, state_t1, terminal_t = zip(*minibatch)
                state_t = np.concatenate(state_t)
                state_t1 = np.concatenate(state_t1)
                targets = model.predict(state_t)
                Q_sa = model.predict(state_t1)
                # Zero out the bootstrap term for transitions that ended an episode.
                not_done = np.logical_not(np.asarray(terminal_t, dtype=bool))
                # Index with the actual batch size, not the BATCH_SIZE constant,
                # so the row count always matches the sampled minibatch.
                targets[np.arange(batch_size), np.asarray(action_t)] = (
                    np.asarray(reward_t) + GAMMA * np.max(Q_sa, axis=1) * not_done
                )
                loss += model.train_on_batch(state_t, targets)

            s_t = s_t1
            step += 1

            if step % 1000 == 0:
                print("Now we save model")
                # NOTE(review): weights are saved to "save_model.h5" but loaded
                # from "model.h5" at start-up; confirm whether this is intended.
                model.save_weights("save_model.h5")
            print("EPISODE ",i,"/ TIMESTEP ",step,"/ EPSILON ",epsilon,"/ ACTION ",action_index,"/ REWARD ",r_t,"/ Q_MAX ",np.max(Q_sa),"/ Loss ",loss)

            # Stop stepping a finished episode; the outer loop resets the env.
            if terminal:
                break

 

你可能感兴趣的:(cnn)