NAS with RL (Using TensorFlow)

Contents

Code 1:

train.py

net_manager.py

cnn.py

reinforce.py

Code 2:

train.py

controller.py

model.py

manager.py

nascell.py


Code 1:

Code location: nascell-automl-master

Modified code (several new Python files need to be created):

train.py

import numpy as np
import tensorflow.compat.v1 as tf

import tensorflow_addons as tfa
import argparse
import datetime

from cnn import CNN
from net_manager import NetManager
from reinforce import Reinforce

from tensorflow.examples.tutorials.mnist import input_data


def parse_args():
    desc = "TensorFlow implementation of 'Neural Architecture Search with Reinforcement Learning'"
    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('--max_layers', default=2)

    args = parser.parse_args()
    args.max_layers = int(args.max_layers)
    return args


'''
    The policy network is the main network used to search for an optimal architecture.
    It uses the NAS (Neural Architecture Search) recurrent network cell:
    https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/contrib/rnn/python/ops/rnn_cell.py#L1363

    Args:
        state: current state of required topology
        max_layers: maximum number of layers
    Returns:
        3-D tensor with new state (new topology)
'''


def policy_network(state, max_layers):
    with tf.name_scope("policy_network"):
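        # tfa.rnn.NASCell (TensorFlow Addons) stands in for the tf.contrib.rnn.NASCell
        # referenced in the docstring above, since tf.contrib was removed in TensorFlow 2.x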
        nas_cell = tfa.rnn.NASCell(4 * max_layers)
        outputs, state = tf.nn.dynamic_rnn(
            nas_cell,
            tf.expand_dims(state, -1),
            dtype=tf.float32
        )
        bias = tf.Variable([0.05] * 4 * max_layers)
        outputs = tf.nn.bias_add(outputs, bias)
        print("outputs: ", outputs, outputs[:, -1:, :],
              tf.slice(outputs, [0, 4 * max_layers - 1, 0], [1, 1, 4 * max_layers]))
        # return tf.slice(outputs, [0, 4*max_layers-1, 0], [1, 1, 4*max_layers]) # Returned last output of rnn
        return outputs[:, -1:, :]


def train(mnist):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               500, 0.96, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    net_manager = NetManager(num_input=784,
                             num_classes=10,
                             learning_rate=0.001,
                             mnist=mnist,
                             batch_size=100)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers], dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our setup, the action is used directly as the next state
        state = action[0]
        reinforce.storeRollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = "current time:  " + str(datetime.datetime.now().time()) + " episode:  " + str(
            i_episode) + " loss:  " + str(ls) + " last_state:  " + str(state) + " last_reward:  " + str(reward) + "\n"
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)


def main():
    global args
    args = parse_args()

    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
    train(mnist)


if __name__ == '__main__':
    tf.disable_v2_behavior()
    main()
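
Note: the following helper is not part of the repository; it is a minimal sketch of how the flat state/action vector above is laid out, mirroring the slicing done in net_manager.py and the field order consumed by cnn.py. The helper name decode_action and the dictionary keys are descriptive labels of my own, not identifiers from the code:

import numpy as np

def decode_action(action):
    # action has shape (1, 1, 4 * max_layers), as returned by Reinforce.get_action()
    flat = np.asarray(action)[0][0]
    layers = [flat[i:i + 4] for i in range(0, len(flat), 4)]
    # per layer: [kernel_size, num_filters, max_pool_ksize, dropout_keep_prob]
    return [dict(kernel_size=int(l[0]), num_filters=int(l[1]),
                 max_pool_ksize=int(l[2]), dropout_keep_prob=float(l[3]))
            for l in layers]

# e.g. the initial state [10.0, 128.0, 1.0, 1.0] * max_layers describes layers with
# kernel size 10, 128 filters, pooling size 1 and a dropout keep probability of 1.0.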

net_manager.py

import tensorflow.compat.v1 as tf
from cnn import CNN


class NetManager():
    def __init__(self, num_input, num_classes, learning_rate, mnist,
                 max_step_per_action=5500 * 3,
                 batch_size=100,
                 dropout_rate=0.85):

        self.num_input = num_input
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.mnist = mnist

        self.max_step_per_action = max_step_per_action
        self.batch_size = batch_size
        self.dropout_rate = dropout_rate

    def get_reward(self, action, step, pre_acc):
        action = [action[0][0][x:x + 4] for x in range(0, len(action[0][0]), 4)]
        cnn_drop_rate = [c[3] for c in action]
        with tf.Graph().as_default() as g:
            with g.container('experiment' + str(step)):
                model = CNN(self.num_input, self.num_classes, action)
                loss_op = tf.reduce_mean(model.loss)
                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
                train_op = optimizer.minimize(loss_op)

                with tf.Session() as train_sess:
                    init = tf.global_variables_initializer()
                    train_sess.run(init)

                    for step in range(self.max_step_per_action):
                        batch_x, batch_y = self.mnist.train.next_batch(self.batch_size)
                        feed = {model.X: batch_x,
                                model.Y: batch_y,
                                model.dropout_keep_prob: self.dropout_rate,
                                model.cnn_dropout_rates: cnn_drop_rate}
                        _ = train_sess.run(train_op, feed_dict=feed)

                        if step % 100 == 0:
                            # Calculate batch loss and accuracy
                            loss, acc = train_sess.run(
                                [loss_op, model.accuracy],
                                feed_dict={model.X: batch_x,
                                           model.Y: batch_y,
                                           model.dropout_keep_prob: 1.0,
                                           model.cnn_dropout_rates: [1.0] * len(cnn_drop_rate)})
                            print("Step " + str(step) +
                                  ", Minibatch Loss= " + "{:.4f}".format(loss) +
                                  ", Current accuracy= " + "{:.3f}".format(acc))
                    batch_x, batch_y = self.mnist.test.next_batch(10000)
                    loss, acc = train_sess.run(
                        [loss_op, model.accuracy],
                        feed_dict={model.X: batch_x,
                                   model.Y: batch_y,
                                   model.dropout_keep_prob: 1.0,
                                   model.cnn_dropout_rates: [1.0] * len(cnn_drop_rate)})
                    print("!!!!!!acc:", acc, pre_acc)
                    if acc - pre_acc <= 0.01:
                        return acc, acc
                    else:
                        return 0.01, acc

cnn.py

import tensorflow.compat.v1 as tf


class CNN():
    def __init__(self, num_input, num_classes, cnn_config):
        cnn = [c[0] for c in cnn_config]
        cnn_num_filters = [c[1] for c in cnn_config]
        max_pool_ksize = [c[2] for c in cnn_config]

        self.X = tf.placeholder(tf.float32,
                                [None, num_input],
                                name="input_X")
        self.Y = tf.placeholder(tf.int32, [None, num_classes], name="input_Y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, [], name="dense_dropout_keep_prob")
        self.cnn_dropout_rates = tf.placeholder(tf.float32, [len(cnn), ], name="cnn_dropout_keep_prob")

        Y = self.Y
        X = tf.expand_dims(self.X, -1)
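        # treat each flat 784-pixel image as a length-784, single-channel sequence so
        # that conv1d / max_pooling1d can be applied below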
        pool_out = X
        with tf.name_scope("Conv_part"):
            for idd, filter_size in enumerate(cnn):
                with tf.name_scope("L" + str(idd)):
                    conv_out = tf.layers.conv1d(
                        pool_out,
                        filters=cnn_num_filters[idd],
                        kernel_size=(int(filter_size)),
                        strides=1,
                        padding="SAME",
                        name="conv_out_" + str(idd),
                        activation=tf.nn.relu,
                        kernel_initializer=tf.initializers.glorot_normal(),
                        bias_initializer=tf.zeros_initializer
                    )
                    pool_out = tf.layers.max_pooling1d(
                        conv_out,
                        pool_size=(int(max_pool_ksize[idd])),
                        strides=1,
                        padding='SAME',
                        name="max_pool_" + str(idd)
                    )
                    pool_out = tf.nn.dropout(pool_out, self.cnn_dropout_rates[idd])

            flatten_pred_out = tf.layers.flatten(pool_out)
            self.logits = tf.layers.dense(flatten_pred_out, num_classes)

        self.prediction = tf.nn.softmax(self.logits, name="prediction")
        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=Y, name="loss")
        correct_pred = tf.equal(tf.argmax(self.prediction, 1), tf.argmax(Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")

reinforce.py

import tensorflow.compat.v1 as tf
import random
import numpy as np


class Reinforce():
    def __init__(self, sess, optimizer, policy_network, max_layers, global_step,
                 division_rate=100.0,
                 reg_param=0.001,
                 discount_factor=0.99,
                 exploration=0.3):
        self.sess = sess
        self.optimizer = optimizer
        self.policy_network = policy_network
        self.division_rate = division_rate
        self.reg_param = reg_param
        self.discount_factor = discount_factor
        self.max_layers = max_layers
        self.global_step = global_step

        self.reward_buffer = []
        self.state_buffer = []

        self.create_variables()
        var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        self.sess.run(tf.variables_initializer(var_lists))

    def get_action(self, state):
        # Exploration is effectively disabled: the action is always taken from the policy
        # network. The original epsilon-greedy branch is kept below for reference; to
        # re-enable it, assign self.exploration = exploration in __init__ first.
        #     if random.random() < self.exploration:
        #         return np.array([[random.sample(range(1, 35), 4 * self.max_layers)]])
        return self.sess.run(self.predicted_action, {self.states: state})

    def create_variables(self):
        with tf.name_scope("model_inputs"):
            # raw state representation
            tf.disable_eager_execution()
            self.states = tf.placeholder(tf.float32, [None, self.max_layers * 4], name="states")

        with tf.name_scope("predict_actions"):
            # initialize policy network
            with tf.variable_scope("policy_network"):
                self.policy_outputs = self.policy_network(self.states, self.max_layers)

            self.action_scores = tf.identity(self.policy_outputs, name="action_scores")
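            # predicted_action scales the RNN outputs by division_rate (100) and casts them
            # to integers to obtain concrete hyperparameter values; train_step() divides the
            # stored states by the same factor before feeding them back in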

            self.predicted_action = tf.cast(tf.scalar_mul(self.division_rate, self.action_scores), tf.int32,
                                            name="predicted_action")

        # regularization loss
        policy_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")

        # compute loss and gradients
        with tf.name_scope("compute_gradients"):
            # gradients for selecting action from policy network
            self.discounted_rewards = tf.placeholder(tf.float32, (None,), name="discounted_rewards")

            with tf.variable_scope("policy_network", reuse=True):
                self.logprobs = self.policy_network(self.states, self.max_layers)
                print("self.logprobs", self.logprobs)

            # compute policy loss and regularization loss
            self.cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logprobs[:, -1, :],
                                                                              labels=self.states)
            self.pg_loss = tf.reduce_mean(self.cross_entropy_loss)
            self.reg_loss = tf.reduce_sum(
                [tf.reduce_sum(tf.square(x)) for x in policy_network_variables])  # Regularization
            self.loss = self.pg_loss + self.reg_param * self.reg_loss

            # compute gradients
            self.gradients = self.optimizer.compute_gradients(self.loss)

            # REINFORCE update: scale each gradient by the discounted reward so that
            # actions which led to higher reward are reinforced
            for i, (grad, var) in enumerate(self.gradients):
                if grad is not None:
                    self.gradients[i] = (grad * self.discounted_rewards, var)

            # training update
            with tf.name_scope("train_policy_network"):
                # apply gradients to update policy network
                self.train_op = self.optimizer.apply_gradients(self.gradients, global_step=self.global_step)

    def storeRollout(self, state, reward):
        self.reward_buffer.append(reward)
        self.state_buffer.append(state[0])

    def train_step(self, steps_count):
        states = np.array(self.state_buffer[-steps_count:]) / self.division_rate
        rewards = self.reward_buffer[-steps_count:]
        _, ls = self.sess.run([self.train_op, self.loss],
                              {self.states: states,
                               self.discounted_rewards: rewards})
        return ls
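
With the four files above in place, the search can be launched from the repository root; MNIST is downloaded to MNIST_data/ on the first run, and the number of layers per candidate architecture is set by --max_layers (default 2):

python train.py --max_layers 2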

Code 2:

Code location: neural-architecture-search-master

Modified code:

train.py

import numpy as np
import csv

import tensorflow as tf
from keras import backend as K
from keras.datasets import cifar10
from keras.utils import to_categorical

from controller import Controller, StateSpace
from manager import NetworkManager
from model import model_fn
tf.compat.v1.disable_eager_execution()
tf.get_logger().setLevel('ERROR')
# create a shared session between Keras and Tensorflow
policy_sess = tf.compat.v1.Session()
tf.compat.v1.keras.backend.set_session(policy_sess)

NUM_LAYERS = 4  # number of layers of the state space
MAX_TRIALS = 250  # maximum number of models generated

MAX_EPOCHS = 10  # maximum number of epochs to train
CHILD_BATCHSIZE = 128  # batchsize of the child models
EXPLORATION = 0.8  # high exploration for the first 1000 steps
REGULARIZATION = 1e-3  # regularization strength
CONTROLLER_CELLS = 32  # number of cells in RNN controller
EMBEDDING_DIM = 20  # dimension of the embeddings for each state
ACCURACY_BETA = 0.8  # beta value for the moving average of the accuracy
CLIP_REWARDS = 0.0  # set non-zero to clip rewards into the [-0.05, 0.05] range; 0.0 disables clipping
RESTORE_CONTROLLER = True  # restore controller to continue training

# construct a state space
state_space = StateSpace()

# add states
state_space.add_state(name='kernel', values=[1, 3])
state_space.add_state(name='filters', values=[16, 32, 64])

# print the state space being searched
state_space.print_state_space()

# prepare the training data for the NetworkManager
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

dataset = [x_train, y_train, x_test, y_test]  # pack the dataset for the NetworkManager

previous_acc = 0.0
total_reward = 0.0

with policy_sess.as_default():
    # create the Controller and build the internal policy network
    controller = Controller(policy_sess, NUM_LAYERS, state_space,
                            reg_param=REGULARIZATION,
                            exploration=EXPLORATION,
                            controller_cells=CONTROLLER_CELLS,
                            embedding_dim=EMBEDDING_DIM,
                            restore_controller=RESTORE_CONTROLLER)

# create the Network Manager
manager = NetworkManager(dataset, epochs=MAX_EPOCHS, child_batchsize=CHILD_BATCHSIZE, clip_rewards=CLIP_REWARDS,
                         acc_beta=ACCURACY_BETA)

# get an initial random state space if controller needs to predict an
# action from the initial state
state = state_space.get_random_state_space(NUM_LAYERS)
print("Initial Random State : ", state_space.parse_state_space_list(state))
print()

# clear the previous files
controller.remove_files()

# train for the given number of trials
for trial in range(MAX_TRIALS):
    with policy_sess.as_default():
        tf.compat.v1.keras.backend.set_session(policy_sess)
        actions = controller.get_action(state)  # get an action for the previous state

    # print the action probabilities
    state_space.print_actions(actions)
    print("Predicted actions : ", state_space.parse_state_space_list(actions))

    # build a model, train and get reward and accuracy from the network manager
    reward, previous_acc = manager.get_rewards(model_fn, state_space.parse_state_space_list(actions))
    print("Rewards : ", reward, "Accuracy : ", previous_acc)

    with policy_sess.as_default():
        tf.compat.v1.keras.backend.set_session(policy_sess)

        total_reward += reward
        print("Total reward : ", total_reward)

        # actions and states are equivalent, save the state and reward
        state = actions
        controller.store_rollout(state, reward)

        # train the controller on the saved state and the discounted rewards
        loss = controller.train_step()
        print("Trial %d: Controller loss : %0.6f" % (trial + 1, loss))

        # write the results of this trial into a file
        with open('train_history.csv', mode='a+') as f:
            data = [previous_acc, reward]
            data.extend(state_space.parse_state_space_list(state))
            writer = csv.writer(f)
            writer.writerow(data)
    print()

print("Total Reward : ", total_reward)

controller.py

import numpy as np
import time
import pprint
from collections import OrderedDict

from keras import backend as K
import tensorflow as tf

import os

if not os.path.exists('weights/'):
    os.makedirs('weights/')
tf.compat.v1.disable_eager_execution()


class StateSpace:
    '''
    State Space manager

    Provides utility functions for holding "states" / "actions" that the controller
    must use to train and predict.

    Also provides a more convenient way to define the search space
    '''

    def __init__(self):
        self.states = OrderedDict()
        self.state_count_ = 0

    def add_state(self, name, values):
        '''
        Adds a "state" to the state manager, along with some metadata for efficient
        packing and unpacking of information required by the RNN Controller.

        Stores metadata such as:
        -   Global ID
        -   Name
        -   Valid Values
        -   Number of valid values possible
        -   Map from value ID to state value
        -   Map from state value to value ID

        Args:
            name: name of the state / action
            values: valid values that this state can take

        Returns:
            Global ID of the state. Can be used to refer to this state later.
        '''
        index_map = {}
        for i, val in enumerate(values):
            index_map[i] = val

        value_map = {}
        for i, val in enumerate(values):
            value_map[val] = i

        metadata = {
            'id': self.state_count_,
            'name': name,
            'values': values,
            'size': len(values),
            'index_map_': index_map,
            'value_map_': value_map,
        }
        self.states[self.state_count_] = metadata
        self.state_count_ += 1

        return self.state_count_ - 1

    def embedding_encode(self, id, value):
        '''
        Embedding index encode the specific state value

        Args:
            id: global id of the state
            value: state value

        Returns:
            embedding encoded representation of the state value
        '''
        state = self[id]
        size = state['size']
        value_map = state['value_map_']
        value_idx = value_map[value]

        one_hot = np.zeros((1, size), dtype=np.float32)
        one_hot[np.arange(1), value_idx] = value_idx + 1
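        # note: not a strict one-hot; the chosen position stores (value_idx + 1) so that
        # index 0 can remain the "default" embedding index in the controller RNN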
        return one_hot

    def get_state_value(self, id, index):
        '''
        Retrieves the state value from the state value ID

        Args:
            id: global id of the state
            index: index of the state value (usually from argmax)

        Returns:
            The actual state value at given value index
        '''
        state = self[id]
        index_map = state['index_map_']

        if (type(index) == list or type(index) == np.ndarray) and len(index) == 1:
            index = index[0]

        value = index_map[index]
        return value

    def get_random_state_space(self, num_layers):
        '''
        Constructs a random initial state space for feeding as an initial value
        to the Controller RNN

        Args:
            num_layers: number of layers to duplicate the search space

        Returns:
            A list of one hot encoded states
        '''
        states = []

        for id in range(self.size * num_layers):
            state = self[id]
            size = state['size']

            sample = np.random.choice(size, size=1)
            sample = state['index_map_'][sample[0]]
            state = self.embedding_encode(id, sample)
            states.append(state)
        return states

    def parse_state_space_list(self, state_list):
        '''
        Parses a list of one hot encoded states to retrieve a list of state values

        Args:
            state_list: list of one hot encoded states

        Returns:
            list of state values
        '''
        state_values = []
        for id, state_one_hot in enumerate(state_list):
            state_val_idx = np.argmax(state_one_hot, axis=-1)[0]
            value = self.get_state_value(id, state_val_idx)
            state_values.append(value)

        return state_values

    def print_state_space(self):
        ''' Pretty print the state space '''
        print('*' * 40, 'STATE SPACE', '*' * 40)

        pp = pprint.PrettyPrinter(indent=2, width=100)
        for id, state in self.states.items():
            pp.pprint(state)
            print()

    def print_actions(self, actions):
        ''' Print the action space properly '''
        print('Actions :')

        for id, action in enumerate(actions):
            if id % self.size == 0:
                print("*" * 20, "Layer %d" % (((id + 1) // self.size) + 1), "*" * 20)

            state = self[id]
            name = state['name']
            vals = [(n, p) for n, p in zip(state['values'], *action)]
            print("%s : " % name, vals)
        print()

    def __getitem__(self, id):
        return self.states[id % self.size]

    @property
    def size(self):
        return self.state_count_


class Controller:
    '''
    Utility class to manage the RNN Controller
    '''

    def __init__(self, policy_session, num_layers, state_space,
                 reg_param=0.001,
                 discount_factor=0.99,
                 exploration=0.8,
                 controller_cells=32,
                 embedding_dim=20,
                 clip_norm=0.0,
                 restore_controller=False):
        self.policy_session = policy_session  # type: tf.Session

        self.num_layers = num_layers
        self.state_space = state_space  # type: StateSpace
        self.state_size = self.state_space.size

        self.controller_cells = controller_cells
        self.embedding_dim = embedding_dim
        self.reg_strength = reg_param
        self.discount_factor = discount_factor
        self.exploration = exploration
        self.restore_controller = restore_controller
        self.clip_norm = clip_norm

        self.reward_buffer = []
        self.state_buffer = []

        self.cell_outputs = []
        self.policy_classifiers = []
        self.policy_actions = []
        self.policy_labels = []

        self.build_policy_network()

    def get_action(self, state):
        '''
        Gets a one hot encoded action list, either from random sampling or from
        the Controller RNN

        Args:
            state: a list of one hot encoded states, whose first value is used as initial
                state for the controller RNN

        Returns:
            A one hot encoded action list
        '''
        if np.random.random() < self.exploration:
            print("Generating random action to explore")
            actions = []

            for i in range(self.state_size * self.num_layers):
                state_ = self.state_space[i]
                size = state_['size']

                sample = np.random.choice(size, size=1)
                sample = state_['index_map_'][sample[0]]
                action = self.state_space.embedding_encode(i, sample)
                actions.append(action)
            return actions

        else:
            print("Prediction action from Controller")
            initial_state = self.state_space[0]
            size = initial_state['size']

            if state[0].shape != (1, size):
                state = state[0].reshape((1, size)).astype('int32')
            else:
                state = state[0]

            print("State input to Controller for Action : ", state.flatten())

            with self.policy_session.as_default():
                tf.compat.v1.keras.backend.set_session(self.policy_session)

                with tf.name_scope('action_prediction'):
                    pred_actions = self.policy_session.run(self.policy_actions, feed_dict={self.state_input: state})

                return pred_actions

    def build_policy_network(self):
        with self.policy_session.as_default():
            tf.compat.v1.keras.backend.set_session(self.policy_session)

            with tf.name_scope('controller'):
                with tf.compat.v1.variable_scope('policy_network'):

                    # state input is the first input fed into the controller RNN.
                    # the rest of the inputs are fed to the RNN internally
                    with tf.name_scope('state_input'):
                        state_input = tf.compat.v1.placeholder(dtype=tf.int32, shape=(1, None), name='state_input')

                    self.state_input = state_input

                    # we can use LSTM as the controller as well
                    nas_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(self.controller_cells)
                    cell_state = nas_cell.zero_state(batch_size=1, dtype=tf.float32)

                    embedding_weights = []

                    # for each possible state, create a new embedding. Reuse the weights for multiple layers.
                    with tf.compat.v1.variable_scope('embeddings', reuse=tf.compat.v1.AUTO_REUSE):
                        for i in range(self.state_size):
                            state_ = self.state_space[i]
                            size = state_['size']

                            # size + 1 is used so that 0th index is never updated and is "default" value
                            weights = tf.compat.v1.get_variable('state_embeddings_%d' % i,
                                                                shape=[size + 1, self.embedding_dim],
                                                                initializer=tf.compat.v1.initializers.random_uniform(
                                                                    -1., 1.))

                            embedding_weights.append(weights)

                        # initially, cell input will be 1st state input
                        embeddings = tf.nn.embedding_lookup(embedding_weights[0], state_input)

                    cell_input = embeddings

                    # we provide a flat list of chained input-output to the RNN
                    for i in range(self.state_size * self.num_layers):
                        state_id = i % self.state_size
                        state_space = self.state_space[i]
                        size = state_space['size']

                        with tf.name_scope('controller_output_%d' % i):
                            # feed the ith layer input (i-1 layer output) to the RNN
                            outputs, final_state = tf.compat.v1.nn.dynamic_rnn(nas_cell,
                                                                               cell_input,
                                                                               initial_state=cell_state,
                                                                               dtype=tf.float32)

                            # add a new classifier for each layers output
                            classifier = tf.compat.v1.layers.dense(outputs[:, -1, :], units=size,
                                                                   name='classifier_%d' % (i),
                                                                   reuse=False)
                            preds = tf.nn.softmax(classifier)

                            # feed the previous layer (i-1 layer output) to the next layers input, along with state
                            # take the class label
                            cell_input = tf.argmax(preds, axis=-1)
                            cell_input = tf.expand_dims(cell_input, -1, name='pred_output_%d' % (i))
                            cell_input = tf.cast(cell_input, tf.int32)
                            cell_input = tf.add(cell_input,
                                                1)  # we avoid using 0 so as to have a "default" embedding at 0th index

                            # embedding lookup of this state using its state weights ; reuse weights
                            cell_input = tf.nn.embedding_lookup(embedding_weights[state_id], cell_input,
                                                                name='cell_output_%d' % (i))

                            cell_state = final_state

                        # store the tensors for later loss computation
                        self.cell_outputs.append(cell_input)
                        self.policy_classifiers.append(classifier)
                        self.policy_actions.append(preds)

            policy_net_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
                                                               scope='policy_network')

            with tf.name_scope('optimizer'):
                self.global_step = tf.Variable(0, trainable=False)
                starter_learning_rate = 0.1
                learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, self.global_step,
                                                                     500, 0.95, staircase=True)

                tf.summary.scalar('learning_rate', learning_rate)

                self.optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=learning_rate)

            with tf.name_scope('losses'):
                self.discounted_rewards = tf.compat.v1.placeholder(tf.float32, shape=(None,), name='discounted_rewards')
                tf.summary.scalar('discounted_reward', tf.reduce_sum(self.discounted_rewards))

                # calculate sum of all the individual classifiers
                cross_entropy_loss = 0
                for i in range(self.state_size * self.num_layers):
                    classifier = self.policy_classifiers[i]
                    state_space = self.state_space[i]
                    size = state_space['size']

                    with tf.name_scope('state_%d' % (i + 1)):
                        labels = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, size),
                                                          name='cell_label_%d' % i)
                        self.policy_labels.append(labels)

                        ce_loss = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(logits=classifier, labels=labels)
                        tf.summary.scalar('state_%d_ce_loss' % (i + 1), tf.reduce_mean(ce_loss))

                    cross_entropy_loss += ce_loss

                policy_gradient_loss = tf.reduce_mean(cross_entropy_loss)
                reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in policy_net_variables])  # Regularization

                # sum up policy gradient and regularization loss
                self.total_loss = policy_gradient_loss + self.reg_strength * reg_loss
                tf.summary.scalar('total_loss', self.total_loss)

                self.gradients = self.optimizer.compute_gradients(self.total_loss)

                with tf.name_scope('policy_gradients'):
                    # clip gradients by their global norm so they don't explode, if a non-zero clip_norm was passed
                    if self.clip_norm is not None and self.clip_norm != 0.0:
                        norm = tf.constant(self.clip_norm, dtype=tf.float32)
                        gradients, vars = zip(*self.gradients)  # unpack the two lists of gradients and the variables
                        gradients, _ = tf.clip_by_global_norm(gradients, norm)  # clip by the norm
                        self.gradients = list(zip(gradients, vars))  # we need to set values later, convert to list

                    # compute policy gradients
                    for i, (grad, var) in enumerate(self.gradients):
                        if grad is not None:
                            self.gradients[i] = (grad * self.discounted_rewards, var)

                # training update
                with tf.name_scope("train_policy_network"):
                    # apply gradients to update policy network
                    self.train_op = self.optimizer.apply_gradients(self.gradients, global_step=self.global_step)

            self.summaries_op = tf.compat.v1.summary.merge_all()

            timestr = time.strftime("%Y-%m-%d-%H-%M-%S")
            filename = 'logs/%s' % timestr

            self.summary_writer = tf.compat.v1.summary.FileWriter(filename, graph=self.policy_session.graph)

            self.policy_session.run(tf.compat.v1.global_variables_initializer())
            self.saver = tf.compat.v1.train.Saver(max_to_keep=1)

            if self.restore_controller:
                path = tf.train.latest_checkpoint('weights/')

                if path is not None and tf.compat.v1.train.checkpoint_exists(path):
                    print("Loading Controller Checkpoint !")
                    self.saver.restore(self.policy_session, path)

    def store_rollout(self, state, reward):
        self.reward_buffer.append(reward)
        self.state_buffer.append(state)

        # dump buffers to file once more than 20 items have accumulated
        if len(self.reward_buffer) > 20:
            with open('buffers.txt', mode='a+') as f:
                for i in range(20):
                    state_ = self.state_buffer[i]
                    state_list = self.state_space.parse_state_space_list(state_)
                    state_list = ','.join(str(v) for v in state_list)

                    f.write("%0.4f,%s\n" % (self.reward_buffer[i], state_list))

                print("Saved buffers to file `buffers.txt` !")

            self.reward_buffer = [self.reward_buffer[-1]]
            self.state_buffer = [self.state_buffer[-1]]

    def discount_rewards(self):
        '''
        Compute discounted rewards over the entire reward buffer

        Returns:
            Discounted reward value
        '''
        rewards = np.asarray(self.reward_buffer)
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, rewards.size)):
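            # a non-zero reward resets the running sum, so each trial's reward is discounted
            # independently; since only the last element is returned, the result is simply
            # the most recent reward whenever that reward is non-zero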
            if rewards[t] != 0:
                running_add = 0
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards[-1]

    def train_step(self):
        '''
        Perform a single train step on the Controller RNN

        Returns:
            the training loss
        '''
        states = self.state_buffer[-1]
        label_list = []

        # parse the state space to get real value of the states,
        # then one hot encode them for comparison with the predictions
        state_list = self.state_space.parse_state_space_list(states)
        for id, state_value in enumerate(state_list):
            state_one_hot = self.state_space.embedding_encode(id, state_value)
            label_list.append(state_one_hot)

        # the initial input to the controller RNN
        state_input_size = self.state_space[0]['size']
        state_input = states[0].reshape((1, state_input_size)).astype('int32')
        print("State input to Controller for training : ", state_input.flatten())

        # the discounted reward value
        reward = self.discount_rewards()
        reward = np.asarray([reward]).astype('float32')

        feed_dict = {
            self.state_input: state_input,
            self.discounted_rewards: reward
        }

        # prepare the feed dict with the values of all the policy labels for each
        # of the Controller outputs
        for i, label in enumerate(label_list):
            feed_dict[self.policy_labels[i]] = label

        with self.policy_session.as_default():
            tf.compat.v1.keras.backend.set_session(self.policy_session)

            print("Training RNN (States ip) : ", state_list)
            print("Training RNN (Reward ip) : ", reward.flatten())
            _, loss, global_step = self.policy_session.run(
                [self.train_op, self.total_loss, self.global_step],
                feed_dict=feed_dict)

            # self.summary_writer.add_summary(self.summaries_op, global_step)
            self.saver.save(self.policy_session, save_path='weights/controller.ckpt', global_step=self.global_step)

            # reduce exploration after many train steps
            if global_step != 0 and global_step % 20 == 0 and self.exploration > 0.5:
                self.exploration *= 0.99

        return loss

    def remove_files(self):
        files = ['train_history.csv', 'buffers.txt']

        for file in files:
            if os.path.exists(file):
                os.remove(file)

model.py

from keras.models import Model
from keras.layers import Input, Dense, Conv2D, GlobalAveragePooling2D


# generic model design
def model_fn(actions):
    # unpack the actions from the list
    kernel_1, filters_1, kernel_2, filters_2, kernel_3, filters_3, kernel_4, filters_4 = actions

    ip = Input(shape=(32, 32, 3))
    x = Conv2D(filters_1, (kernel_1, kernel_1), strides=(2, 2), padding='same', activation='relu')(ip)
    x = Conv2D(filters_2, (kernel_2, kernel_2), strides=(1, 1), padding='same', activation='relu')(x)
    x = Conv2D(filters_3, (kernel_3, kernel_3), strides=(2, 2), padding='same', activation='relu')(x)
    x = Conv2D(filters_4, (kernel_4, kernel_4), strides=(1, 1), padding='same', activation='relu')(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(10, activation='softmax')(x)

    model = Model(ip, x)
    return model
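
For reference (not part of model.py): model_fn expects the parsed action list in the order the states were registered in train.py, i.e. alternating kernel sizes and filter counts for the four layers. A minimal usage sketch, where the particular values are just one point in the search space defined by state_space.add_state:

from model import model_fn

# (kernel, filters) for each of the 4 layers, drawn from values=[1, 3] and [16, 32, 64]
actions = [3, 32, 1, 64, 3, 64, 3, 16]
model = model_fn(actions)
model.summary()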

manager.py

import numpy as np

from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
tf.get_logger().setLevel('ERROR')


class NetworkManager:
    '''
    Helper class to manage the generation of subnetwork training given a dataset
    '''
    def __init__(self, dataset, epochs=5, child_batchsize=128, acc_beta=0.8, clip_rewards=0.0):
        '''
        Manager which is tasked with creating subnetworks, training them on a dataset, and retrieving
        rewards in terms of accuracy, which are passed to the controller RNN.

        Args:
            dataset: a tuple of 4 arrays (X_train, y_train, X_val, y_val)
            epochs: number of epochs to train the subnetworks
            child_batchsize: batchsize of training the subnetworks
            acc_beta: exponential weight for the accuracy
            clip_rewards: float - to clip rewards in [-range, range] to prevent
                large weight updates. Use when training is highly unstable.
        '''
        self.dataset = dataset
        self.epochs = epochs
        self.batchsize = child_batchsize
        self.clip_rewards = clip_rewards

        self.beta = acc_beta
        self.beta_bias = acc_beta
        self.moving_acc = 0.0

    def get_rewards(self, model_fn, actions):
        '''
        Creates a subnetwork given the actions predicted by the controller RNN,
        trains it on the provided dataset, and then returns a reward.

        Args:
            model_fn: a function which accepts one argument, a list of
                parsed actions, obtained via an inverse mapping from the
                StateSpace.
            actions: a list of parsed actions obtained via an inverse mapping
                from the StateSpace. It is in a specific order as given below:

                Consider 4 states were added to the StateSpace via the `add_state`
                method. Then the `actions` array will be of length 4, with the
                values of those states in the order that they were added.

                If number of layers is greater than one, then the `actions` array
                will be of length `4 * number of layers` (in the above scenario).
                The index from [0:4] will be for layer 0, from [4:8] for layer 1,
                etc for the number of layers.

                These action values are for direct use in the construction of models.

        Returns:
            a reward for training a model with the given actions
        '''
        with tf.compat.v1.Session(graph=tf.Graph()) as network_sess:
            tf.compat.v1.keras.backend.set_session(network_sess)

            # generate a submodel given predicted actions
            model = model_fn(actions)  # type: Model
            model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

            # unpack the dataset
            X_train, y_train, X_val, y_val = self.dataset

            # train the model using Keras methods
            model.fit(X_train, y_train, batch_size=self.batchsize, epochs=self.epochs,
                      verbose=1, validation_data=(X_val, y_val),
                      callbacks=[ModelCheckpoint('weights/temp_network.h5',
                                                 monitor='val_accuracy', verbose=1,
                                                 save_freq="epoch",
                                                 save_best_only=True,
                                                 save_weights_only=True)])

            # load best performance epoch in this training session
            model.load_weights('weights/temp_network.h5')

            # evaluate the model
            loss, acc = model.evaluate(X_val, y_val, batch_size=self.batchsize)

            # compute the reward
            reward = (acc - self.moving_acc)
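            # the exponential moving average of accuracy acts as a baseline: the reward is
            # how much this architecture improves on the running average, which lowers the
            # variance of the policy gradient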

            # if rewards are clipped, clip them in the range -0.05 to 0.05
            if self.clip_rewards:
                reward = np.clip(reward, -0.05, 0.05)

            # update moving accuracy with bias correction for 1st update
            if 0.0 < self.beta < 1.0:
                self.moving_acc = self.beta * self.moving_acc + (1 - self.beta) * acc
                self.moving_acc = self.moving_acc / (1 - self.beta_bias)
                self.beta_bias = 0

                reward = np.clip(reward, -0.1, 0.1)

            print()
            print("Manager: EWA Accuracy = ", self.moving_acc)

        # clean up resources and GPU memory
        network_sess.close()

        return reward, acc

nascell.py

from keras.engine import Layer
from keras import activations
from keras import initializers
from keras import regularizers
from keras import constraints
from keras import backend as K
from keras.layers import RNN
from keras.layers.recurrent import _generate_dropout_mask, _generate_dropout_ones

import warnings


# import tensorflow as tf
# import tensorflow.contrib.rnn as rnn

class NASCell(Layer):
    """Neural Architecture Search (NAS) recurrent network cell.

    This implements the recurrent cell from the paper:
    https://arxiv.org/abs/1611.01578
    Barret Zoph and Quoc V. Le.
    "Neural Architecture Search with Reinforcement Learning" Proc. ICLR 2017.

    The class uses an optional projection layer.

    # Arguments
        units: Positive integer, dimensionality of the output space.
        projection_units: (optional) Positive integer, The output dimensionality
            for the projection matrices.  If None, no projection is performed.
        activation: Activation function to use
            (see [activations](../activations.md)).
            If you pass None, no activation is applied
            (ie. "linear" activation: `a(x) = x`).
        recurrent_activation: Activation function to use
            for the recurrent step
            (see [activations](../activations.md)).
        projection_activation: Activation function to use
            for the projection step
            (see [activations](../activations.md)).
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs.
            (see [initializers](../initializers.md)).
        recurrent_initializer: Initializer for the `recurrent_kernel`
            weights matrix,
            used for the linear transformation of the recurrent state.
            (see [initializers](../initializers.md)).
        projection_initializer: Initializer for the `projection_kernel`
            weights matrix,
            used for the linear transformation of the projection step.
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        unit_forget_bias: Boolean.
            If True, add 1 to the bias of the forget gate at initialization.
            Setting it to true will also force `bias_initializer="zeros"`.
            This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        recurrent_regularizer: Regularizer function applied to
            the `recurrent_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        projection_regularizer: Regularizer function applied to
            the `projection_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        recurrent_constraint: Constraint function applied to
            the `recurrent_kernel` weights matrix
            (see [constraints](../constraints.md)).
        projection_constraint: Constraint function applied to
            the `projection_kernel` weights matrix
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the inputs.
        recurrent_dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the recurrent state.
        implementation: Implementation mode, either 1 or 2.
            Mode 1 will structure its operations as a larger number of
            smaller dot products and additions, whereas mode 2 will
            batch them into fewer, larger operations. These modes will
            have different performance profiles on different hardware and
            for different applications.
    """

    def __init__(self, units,
                 projection_units=None,
                 activation='tanh',
                 recurrent_activation='sigmoid',
                 projection_activation='linear',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 projection_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 unit_forget_bias=False,
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 projection_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 projection_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=2,
                 **kwargs):
        super(NASCell, self).__init__(**kwargs)
        self.units = units
        self.projection_units = projection_units
        self.activation = activations.get(activation)
        self.recurrent_activation = activations.get(recurrent_activation)
        self.projection_activation = activations.get(projection_activation)
        self.cell_activation = activations.get('relu')
        self.use_bias = use_bias

        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.projection_initializer = initializers.get(projection_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.unit_forget_bias = unit_forget_bias

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
        self.projection_regularizer = regularizers.get(projection_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(recurrent_constraint)
        self.projection_constraint = constraints.get(projection_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        self.dropout = min(1., max(0., dropout))
        self.recurrent_dropout = min(1., max(0., recurrent_dropout))
        self.implementation = implementation

        if self.projection_units is not None:
            self.state_size = (self.projection_units, self.units)
        else:
            self.state_size = (self.units, self.units)

        self._dropout_mask = None
        self._recurrent_dropout_mask = None

    def build(self, input_shape):
        input_dim = input_shape[-1]

        if self.projection_units is not None:
            recurrent_output_dim = self.projection_units
        else:
            recurrent_output_dim = self.units

        self.kernel = self.add_weight(shape=(input_dim, self.units * 8),
                                      name='kernel',
                                      initializer=self.kernel_initializer,
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)

        self.recurrent_kernel = self.add_weight(
            shape=(recurrent_output_dim, self.units * 8),
            name='recurrent_kernel',
            initializer=self.recurrent_initializer,
            regularizer=self.recurrent_regularizer,
            constraint=self.recurrent_constraint)

        if self.projection_units is not None:
            self.projection_kernel = self.add_weight(
                shape=(self.units, self.projection_units),
                name='projection_kernel',
                initializer=self.projection_initializer,
                regularizer=self.projection_regularizer,
                constraint=self.projection_constraint)

        if self.use_bias:
            if self.unit_forget_bias:
                def bias_initializer(shape, *args, **kwargs):
                    return K.concatenate([
                        self.bias_initializer((self.units,), *args, **kwargs),
                        initializers.Ones()((self.units,), *args, **kwargs),
                        self.bias_initializer((self.units * 6,), *args, **kwargs),
                    ])
            else:
                bias_initializer = self.bias_initializer
            self.bias = self.add_weight(shape=(self.units * 8,),
                                        name='bias',
                                        initializer=bias_initializer,
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None

        self.kernel_0 = self.kernel[:, :self.units]
        self.kernel_1 = self.kernel[:, self.units: self.units * 2]
        self.kernel_2 = self.kernel[:, self.units * 2: self.units * 3]
        self.kernel_3 = self.kernel[:, self.units * 3: self.units * 4]
        self.kernel_4 = self.kernel[:, self.units * 4: self.units * 5]
        self.kernel_5 = self.kernel[:, self.units * 5: self.units * 6]
        self.kernel_6 = self.kernel[:, self.units * 6: self.units * 7]
        self.kernel_7 = self.kernel[:, self.units * 7:]

        self.recurrent_kernel_0 = self.recurrent_kernel[:, :self.units]
        self.recurrent_kernel_1 = self.recurrent_kernel[:, self.units: self.units * 2]
        self.recurrent_kernel_2 = self.recurrent_kernel[:, self.units * 2: self.units * 3]
        self.recurrent_kernel_3 = self.recurrent_kernel[:, self.units * 3: self.units * 4]
        self.recurrent_kernel_4 = self.recurrent_kernel[:, self.units * 4: self.units * 5]
        self.recurrent_kernel_5 = self.recurrent_kernel[:, self.units * 5: self.units * 6]
        self.recurrent_kernel_6 = self.recurrent_kernel[:, self.units * 6: self.units * 7]
        self.recurrent_kernel_7 = self.recurrent_kernel[:, self.units * 7:]

        if self.use_bias:
            self.bias_0 = self.bias[:self.units]
            self.bias_1 = self.bias[self.units: self.units * 2]
            self.bias_2 = self.bias[self.units * 2: self.units * 3]
            self.bias_3 = self.bias[self.units * 3: self.units * 4]
            self.bias_4 = self.bias[self.units * 4: self.units * 5]
            self.bias_5 = self.bias[self.units * 5: self.units * 6]
            self.bias_6 = self.bias[self.units * 6: self.units * 7]
            self.bias_7 = self.bias[self.units * 7:]
        else:
            self.bias_0 = None
            self.bias_1 = None
            self.bias_2 = None
            self.bias_3 = None
            self.bias_4 = None
            self.bias_5 = None
            self.bias_6 = None
            self.bias_7 = None
        self.built = True

    def call(self, inputs, states, training=None):
        if 0 < self.dropout < 1 and self._dropout_mask is None:
            self._dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, K.shape(inputs)[-1]),
                self.dropout,
                training=training,
                count=8)
        if (0 < self.recurrent_dropout < 1 and
                self._recurrent_dropout_mask is None):
            _recurrent_dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, self.units),
                self.recurrent_dropout,
                training=training,
                count=8)
            self._recurrent_dropout_mask = _recurrent_dropout_mask

        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask

        h_tm1 = states[0]  # previous memory state
        c_tm1 = states[1]  # previous carry state

        if self.implementation == 1:
            if 0 < self.dropout < 1.:
                inputs_0 = inputs * dp_mask[0]
                inputs_1 = inputs * dp_mask[1]
                inputs_2 = inputs * dp_mask[2]
                inputs_3 = inputs * dp_mask[3]
                inputs_4 = inputs * dp_mask[4]
                inputs_5 = inputs * dp_mask[5]
                inputs_6 = inputs * dp_mask[6]
                inputs_7 = inputs * dp_mask[7]
            else:
                inputs_0 = inputs
                inputs_1 = inputs
                inputs_2 = inputs
                inputs_3 = inputs
                inputs_4 = inputs
                inputs_5 = inputs
                inputs_6 = inputs
                inputs_7 = inputs

            x_0 = K.dot(inputs_0, self.kernel_0)
            x_1 = K.dot(inputs_1, self.kernel_1)
            x_2 = K.dot(inputs_2, self.kernel_2)
            x_3 = K.dot(inputs_3, self.kernel_3)
            x_4 = K.dot(inputs_4, self.kernel_4)
            x_5 = K.dot(inputs_5, self.kernel_5)
            x_6 = K.dot(inputs_6, self.kernel_6)
            x_7 = K.dot(inputs_7, self.kernel_7)

            if self.use_bias:
                x_0 = K.bias_add(x_0, self.bias_0)
                x_1 = K.bias_add(x_1, self.bias_1)
                x_2 = K.bias_add(x_2, self.bias_2)
                x_3 = K.bias_add(x_3, self.bias_3)
                x_4 = K.bias_add(x_4, self.bias_4)
                x_5 = K.bias_add(x_5, self.bias_5)
                x_6 = K.bias_add(x_6, self.bias_6)
                x_7 = K.bias_add(x_7, self.bias_7)

            if 0 < self.recurrent_dropout < 1.:
                h_tm1_0 = h_tm1 * rec_dp_mask[0]
                h_tm1_1 = h_tm1 * rec_dp_mask[1]
                h_tm1_2 = h_tm1 * rec_dp_mask[2]
                h_tm1_3 = h_tm1 * rec_dp_mask[3]
                h_tm1_4 = h_tm1 * rec_dp_mask[4]
                h_tm1_5 = h_tm1 * rec_dp_mask[5]
                h_tm1_6 = h_tm1 * rec_dp_mask[6]
                h_tm1_7 = h_tm1 * rec_dp_mask[7]
            else:
                h_tm1_0 = h_tm1
                h_tm1_1 = h_tm1
                h_tm1_2 = h_tm1
                h_tm1_3 = h_tm1
                h_tm1_4 = h_tm1
                h_tm1_5 = h_tm1
                h_tm1_6 = h_tm1
                h_tm1_7 = h_tm1
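            # NAS cell dataflow (mirroring the reference TF contrib NASCell):
            # eight first-layer branches combine the input projections x_0..x_7
            # with recurrent projections of h_tm1 (branch 3 multiplies instead
            # of adding); the branches are reduced pairwise in a second layer,
            # the previous cell state c_tm1 is injected into branch 0, the
            # third layer produces the new cell state c, and the new hidden
            # state h is the (optionally projected) product of the two
            # third-layer outputs.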

            # First Layer
            layer1_0 = self.recurrent_activation(x_0 + K.dot(h_tm1_0, self.recurrent_kernel_0))
            layer1_1 = self.cell_activation(x_1 + K.dot(h_tm1_1, self.recurrent_kernel_1))
            layer1_2 = self.recurrent_activation(x_2 + K.dot(h_tm1_2, self.recurrent_kernel_2))
            layer1_3 = self.cell_activation(x_3 * K.dot(h_tm1_3, self.recurrent_kernel_3))
            layer1_4 = self.activation(x_4 + K.dot(h_tm1_4, self.recurrent_kernel_4))
            layer1_5 = self.recurrent_activation(x_5 + K.dot(h_tm1_5, self.recurrent_kernel_5))
            layer1_6 = self.activation(x_6 + K.dot(h_tm1_6, self.recurrent_kernel_6))
            layer1_7 = self.recurrent_activation(x_7 + K.dot(h_tm1_7, self.recurrent_kernel_7))

            # Second Layer
            layer2_0 = self.activation(layer1_0 * layer1_1)
            layer2_1 = self.activation(layer1_2 + layer1_3)
            layer2_2 = self.activation(layer1_4 * layer1_5)
            layer2_3 = self.recurrent_activation(layer1_6 + layer1_7)

            # Inject the Cell
            layer2_0 = self.activation(layer2_0 + c_tm1)

            # Third Layer
            layer3_0_pre = layer2_0 * layer2_1
            c = layer3_0_pre  # create a new cell
            layer3_0 = layer3_0_pre
            layer3_1 = self.activation(layer2_2 + layer2_3)

            # Final Layer
            h = self.activation(layer3_0 * layer3_1)

            if self.projection_units is not None:
                h = self.projection_activation(K.dot(h, self.projection_kernel))

        else:
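            # Implementation 2: compute all eight input and recurrent
            # projections as two large matmuls against the full kernels,
            # then slice the results into the per-branch blocks z0..z7 and
            # zr0..zr7 used below.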
            if 0. < self.dropout < 1.:
                inputs *= dp_mask[0]
            z = K.dot(inputs, self.kernel)
            if 0. < self.recurrent_dropout < 1.:
                h_tm1 *= rec_dp_mask[0]
            zr = K.dot(h_tm1, self.recurrent_kernel)
            if self.use_bias:
                zr = K.bias_add(zr, self.bias)

            z0 = z[:, :self.units]
            z1 = z[:, self.units: 2 * self.units]
            z2 = z[:, 2 * self.units: 3 * self.units]
            z3 = z[:, 3 * self.units: 4 * self.units]
            z4 = z[:, 4 * self.units: 5 * self.units]
            z5 = z[:, 5 * self.units: 6 * self.units]
            z6 = z[:, 6 * self.units: 7 * self.units]
            z7 = z[:, 7 * self.units:]

            zr0 = zr[:, :self.units]
            zr1 = zr[:, self.units: 2 * self.units]
            zr2 = zr[:, 2 * self.units: 3 * self.units]
            zr3 = zr[:, 3 * self.units: 4 * self.units]
            zr4 = zr[:, 4 * self.units: 5 * self.units]
            zr5 = zr[:, 5 * self.units: 6 * self.units]
            zr6 = zr[:, 6 * self.units: 7 * self.units]
            zr7 = zr[:, 7 * self.units:]

            # First Layer
            layer1_0 = self.recurrent_activation(z0 + zr0)
            layer1_1 = self.cell_activation(z1 + zr1)
            layer1_2 = self.recurrent_activation(z2 + zr2)
            layer1_3 = self.cell_activation(z3 * zr3)
            layer1_4 = self.activation(z4 + zr4)
            layer1_5 = self.recurrent_activation(z5 + zr5)
            layer1_6 = self.activation(z6 + zr6)
            layer1_7 = self.recurrent_activation(z7 + zr7)

            # Second Layer
            layer2_0 = self.activation(layer1_0 * layer1_1)
            layer2_1 = self.activation(layer1_2 + layer1_3)
            layer2_2 = self.activation(layer1_4 * layer1_5)
            layer2_3 = self.recurrent_activation(layer1_6 + layer1_7)

            # Inject the Cell
            layer2_0 = self.activation(layer2_0 + c_tm1)

            # Third Layer
            layer3_0_pre = layer2_0 * layer2_1
            c = layer3_0_pre
            layer3_0 = layer3_0_pre
            layer3_1 = self.activation(layer2_2 + layer2_3)

            # Final Layer
            h = self.activation(layer3_0 * layer3_1)

            if self.projection_units is not None:
                h = self.projection_activation(K.dot(h, self.projection_kernel))

        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True
        return h, [h, c]

    def get_config(self):
        config = {'units': self.units,
                  'projection_units': self.projection_units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'projection_activation': activations.serialize(self.projection_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'projection_initializer': initializers.serialize(self.projection_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'unit_forget_bias': self.unit_forget_bias,
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'projection_regularizer': regularizers.serialize(self.projection_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'projection_constraint': constraints.serialize(self.projection_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(NASCell, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class NASRNN(RNN):
    """Neural Architecture Search (NAS) recurrent network cell.

    This implements the recurrent cell from the paper:
    https://arxiv.org/abs/1611.01578
    Barret Zoph and Quoc V. Le.
    "Neural Architecture Search with Reinforcement Learning" Proc. ICLR 2017.

    The class uses an optional projection layer.

    # Arguments
        units: Positive integer, dimensionality of the output space.
        projection_units: (optional) Positive integer, the output dimensionality
            for the projection matrices. If None, no projection is performed.
        activation: Activation function to use
            (see [activations](../activations.md)).
            If you pass None, no activation is applied
            (ie. "linear" activation: `a(x) = x`).
        recurrent_activation: Activation function to use
            for the recurrent step
            (see [activations](../activations.md)).
        projection_activation: Activation function to use
            for the projection step
            (see [activations](../activations.md)).
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs.
            (see [initializers](../initializers.md)).
        recurrent_initializer: Initializer for the `recurrent_kernel`
            weights matrix,
            used for the linear transformation of the recurrent state.
            (see [initializers](../initializers.md)).
        projection_initializer: Initializer for the `projection_kernel`
            weights matrix,
            used for the linear transformation of the projection step.
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        unit_forget_bias: Boolean.
            If True, add 1 to the bias of the forget gate at initialization.
            Setting it to true will also force `bias_initializer="zeros"`.
            This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        recurrent_regularizer: Regularizer function applied to
            the `recurrent_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        projection_regularizer: Regularizer function applied to
            the `projection_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        recurrent_constraint: Constraint function applied to
            the `recurrent_kernel` weights matrix
            (see [constraints](../constraints.md)).
        projection_constraint: Constraint function applied to
            the `projection_kernel` weights matrix
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the inputs.
        recurrent_dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the recurrent state.
        implementation: Implementation mode, either 1 or 2.
            Mode 1 will structure its operations as a larger number of
            smaller dot products and additions, whereas mode 2 will
            batch them into fewer, larger operations. These modes will
            have different performance profiles on different hardware and
            for different applications.
        return_sequences: Boolean. Whether to return the last output
            in the output sequence, or the full sequence.
        return_state: Boolean. Whether to return the last state
            in addition to the output.
        go_backwards: Boolean (default False).
            If True, process the input sequence backwards and return the
            reversed sequence.
        stateful: Boolean (default False). If True, the last state
            for each sample at index i in a batch will be used as initial
            state for the sample of index i in the following batch.
        unroll: Boolean (default False).
            If True, the network will be unrolled,
            else a symbolic loop will be used.
            Unrolling can speed-up a RNN,
            although it tends to be more memory-intensive.
            Unrolling is only suitable for short sequences.

    # References
        - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf) (original 1997 paper)
        - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
        - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
        - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
        - [Nested LSTMs](https://arxiv.org/abs/1801.10308)
    """

    def __init__(self, units,
                 projection_units=None,
                 activation='tanh',
                 recurrent_activation='sigmoid',
                 projection_activation='linear',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 projection_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 unit_forget_bias=False,
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 projection_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 projection_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=2,
                 return_sequences=False,
                 return_state=False,
                 go_backwards=False,
                 stateful=False,
                 unroll=False,
                 **kwargs):
        if implementation == 0:
            warnings.warn('`implementation=0` has been deprecated, '
                          'and now defaults to `implementation=2`. '
                          'Please update your layer call.')
        if K.backend() == 'theano':
            warnings.warn(
                'RNN dropout is no longer supported with the Theano backend '
                'due to technical limitations. '
                'You can either set `dropout` and `recurrent_dropout` to 0, '
                'or use the TensorFlow backend.')
            dropout = 0.
            recurrent_dropout = 0.

        cell = NASCell(units, projection_units,
                       activation=activation,
                       recurrent_activation=recurrent_activation,
                       projection_activation=projection_activation,
                       use_bias=use_bias,
                       kernel_initializer=kernel_initializer,
                       recurrent_initializer=recurrent_initializer,
                       projection_initializer=projection_initializer,
                       unit_forget_bias=unit_forget_bias,
                       bias_initializer=bias_initializer,
                       kernel_regularizer=kernel_regularizer,
                       recurrent_regularizer=recurrent_regularizer,
                       bias_regularizer=bias_regularizer,
                       projection_regularizer=projection_regularizer,
                       kernel_constraint=kernel_constraint,
                       recurrent_constraint=recurrent_constraint,
                       bias_constraint=bias_constraint,
                       projection_constraint=projection_constraint,
                       dropout=dropout,
                       recurrent_dropout=recurrent_dropout,
                       implementation=implementation)
        super(NASRNN, self).__init__(cell,
                                     return_sequences=return_sequences,
                                     return_state=return_state,
                                     go_backwards=go_backwards,
                                     stateful=stateful,
                                     unroll=unroll,
                                     **kwargs)
        self.activity_regularizer = regularizers.get(activity_regularizer)

    def call(self, inputs, mask=None, training=None, initial_state=None, constants=None):
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        return super(NASRNN, self).call(inputs,
                                        mask=mask,
                                        training=training,
                                        initial_state=initial_state,
                                        constants=constants)

    @property
    def units(self):
        return self.cell.units

    @property
    def projection_units(self):
        return self.cell.projection_units

    @property
    def activation(self):
        return self.cell.activation

    @property
    def recurrent_activation(self):
        return self.cell.recurrent_activation

    @property
    def projection_activation(self):
        return self.cell.projection_activation

    @property
    def use_bias(self):
        return self.cell.use_bias

    @property
    def kernel_initializer(self):
        return self.cell.kernel_initializer

    @property
    def recurrent_initializer(self):
        return self.cell.recurrent_initializer

    @property
    def bias_initializer(self):
        return self.cell.bias_initializer

    @property
    def projection_initializer(self):
        return self.cell.projection_initializer

    @property
    def unit_forget_bias(self):
        return self.cell.unit_forget_bias

    @property
    def kernel_regularizer(self):
        return self.cell.kernel_regularizer

    @property
    def recurrent_regularizer(self):
        return self.cell.recurrent_regularizer

    @property
    def bias_regularizer(self):
        return self.cell.bias_regularizer

    @property
    def projection_regularizer(self):
        return self.cell.projection_regularizer

    @property
    def kernel_constraint(self):
        return self.cell.kernel_constraint

    @property
    def recurrent_constraint(self):
        return self.cell.recurrent_constraint

    @property
    def bias_constraint(self):
        return self.cell.bias_constraint

    @property
    def projection_constraint(self):
        return self.cell.projection_constraint

    @property
    def dropout(self):
        return self.cell.dropout

    @property
    def recurrent_dropout(self):
        return self.cell.recurrent_dropout

    @property
    def implementation(self):
        return self.cell.implementation

    def get_config(self):
        config = {'units': self.units,
                  'projection_units': self.projection_units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'projection_activation': activations.serialize(self.projection_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'projection_initializer': initializers.serialize(self.projection_initializer),
                  'unit_forget_bias': self.unit_forget_bias,
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'projection_regularizer': regularizers.serialize(self.projection_regularizer),
                  'activity_regularizer': regularizers.serialize(self.activity_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'projection_constraint': constraints.serialize(self.projection_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(NASRNN, self).get_config()
        del base_config['cell']
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        if 'implementation' in config and config['implementation'] == 0:
            config['implementation'] = 2
        return cls(**config)
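

A minimal usage sketch of the NASRNN layer defined above. This snippet is not part of the original code: the unit counts, input shape, and the plain keras imports are illustrative assumptions (swap in tensorflow.keras if that is what nascell.py imports), and projection_units is left at its default of None.

# Hypothetical usage sketch for the NASRNN layer above (not from the
# original post); imports, shapes and unit counts are assumptions.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
# 10 timesteps of 32 features -> a NAS cell with 64 units; pass
# projection_units to project the hidden state to a smaller size.
model.add(NASRNN(64, input_shape=(10, 32)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

# Quick sanity check on random data.
x = np.random.random((8, 10, 32)).astype('float32')
y = np.random.randint(0, 2, size=(8, 1))
model.fit(x, y, epochs=1, batch_size=8, verbose=0)

Because both NASCell and NASRNN implement get_config()/from_config(), a model saved with model.save() should be loadable again by passing custom_objects={'NASRNN': NASRNN, 'NASCell': NASCell} to keras.models.load_model.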
