I have been working with MAML for half a year now. After hacking this supposedly magical framework beyond recognition, I have come back to where I started. Going from never having touched TensorFlow, and not knowing what a deep neural network even was, to six months of tinkering with experiments has gradually given me a feel for the ideas behind MAML. The road has been bumpy, so I am writing things down here as simple notes. The walkthrough below covers three layers: the dataset, the code, and the network architecture.
Let me start with the 5-way classification setting on miniImagenet (download), the dataset my experiments use. Once downloaded, it is a 100x600 dataset: a collection of 100 classes of flowers, birds, insects, and other natural and everyday objects, with 600 images per class. The samples look something like this:
The samples shown here share a uniform size. The original images are not necessarily the same size, but the data has already been sorted into classes (each image is labeled via its file name), resized to 84x84, and split into train, val, and test sets.
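As a quick sanity check that the download and split worked, here is a minimal sketch (the data/miniImagenet/<split>/<class>/ layout and the root path are my assumption; adjust them to whatever your preprocessing produced):

import os

root = 'data/miniImagenet'  # hypothetical location of the processed dataset
for split in ('train', 'val', 'test'):
    classes = os.listdir(os.path.join(root, split))
    print(split, len(classes))  # expected: train 64, val 16, test 20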
Next, let us look at how the code processes the data.

Part 1: Drawing samples for training and validation
def make_data_tensor(self, train=True):
    if train:
        # directories of all the pre-sorted sample classes under the train folder
        folders = self.metatrain_character_folders
        # number of tasks, not number of meta-iterations. (divide by metabatch size to measure)
        # number of sample batches to enqueue; validation uses a different
        # number because the miniImagenet splits differ in size: train has
        # 64 classes, val has 16, test has 20
        num_total_batches = 20000  # 200000
    else:
        folders = self.metaval_character_folders
        num_total_batches = 600
    # make list of files
    # print('Generating filenames:', folders)
    print('Generating filenames')
    all_filenames = []
    all_labels = []
    # loop over every batch of samples; each batch draws its classes at
    # random (for train, 5 out of 64, and so on)
    for _ in range(num_total_batches):
        # randomly pick num_classes classes
        sampled_character_folders = random.sample(folders, self.num_classes)
        # shuffle the class order
        random.shuffle(sampled_character_folders)
        # this returns num_classes classes with self.num_samples_per_class
        # samples each: a list of (label, path) pairs laid out class by class
        labels_and_images = get_images(sampled_character_folders, range(self.num_classes), nb_samples=self.num_samples_per_class, shuffle=False)
        # make sure the above isn't randomized order
        # list of sample paths
        filenames = [li[1] for li in labels_and_images]
        # list of sample labels; every batch is drawn with the same label
        # layout, so later on only labels is ever needed when dequeuing, not
        # all_labels (which I added myself)
        labels = [li[0] for li in labels_and_images]
        all_filenames.extend(filenames)
        all_labels.extend(labels)
    # make queue for tensorflow to read from
    # this queue holds the sample paths: all paths are enqueued first, and
    # the actual image data is only read once training starts. That is
    # TensorFlow's graph-execution model: build the nodes first, and when a
    # node's value is needed, evaluate its dependencies one by one.
    filename_queue = tf.train.string_input_producer(tf.convert_to_tensor(all_filenames), shuffle=False)
    print('Generating image processing ops', len(list(set(all_filenames))))
    image_reader = tf.WholeFileReader()
    _, image_file = image_reader.read(filename_queue)
    if FLAGS.datasource == 'miniimagenet':
        # per-sample processing: decode, flatten to a row vector, normalize
        image = tf.image.decode_jpeg(image_file, channels=3)
        image.set_shape((self.img_size[0], self.img_size[1], 3))
        image = tf.reshape(image, [self.dim_input])
        image = tf.cast(image, tf.float32) / 255.0
    else:
        image = tf.image.decode_png(image_file)
        image.set_shape((self.img_size[0], self.img_size[1], 1))
        image = tf.reshape(image, [self.dim_input])
        image = tf.cast(image, tf.float32) / 255.0
        image = 1.0 - image  # invert
    num_preprocess_threads = 1  # TODO - enable this to be set to >1 (multi-threaded reading)
    min_queue_examples = 256
    # samples per batch = number of classes x samples per class
    examples_per_batch = self.num_classes * self.num_samples_per_class
    # samples needed for training = number of batches x samples per batch
    batch_image_size = self.batch_size * examples_per_batch
    print('Batching images')
    # the queue emits one vector at a time; this op tells TensorFlow to pack
    # the dequeued samples into batches of size batch_image_size
    images = tf.train.batch(
        [image],
        batch_size=batch_image_size,
        num_threads=num_preprocess_threads,
        capacity=min_queue_examples + 3 * batch_image_size,
    )
    all_image_batches, all_label_batches = [], []
    print('Manipulating image data to be right shape')
    # interleave the data here: the queue actually holds each class's
    # samples contiguously, so the order has to be rearranged
    for i in range(self.batch_size):
        # take one batch's samples at a time
        image_batch = images[i*examples_per_batch:(i+1)*examples_per_batch]
        if FLAGS.datasource == 'omniglot':
            # omniglot augments the dataset by rotating digits to create new classes
            # get rotation per class (e.g. 0,1,2,0,0 if there are 5 classes)
            rotations = tf.multinomial(tf.log([[1., 1., 1., 1.]]), self.num_classes)
        # the labels corresponding to this batch's samples
        label_batch = tf.convert_to_tensor(labels)
        new_list, new_label_list = [], []
        # rearrange the order, one round per sample-per-class
        for k in range(self.num_samples_per_class):
            # with, e.g., 5-way classification, classes are indexed 0..4
            class_idxs = tf.range(0, self.num_classes)
            # shuffle the class order
            class_idxs = tf.random_shuffle(class_idxs)
            # compute the shuffled index list, e.g. [0,3,2,1,4]*16+1 = [1,49,33,17,65]
            true_idxs = class_idxs*self.num_samples_per_class + k
            # gather those indices from image_batch and append them to
            # new_list: one sample is drawn from every class, in random class
            # order, and the rounds repeat until every class's samples are
            # used up, interleaving the num_classes classes. The labels are
            # shuffled in exactly the same order.
            new_list.append(tf.gather(image_batch, true_idxs))
            if FLAGS.datasource == 'omniglot':  # and FLAGS.train:
                new_list[-1] = tf.stack([tf.reshape(tf.image.rot90(
                    tf.reshape(new_list[-1][ind], [self.img_size[0], self.img_size[1], 1]),
                    k=tf.cast(rotations[0, class_idxs[ind]], tf.int32)), (self.dim_input,))
                    for ind in range(self.num_classes)])
            new_label_list.append(tf.gather(label_batch, true_idxs))
        # concatenate this batch's reordered samples into a single tensor;
        # axis 0 means stacking vertically (no new dimension is added)
        new_list = tf.concat(new_list, 0)  # has shape [self.num_classes*self.num_samples_per_class, self.dim_input]
        new_label_list = tf.concat(new_label_list, 0)
        all_image_batches.append(new_list)
        all_label_batches.append(new_label_list)
    # stack the batches along a third dimension: 2-D becomes 3-D
    all_image_batches = tf.stack(all_image_batches)
    all_label_batches = tf.stack(all_label_batches)
    # one-hot encode the labels: the position of the class is set to 1 and
    # the rest to 0, e.g. [0,0,1,0,0] means class 2
    all_label_batches = tf.one_hot(all_label_batches, self.num_classes)
    return all_image_batches, all_label_batches
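To make the reordering concrete, here is a standalone NumPy sketch of the same interleaving idea (toy sizes I picked; this is not the repo's code):

import numpy as np

num_classes, num_samples_per_class = 5, 2
# the queue emits samples class-contiguously: [c0s0, c0s1, c1s0, c1s1, ...]
image_batch = np.arange(num_classes * num_samples_per_class)

order = []
for k in range(num_samples_per_class):
    class_idxs = np.random.permutation(num_classes)       # shuffled classes
    order.extend(class_idxs * num_samples_per_class + k)  # one sample per class
print(image_batch[order])  # e.g. [2 8 0 6 4 9 1 5 3 7]: classes interleaved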
Next, the data is split into groups a and b; the whole point of the model is to transfer what it learns on group a to group b. This code is explained further in the training section.
if FLAGS.datasource == 'miniimagenet' or FLAGS.datasource == 'omniglot':
    tf_data_load = True
    num_classes = data_generator.num_classes

    if FLAGS.train:  # only construct training model if needed
        random.seed(5)
        # this call is the method explained above; the lines that follow
        # split the samples read from the queue into groups (note that at
        # this point we are still building the computation graph, and none
        # of the ops have actually run)
        image_tensor, label_tensor = data_generator.make_data_tensor()
        # tf.slice(tensor, begin, size) extracts part of a tensor: begin
        # gives the starting position and size the extent, with -1 meaning
        # "everything that is left". This line takes, from image_tensor, all
        # meta-batches (the first -1), the first
        # num_classes*FLAGS.update_batch_size rows (samples), and every
        # column of each sample. The lines below follow the same pattern.
        inputa = tf.slice(image_tensor, [0,0,0], [-1,num_classes*FLAGS.update_batch_size, -1])
        inputb = tf.slice(image_tensor, [0,num_classes*FLAGS.update_batch_size, 0], [-1,-1,-1])
        labela = tf.slice(label_tensor, [0,0,0], [-1,num_classes*FLAGS.update_batch_size, -1])
        labelb = tf.slice(label_tensor, [0,num_classes*FLAGS.update_batch_size, 0], [-1,-1,-1])
        input_tensors = {'inputa': inputa, 'inputb': inputb, 'labela': labela, 'labelb': labelb}

    random.seed(6)
    image_tensor, label_tensor = data_generator.make_data_tensor(train=False)
    inputa = tf.slice(image_tensor, [0,0,0], [-1,num_classes*FLAGS.update_batch_size, -1])
    inputb = tf.slice(image_tensor, [0,num_classes*FLAGS.update_batch_size, 0], [-1,-1,-1])
    labela = tf.slice(label_tensor, [0,0,0], [-1,num_classes*FLAGS.update_batch_size, -1])
    labelb = tf.slice(label_tensor, [0,num_classes*FLAGS.update_batch_size, 0], [-1,-1,-1])
    metaval_input_tensors = {'inputa': inputa, 'inputb': inputb, 'labela': labela, 'labelb': labelb}
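For concreteness, a minimal NumPy sketch of the shapes this split produces (assuming 5-way 1-shot with meta_batch_size=4; the equal a/b halves suggest the generator was built with num_samples_per_class = 2*update_batch_size):

import numpy as np

meta_batch_size, num_classes, update_batch_size = 4, 5, 1
dim_input = 84 * 84 * 3                                  # 21168
examples_per_task = num_classes * 2 * update_batch_size  # 10 samples per task
image_tensor = np.zeros((meta_batch_size, examples_per_task, dim_input))

split = num_classes * update_batch_size  # first 5 samples form group a
inputa = image_tensor[:, :split, :]      # shape (4, 5, 21168)
inputb = image_tensor[:, split:, :]      # shape (4, 5, 21168)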
Part 2: The training phase

MAML covers several deep-learning settings, from classification to regression, all of which are described in the original paper. Because of this, the code branches in many places. These notes annotate the 5-way classification case on miniImagenet; the other cases are handled somewhat differently. The training phase starts by building the model:
# initialize the model
model = MAML(dim_input, dim_output, test_num_updates=test_num_updates)
# build the model
if FLAGS.train or not tf_data_load:
    model.construct_model(input_tensors=input_tensors, prefix='metatrain_')
if tf_data_load:
    model.construct_model(input_tensors=metaval_input_tensors, prefix='metaval_')

Model construction, including the way training is implemented, looks like this:
def construct_model(self, input_tensors=None, prefix='metatrain_'):
    # a: training data for inner gradient, b: test data for meta gradient
    # set up the input data
    if input_tensors is None:
        self.inputa = tf.placeholder(tf.float32)
        self.inputb = tf.placeholder(tf.float32)
        self.labela = tf.placeholder(tf.float32)
        self.labelb = tf.placeholder(tf.float32)
    else:
        self.inputa = input_tensors['inputa']
        self.inputb = input_tensors['inputb']
        self.labela = input_tensors['labela']
        self.labelb = input_tensors['labelb']

    # reuse the model weights if they already exist, otherwise initialize them
    with tf.variable_scope('model', reuse=None) as training_scope:
        if 'weights' in dir(self):
            training_scope.reuse_variables()
            weights = self.weights
        else:
            # Define the weights
            self.weights = weights = self.construct_weights()

        # outputbs[i] and lossesb[i] is the output and loss after i+1 gradient updates
        lossesa, outputas, lossesb, outputbs = [], [], [], []
        accuraciesa, accuraciesb = [], []
        num_updates = max(self.test_num_updates, FLAGS.num_updates)
        outputbs = [[]]*num_updates
        lossesb = [[]]*num_updates
        accuraciesb = [[]]*num_updates
        # the meta-learning step for a single task
        def task_metalearn(inp, reuse=True):
            """ Perform gradient descent for one task in the meta-batch. """
            inputa, inputb, labela, labelb = inp
            task_outputbs, task_lossesb = [], []
            if self.classification:
                task_accuraciesb = []
            # forward pass; the network structure and the forward method are
            # annotated further below
            task_outputa = self.forward(inputa, weights, reuse=reuse)  # only reuse on the first iter
            # compute the loss on group a
            task_lossa = self.loss_func(task_outputa, labela)
            # compute the gradients
            grads = tf.gradients(task_lossa, list(weights.values()))
            if FLAGS.stop_grad:
                grads = [tf.stop_gradient(grad) for grad in grads]
            gradients = dict(zip(weights.keys(), grads))
            # temporarily updated ("fast") weights
            fast_weights = dict(zip(weights.keys(), [weights[key] - self.update_lr*gradients[key] for key in weights.keys()]))
            # forward pass on group b with the fast weights
            output = self.forward(inputb, fast_weights, reuse=True)
            task_outputbs.append(output)
            task_lossesb.append(self.loss_func(output, labelb))

            for j in range(num_updates - 1):
                # keep adapting on group a with the fast weights, carry the
                # adapted weights over to group b, and update the fast
                # weights again; iterate this num_updates - 1 more times
                loss = self.loss_func(self.forward(inputa, fast_weights, reuse=True), labela)
                grads = tf.gradients(loss, list(fast_weights.values()))
                if FLAGS.stop_grad:
                    grads = [tf.stop_gradient(grad) for grad in grads]
                gradients = dict(zip(fast_weights.keys(), grads))
                fast_weights = dict(zip(fast_weights.keys(), [fast_weights[key] - self.update_lr*gradients[key] for key in fast_weights.keys()]))
                output = self.forward(inputb, fast_weights, reuse=True)
                task_outputbs.append(output)
                task_lossesb.append(self.loss_func(output, labelb))

            # pack the network outputs and losses into the return list
            task_output = [task_outputa, task_outputbs, task_lossa, task_lossesb]
            # compute the accuracies
            if self.classification:
                task_accuracya = tf.contrib.metrics.accuracy(tf.argmax(tf.nn.softmax(task_outputa), 1), tf.argmax(labela, 1))
                for j in range(num_updates):
                    task_accuraciesb.append(tf.contrib.metrics.accuracy(tf.argmax(tf.nn.softmax(task_outputbs[j]), 1), tf.argmax(labelb, 1)))
                task_output.extend([task_accuracya, task_accuraciesb])

            return task_output
        if FLAGS.norm != 'None':
            # to initialize the batch norm vars, might want to combine this, and not run idx 0 twice.
            unused = task_metalearn((self.inputa[0], self.inputb[0], self.labela[0], self.labelb[0]), False)
        # declare the dtypes that tf.map_fn will return
        out_dtype = [tf.float32, [tf.float32]*num_updates, tf.float32, [tf.float32]*num_updates]
        if self.classification:
            out_dtype.extend([tf.float32, [tf.float32]*num_updates])
        # run task_metalearn over the tasks of the meta-batch in parallel:
        # the per-task "small unit" learning
        result = tf.map_fn(task_metalearn, elems=(self.inputa, self.inputb, self.labela, self.labelb), dtype=out_dtype, parallel_iterations=FLAGS.meta_batch_size)
        if self.classification:
            outputas, outputbs, lossesa, lossesb, accuraciesa, accuraciesb = result
        else:
            outputas, outputbs, lossesa, lossesb = result
    # expose the important outputs as model attributes so that they exist
    # as graph nodes and can be evaluated during training
    ## Performance & Optimization
    if 'train' in prefix:
        self.total_loss1 = total_loss1 = tf.reduce_sum(lossesa) / tf.to_float(FLAGS.meta_batch_size)
        self.total_losses2 = total_losses2 = [tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(num_updates)]
        # after the map_fn
        self.outputas, self.outputbs = outputas, outputbs
        if self.classification:
            self.total_accuracy1 = total_accuracy1 = tf.reduce_sum(accuraciesa) / tf.to_float(FLAGS.meta_batch_size)
            self.total_accuracies2 = total_accuracies2 = [tf.reduce_sum(accuraciesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(num_updates)]
        # optimizers: this is where the meta-weights are actually updated
        self.pretrain_op = tf.train.AdamOptimizer(self.meta_lr).minimize(total_loss1)

        if FLAGS.metatrain_iterations > 0:
            optimizer = tf.train.AdamOptimizer(self.meta_lr)
            self.gvs = gvs = optimizer.compute_gradients(self.total_losses2[FLAGS.num_updates-1])
            if FLAGS.datasource == 'miniimagenet':
                gvs = [(tf.clip_by_value(grad, -10, 10), var) for grad, var in gvs]
            self.metatrain_op = optimizer.apply_gradients(gvs)
    else:
        self.metaval_total_loss1 = total_loss1 = tf.reduce_sum(lossesa) / tf.to_float(FLAGS.meta_batch_size)
        self.metaval_total_losses2 = total_losses2 = [tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(num_updates)]
        if self.classification:
            self.metaval_total_accuracy1 = total_accuracy1 = tf.reduce_sum(accuraciesa) / tf.to_float(FLAGS.meta_batch_size)
            self.metaval_total_accuracies2 = total_accuracies2 = [tf.reduce_sum(accuraciesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(num_updates)]

    ## Summaries - write the outputs to the log
    tf.summary.scalar(prefix+'Pre-update loss', total_loss1)
    if self.classification:
        tf.summary.scalar(prefix+'Pre-update accuracy', total_accuracy1)
    for j in range(num_updates):
        tf.summary.scalar(prefix+'Post-update loss, step ' + str(j+1), total_losses2[j])
        if self.classification:
            tf.summary.scalar(prefix+'Post-update accuracy, step ' + str(j+1), total_accuracies2[j])
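Stripped of TensorFlow, the update that construct_model encodes boils down to this toy NumPy sketch (one task, a scalar linear model with squared loss; the numbers are made up, and it shows the first-order variant corresponding to FLAGS.stop_grad):

import numpy as np

w = np.array([0.0])        # meta-parameter (plays the role of self.weights)
alpha, beta = 0.01, 0.001  # inner lr (update_lr) and meta lr (meta_lr)

xa, ya = np.array([1.0]), np.array([2.0])  # group a (adaptation) data
xb, yb = np.array([1.5]), np.array([3.0])  # group b (meta) data

# inner step: adapt on group a, producing the fast weights
grad_a = 2 * (w * xa - ya) * xa
fast_w = w - alpha * grad_a

# outer step: evaluate the adapted weights on group b and update the
# meta-parameter; first-order means the gradient is taken at fast_w rather
# than back-propagated through the inner step
grad_b = 2 * (fast_w * xb - yb) * xb
w = w - beta * grad_b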
Now let us turn to the network structure, that is, where the weights self.weights in the function above come from:
def construct_conv_weights(self):
    weights = {}
    dtype = tf.float32
    conv_initializer = tf.contrib.layers.xavier_initializer_conv2d(dtype=dtype)
    fc_initializer = tf.contrib.layers.xavier_initializer(dtype=dtype)
    k = 3  # 3x3 convolution kernels
    # first conv layer, shape [3,3,3,32] by default on miniImagenet,
    # Xavier-initialized
    weights['conv1'] = tf.get_variable('conv1', [k, k, self.channels, self.dim_hidden], initializer=conv_initializer, dtype=dtype)
    # first-layer bias, initialized to zero
    weights['b1'] = tf.Variable(tf.zeros([self.dim_hidden]))
    # second conv layer, shape [3,3,32,32]; the third and fourth layers
    # follow the same pattern
    weights['conv2'] = tf.get_variable('conv2', [k, k, self.dim_hidden, self.dim_hidden], initializer=conv_initializer, dtype=dtype)
    # second-layer bias, initialized to zero
    weights['b2'] = tf.Variable(tf.zeros([self.dim_hidden]))
    weights['conv3'] = tf.get_variable('conv3', [k, k, self.dim_hidden, self.dim_hidden], initializer=conv_initializer, dtype=dtype)
    weights['b3'] = tf.Variable(tf.zeros([self.dim_hidden]))
    weights['conv4'] = tf.get_variable('conv4', [k, k, self.dim_hidden, self.dim_hidden], initializer=conv_initializer, dtype=dtype)
    weights['b4'] = tf.Variable(tf.zeros([self.dim_hidden]))
    if FLAGS.datasource == 'miniimagenet':
        # assumes max pooling; the fifth layer's specific input size exists
        # so the final network output can be a linear map of the flattened
        # features
        weights['w5'] = tf.get_variable('w5', [self.dim_hidden*5*5, self.dim_output], initializer=fc_initializer)
        weights['b5'] = tf.Variable(tf.zeros([self.dim_output]), name='b5')
    else:
        weights['w5'] = tf.Variable(tf.random_normal([self.dim_hidden, self.dim_output]), name='w5')
        weights['b5'] = tf.Variable(tf.zeros([self.dim_output]), name='b5')
    return weights
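A quick check of why w5's input dimension is self.dim_hidden*5*5: each conv_block halves the spatial size, so four stride-2 poolings take an 84x84 input down to 5x5 (assuming kernel-2, stride-2 'VALID' max pooling; with 'SAME' padding the result would be 6x6, which is what the repo comment in the next listing alludes to):

size = 84
for _ in range(4):
    size //= 2  # a VALID max-pool with kernel 2, stride 2 is floor division
print(size)     # 5, so the flattened feature size is 32 * 5 * 5 = 800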
And the forward pass:
def forward_conv(self, inp, weights, reuse=False, scope=''):
    # reuse is for the normalization parameters.
    channels = self.channels
    # reshape the flattened input vectors back into images
    inp = tf.reshape(inp, [-1, self.img_size, self.img_size, channels])
    # feed the images through the first layer, then pass its output through
    # the second layer, and so on
    hidden1 = conv_block(inp, weights['conv1'], weights['b1'], reuse, scope+'0')
    hidden2 = conv_block(hidden1, weights['conv2'], weights['b2'], reuse, scope+'1')
    hidden3 = conv_block(hidden2, weights['conv3'], weights['b3'], reuse, scope+'2')
    hidden4 = conv_block(hidden3, weights['conv4'], weights['b4'], reuse, scope+'3')
    if FLAGS.datasource == 'miniimagenet':
        # last hidden layer is 6x6x64-ish, reshape to a vector
        # (in my own runs the actual shape is 5x5x32)
        # flatten the output of the last feature-extraction layer so that
        # the final layer can be a linear map
        hidden4 = tf.reshape(hidden4, [-1, np.prod([int(dim) for dim in hidden4.get_shape()[1:]])])
    else:
        hidden4 = tf.reduce_mean(hidden4, [1, 2])
    # return the inner product of the fourth layer's output with the fifth
    # layer's weights, plus the bias: the predicted class logits,
    # e.g. (1x800) x (800x2) = (1x2)
    return tf.matmul(hidden4, weights['w5']) + weights['b5']
In practice, the input/output behavior of this network (in the classification case) looks as follows; below is the process of 2-way classifying n 84x84 color images:
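A shape trace sketching that process (dim_hidden=32 assumed, matching the 5x5x32 noted above; each conv_block halves the spatial size):

dim_hidden = 32
shapes = [(84, 84, 3)]  # n input images, 84x84 RGB
for _ in range(4):      # four conv_blocks
    h, w, _ = shapes[-1]
    shapes.append((h // 2, w // 2, dim_hidden))
print(shapes)  # [(84,84,3), (42,42,32), (21,21,32), (10,10,32), (5,5,32)]
# flatten to (n, 800), then (n, 800) @ (800, 2) + (2,) -> (n, 2) class logits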
There is nothing mysterious in the whole process, and the network architecture has nothing particularly exotic about it either. But MAML's distinctive per-task few-shot learning gives the model good generalization on both classification and regression problems, which translates into a clear advantage on certain tasks.