Goal: raise the accuracy of a VGGNet-style network trained on CIFAR-10 from about 70% to 85%.
The current VGGNet implementation:
```python
import tensorflow as tf
import os
import pickle
import numpy as np

CIFAR_DIR = "dataset/cifar-10-batches-py"
print(os.listdir(CIFAR_DIR))


def load_data(filename):
    """read data from data file."""
    with open(filename, 'rb') as f:
        data = pickle.load(f, encoding='bytes')
        return data[b'data'], data[b'labels']


# tensorflow.Dataset.
class CifarData:
    def __init__(self, filenames, need_shuffle):
        all_data = []
        all_labels = []
        for filename in filenames:
            data, labels = load_data(filename)
            all_data.append(data)
            all_labels.append(labels)
        self._data = np.vstack(all_data)
        self._data = self._data / 127.5 - 1  # scale pixels to [-1, 1]
        self._labels = np.hstack(all_labels)
        print(self._data.shape)
        print(self._labels.shape)
        self._num_examples = self._data.shape[0]
        self._need_shuffle = need_shuffle
        self._indicator = 0
        if self._need_shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        # [0,1,2,3,4,5] -> [5,3,2,4,0,1]
        p = np.random.permutation(self._num_examples)
        self._data = self._data[p]
        self._labels = self._labels[p]

    def next_batch(self, batch_size):
        """return batch_size examples as a batch."""
        end_indicator = self._indicator + batch_size
        if end_indicator > self._num_examples:
            if self._need_shuffle:
                self._shuffle_data()
                self._indicator = 0
                end_indicator = batch_size
            else:
                raise Exception("have no more examples")
        if end_indicator > self._num_examples:
            raise Exception("batch size is larger than all examples")
        batch_data = self._data[self._indicator: end_indicator]
        batch_labels = self._labels[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_data, batch_labels


train_filenames = [os.path.join(CIFAR_DIR, 'data_batch_%d' % i) for i in range(1, 6)]
test_filenames = [os.path.join(CIFAR_DIR, 'test_batch')]

train_data = CifarData(train_filenames, True)
test_data = CifarData(test_filenames, False)

x = tf.placeholder(tf.float32, [None, 3072])
y = tf.placeholder(tf.int64, [None])  # [None], eg: [0,5,6,3]
# 32*32
x_image = tf.reshape(x, [-1, 3, 32, 32])
x_image = tf.transpose(x_image, perm=[0, 2, 3, 1])

# conv1: feature maps / output images
conv1_1 = tf.layers.conv2d(x_image, 32, (3, 3),  # output channels, kernel size
                           padding='same', activation=tf.nn.relu, name='conv1_1')
conv1_2 = tf.layers.conv2d(conv1_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv1_2')
# 16 * 16
pooling1 = tf.layers.max_pooling2d(conv1_2, (2, 2), (2, 2),  # kernel size, stride
                                   name='pool1')

conv2_1 = tf.layers.conv2d(pooling1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_1')
conv2_2 = tf.layers.conv2d(conv2_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_2')
# 8 * 8
pooling2 = tf.layers.max_pooling2d(conv2_2, (2, 2), (2, 2), name='pool2')

conv3_1 = tf.layers.conv2d(pooling2, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_1')
conv3_2 = tf.layers.conv2d(conv3_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_2')
# 4 * 4 * 32
pooling3 = tf.layers.max_pooling2d(conv3_2, (2, 2), (2, 2), name='pool3')

# [None, 4 * 4 * 32]
flatten = tf.layers.flatten(pooling3)
y_ = tf.layers.dense(flatten, 10)

loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)
# y_ -> softmax
# y -> one_hot
# loss = ylogy_

# indices
predict = tf.argmax(y_, 1)
# [1,0,1,1,1,0,0,0]
correct_prediction = tf.equal(predict, y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))

with tf.name_scope('train_op'):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

init = tf.global_variables_initializer()
batch_size = 20
train_steps = 10000
test_steps = 100

# train 10k: 73.4%
with tf.Session() as sess:
    sess.run(init)
    for i in range(train_steps):
        batch_data, batch_labels = train_data.next_batch(batch_size)
        loss_val, acc_val, _ = sess.run(
            [loss, accuracy, train_op],
            feed_dict={x: batch_data, y: batch_labels})
        if (i + 1) % 100 == 0:
            print('[Train] Step: %d, loss: %4.5f, acc: %4.5f'
                  % (i + 1, loss_val, acc_val))
        if (i + 1) % 1000 == 0:
            test_data = CifarData(test_filenames, False)
            all_test_acc_val = []
            for j in range(test_steps):
                test_batch_data, test_batch_labels = test_data.next_batch(batch_size)
                test_acc_val = sess.run(
                    [accuracy],
                    feed_dict={x: test_batch_data, y: test_batch_labels})
                all_test_acc_val.append(test_acc_val)
            test_acc = np.mean(all_test_acc_val)
            print('[Test ] Step: %d, acc: %4.5f' % (i + 1, test_acc))
```
After 10k training steps, accuracy is around 70%.
1. Train for more steps
With 100k training steps, accuracy reaches about 78%.
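As a rough sketch, the only change needed in the script above is the step count (the value below simply mirrors the number quoted here):

```python
# Train for 100k steps instead of 10k; everything else stays the same.
train_steps = 100000
```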
2. Visualize training with TensorBoard
https://blog.csdn.net/hxxjxw/article/details/109140431
The code after adding the summaries:
```python
import tensorflow as tf
import os
import pickle
import numpy as np

CIFAR_DIR = "dataset/cifar-10-batches-py"
print(os.listdir(CIFAR_DIR))


def load_data(filename):
    """read data from data file."""
    with open(filename, 'rb') as f:
        data = pickle.load(f, encoding='bytes')
        return data[b'data'], data[b'labels']


# tensorflow.Dataset.
class CifarData:
    def __init__(self, filenames, need_shuffle):
        all_data = []
        all_labels = []
        for filename in filenames:
            data, labels = load_data(filename)
            all_data.append(data)
            all_labels.append(labels)
        self._data = np.vstack(all_data)
        self._data = self._data / 127.5 - 1  # scale pixels to [-1, 1]
        self._labels = np.hstack(all_labels)
        print(self._data.shape)
        print(self._labels.shape)
        self._num_examples = self._data.shape[0]
        self._need_shuffle = need_shuffle
        self._indicator = 0
        if self._need_shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        # [0,1,2,3,4,5] -> [5,3,2,4,0,1]
        p = np.random.permutation(self._num_examples)
        self._data = self._data[p]
        self._labels = self._labels[p]

    def next_batch(self, batch_size):
        """return batch_size examples as a batch."""
        end_indicator = self._indicator + batch_size
        if end_indicator > self._num_examples:
            if self._need_shuffle:
                self._shuffle_data()
                self._indicator = 0
                end_indicator = batch_size
            else:
                raise Exception("have no more examples")
        if end_indicator > self._num_examples:
            raise Exception("batch size is larger than all examples")
        batch_data = self._data[self._indicator: end_indicator]
        batch_labels = self._labels[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_data, batch_labels


train_filenames = [os.path.join(CIFAR_DIR, 'data_batch_%d' % i) for i in range(1, 6)]
test_filenames = [os.path.join(CIFAR_DIR, 'test_batch')]

train_data = CifarData(train_filenames, True)
test_data = CifarData(test_filenames, False)

x = tf.placeholder(tf.float32, [None, 3072])
y = tf.placeholder(tf.int64, [None])  # [None], eg: [0,5,6,3]
# 32*32
x_image = tf.reshape(x, [-1, 3, 32, 32])
x_image = tf.transpose(x_image, perm=[0, 2, 3, 1])

# conv1: feature maps / output images
conv1_1 = tf.layers.conv2d(x_image, 32, (3, 3),  # output channels, kernel size
                           padding='same', activation=tf.nn.relu, name='conv1_1')
conv1_2 = tf.layers.conv2d(conv1_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv1_2')
# 16 * 16
pooling1 = tf.layers.max_pooling2d(conv1_2, (2, 2), (2, 2),  # kernel size, stride
                                   name='pool1')

conv2_1 = tf.layers.conv2d(pooling1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_1')
conv2_2 = tf.layers.conv2d(conv2_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_2')
# 8 * 8
pooling2 = tf.layers.max_pooling2d(conv2_2, (2, 2), (2, 2), name='pool2')

conv3_1 = tf.layers.conv2d(pooling2, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_1')
conv3_2 = tf.layers.conv2d(conv3_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_2')
# 4 * 4 * 32
pooling3 = tf.layers.max_pooling2d(conv3_2, (2, 2), (2, 2), name='pool3')

# [None, 4 * 4 * 32]
flatten = tf.layers.flatten(pooling3)
y_ = tf.layers.dense(flatten, 10)

loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)
# y_ -> softmax
# y -> one_hot
# loss = ylogy_

# indices
predict = tf.argmax(y_, 1)
# [1,0,1,1,1,0,0,0]
correct_prediction = tf.equal(predict, y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))

with tf.name_scope('train_op'):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)


# `name` sets a namespace to avoid naming conflicts
def variable_summary(var, name):
    with tf.name_scope(name):
        mean = tf.reduce_mean(var)  # mean
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('mean', mean)
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.histogram('histogram', var)  # histogram


with tf.name_scope('summary'):
    variable_summary(conv1_1, 'conv1_1')
    variable_summary(conv1_2, 'conv1_2')
    variable_summary(conv2_1, 'conv2_1')
    variable_summary(conv2_2, 'conv2_2')
    variable_summary(conv3_1, 'conv3_1')
    variable_summary(conv3_2, 'conv3_2')

# merge_all below will collect all of the summaries defined above
loss_summary = tf.summary.scalar('loss', loss)
accuracy_summary = tf.summary.scalar('accuracy', accuracy)

# x_image has been normalized to (-1, 1), but tf.summary.image expects
# pixel values in 0-255, so de-normalize it first
source_image = (x_image + 1) * 127.5
inputs_summary = tf.summary.image('inputs_summary', source_image)

merged_summary = tf.summary.merge_all()
merged_summary_test = tf.summary.merge([loss_summary, accuracy_summary])

LOG_DIR = '.'
run_label = 'run_vgg_tensorboard'
run_dir = os.path.join(LOG_DIR, run_label)
if not os.path.exists(run_dir):
    os.mkdir(run_dir)
train_log_dir = os.path.join(run_dir, 'train')
test_log_dir = os.path.join(run_dir, 'test')
if not os.path.exists(train_log_dir):
    os.mkdir(train_log_dir)

init = tf.global_variables_initializer()
batch_size = 20
train_steps = 10000
test_steps = 100
output_summary_every_steps = 100

# train 10k: 73.4%
with tf.Session() as sess:
    sess.run(init)
    # write train and test summaries separately, so create two writers
    train_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
    test_writer = tf.summary.FileWriter(test_log_dir)

    fixed_test_batch_data, fixed_test_batch_labels = test_data.next_batch(batch_size)

    for i in range(train_steps):
        batch_data, batch_labels = train_data.next_batch(batch_size)
        eval_ops = [loss, accuracy, train_op]
        should_output_summary = ((i + 1) % output_summary_every_steps == 0)
        if should_output_summary:
            eval_ops.append(merged_summary)
        eval_ops_results = sess.run(
            eval_ops, feed_dict={x: batch_data, y: batch_labels})
        loss_val, acc_val = eval_ops_results[0:2]
        if should_output_summary:
            train_summary_str = eval_ops_results[-1]
            train_writer.add_summary(train_summary_str, i + 1)
            test_summary_str = sess.run(
                [merged_summary_test],
                feed_dict={x: fixed_test_batch_data,
                           y: fixed_test_batch_labels})[0]
            test_writer.add_summary(test_summary_str, i + 1)
        if (i + 1) % 100 == 0:
            print('[Train] Step: %d, loss: %4.5f, acc: %4.5f'
                  % (i + 1, loss_val, acc_val))
        if (i + 1) % 1000 == 0:
            test_data = CifarData(test_filenames, False)
            all_test_acc_val = []
            for j in range(test_steps):
                test_batch_data, test_batch_labels = test_data.next_batch(batch_size)
                test_acc_val = sess.run(
                    [accuracy],
                    feed_dict={x: test_batch_data, y: test_batch_labels})
                all_test_acc_val.append(test_acc_val)
            test_acc = np.mean(all_test_acc_val)
            print('[Test ] Step: %d, acc: %4.5f' % (i + 1, test_acc))
```
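Once the event files are being written, the curves and image summaries can be viewed by pointing TensorBoard at the run directory, e.g. `tensorboard --logdir=run_vgg_tensorboard`, and opening the URL it prints (typically http://localhost:6006) in a browser.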
3. Fine-tuning in practice
Instead of initializing the parameters randomly, initialize them from a previously trained model.
Steps:
① Save the model
② Restore the model from a checkpoint (i.e., resume from a saved snapshot)
③ Keep some layers fixed (freeze the specified layers)
Fine-tuning keeps the parameters of the lower layers unchanged and only updates the parameters of the upper layers; a minimal sketch of the three steps is shown below, followed by the full script.
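The sketch reuses the names from the scripts in this post (`model_dir`, `sess`, and the step counter `i` are assumed to exist); it only outlines the pattern that the full code below implements:

```python
# ① Save: a Saver writes a snapshot of all variables to disk.
saver = tf.train.Saver()
# inside the training loop:
#     saver.save(sess, os.path.join(model_dir, 'ckp-%05d' % (i + 1)))

# ② Restore: load a previously saved snapshot into the current session.
#     saver.restore(sess, os.path.join(model_dir, 'ckp-06000'))

# ③ Freeze: a layer built with trainable=False contributes no variables to the
#    optimizer, so its weights keep whatever values were restored.
#     conv1_1 = tf.layers.conv2d(x_image, 32, (3, 3), padding='same',
#                                activation=tf.nn.relu, trainable=False,
#                                name='conv1_1')
```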
```python
import tensorflow as tf
import os
import pickle
import numpy as np

CIFAR_DIR = "dataset/cifar-10-batches-py"
print(os.listdir(CIFAR_DIR))


def load_data(filename):
    """read data from data file."""
    with open(filename, 'rb') as f:
        data = pickle.load(f, encoding='bytes')
        return data[b'data'], data[b'labels']


# tensorflow.Dataset.
class CifarData:
    def __init__(self, filenames, need_shuffle):
        all_data = []
        all_labels = []
        for filename in filenames:
            data, labels = load_data(filename)
            all_data.append(data)
            all_labels.append(labels)
        self._data = np.vstack(all_data)
        self._data = self._data / 127.5 - 1  # scale pixels to [-1, 1]
        self._labels = np.hstack(all_labels)
        print(self._data.shape)
        print(self._labels.shape)
        self._num_examples = self._data.shape[0]
        self._need_shuffle = need_shuffle
        self._indicator = 0
        if self._need_shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        # [0,1,2,3,4,5] -> [5,3,2,4,0,1]
        p = np.random.permutation(self._num_examples)
        self._data = self._data[p]
        self._labels = self._labels[p]

    def next_batch(self, batch_size):
        """return batch_size examples as a batch."""
        end_indicator = self._indicator + batch_size
        if end_indicator > self._num_examples:
            if self._need_shuffle:
                self._shuffle_data()
                self._indicator = 0
                end_indicator = batch_size
            else:
                raise Exception("have no more examples")
        if end_indicator > self._num_examples:
            raise Exception("batch size is larger than all examples")
        batch_data = self._data[self._indicator: end_indicator]
        batch_labels = self._labels[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_data, batch_labels


train_filenames = [os.path.join(CIFAR_DIR, 'data_batch_%d' % i) for i in range(1, 6)]
test_filenames = [os.path.join(CIFAR_DIR, 'test_batch')]

train_data = CifarData(train_filenames, True)
test_data = CifarData(test_filenames, False)

x = tf.placeholder(tf.float32, [None, 3072])
y = tf.placeholder(tf.int64, [None])  # [None], eg: [0,5,6,3]
# 32*32
x_image = tf.reshape(x, [-1, 3, 32, 32])
x_image = tf.transpose(x_image, perm=[0, 2, 3, 1])

# conv1: feature maps / output images
conv1_1 = tf.layers.conv2d(x_image, 32, (3, 3),  # output channels, kernel size
                           padding='same', activation=tf.nn.relu, name='conv1_1')
conv1_2 = tf.layers.conv2d(conv1_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv1_2')
# 16 * 16
pooling1 = tf.layers.max_pooling2d(conv1_2, (2, 2), (2, 2),  # kernel size, stride
                                   name='pool1')

conv2_1 = tf.layers.conv2d(pooling1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_1')
conv2_2 = tf.layers.conv2d(conv2_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv2_2')
# 8 * 8
pooling2 = tf.layers.max_pooling2d(conv2_2, (2, 2), (2, 2), name='pool2')

conv3_1 = tf.layers.conv2d(pooling2, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_1')
conv3_2 = tf.layers.conv2d(conv3_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_2')
# 4 * 4 * 32
pooling3 = tf.layers.max_pooling2d(conv3_2, (2, 2), (2, 2), name='pool3')

# [None, 4 * 4 * 32]
flatten = tf.layers.flatten(pooling3)
y_ = tf.layers.dense(flatten, 10)

loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)
# y_ -> softmax
# y -> one_hot
# loss = ylogy_

# indices
predict = tf.argmax(y_, 1)
# [1,0,1,1,1,0,0,0]
correct_prediction = tf.equal(predict, y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))

with tf.name_scope('train_op'):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)


# `name` sets a namespace to avoid naming conflicts
def variable_summary(var, name):
    with tf.name_scope(name):
        mean = tf.reduce_mean(var)  # mean
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('mean', mean)
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.histogram('histogram', var)  # histogram


with tf.name_scope('summary'):
    variable_summary(conv1_1, 'conv1_1')
    variable_summary(conv1_2, 'conv1_2')
    variable_summary(conv2_1, 'conv2_1')
    variable_summary(conv2_2, 'conv2_2')
    variable_summary(conv3_1, 'conv3_1')
    variable_summary(conv3_2, 'conv3_2')

# merge_all below will collect all of the summaries defined above
loss_summary = tf.summary.scalar('loss', loss)
accuracy_summary = tf.summary.scalar('accuracy', accuracy)

# x_image has been normalized to (-1, 1), but tf.summary.image expects
# pixel values in 0-255, so de-normalize it first
source_image = (x_image + 1) * 127.5
inputs_summary = tf.summary.image('inputs_summary', source_image)

merged_summary = tf.summary.merge_all()
merged_summary_test = tf.summary.merge([loss_summary, accuracy_summary])

LOG_DIR = '.'
run_label = 'run_vgg_tensorboard'
run_dir = os.path.join(LOG_DIR, run_label)
if not os.path.exists(run_dir):
    os.mkdir(run_dir)
train_log_dir = os.path.join(run_dir, 'train')
test_log_dir = os.path.join(run_dir, 'test')
if not os.path.exists(train_log_dir):
    os.mkdir(train_log_dir)

# save the model to files
model_dir = os.path.join(run_dir, 'model')
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

# saver is a handle that saves a snapshot of the training process
# (all parameters and state) to files
saver = tf.train.Saver()

init = tf.global_variables_initializer()
batch_size = 20
train_steps = 10000
test_steps = 100
output_summary_every_steps = 100
output_model_every_steps = 100

# train 10k: 73.4%
with tf.Session() as sess:
    sess.run(init)
    # write train and test summaries separately, so create two writers
    train_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
    test_writer = tf.summary.FileWriter(test_log_dir)

    fixed_test_batch_data, fixed_test_batch_labels = test_data.next_batch(batch_size)

    for i in range(train_steps):
        batch_data, batch_labels = train_data.next_batch(batch_size)
        eval_ops = [loss, accuracy, train_op]
        should_output_summary = ((i + 1) % output_summary_every_steps == 0)
        if should_output_summary:
            eval_ops.append(merged_summary)
        eval_ops_results = sess.run(
            eval_ops, feed_dict={x: batch_data, y: batch_labels})
        loss_val, acc_val = eval_ops_results[0:2]
        if should_output_summary:
            train_summary_str = eval_ops_results[-1]
            train_writer.add_summary(train_summary_str, i + 1)
            test_summary_str = sess.run(
                [merged_summary_test],
                feed_dict={x: fixed_test_batch_data,
                           y: fixed_test_batch_labels})[0]
            test_writer.add_summary(test_summary_str, i + 1)
        if (i + 1) % 100 == 0:
            print('[Train] Step: %d, loss: %4.5f, acc: %4.5f'
                  % (i + 1, loss_val, acc_val))
        if (i + 1) % 1000 == 0:
            test_data = CifarData(test_filenames, False)
            all_test_acc_val = []
            for j in range(test_steps):
                test_batch_data, test_batch_labels = test_data.next_batch(batch_size)
                test_acc_val = sess.run(
                    [accuracy],
                    feed_dict={x: test_batch_data, y: test_batch_labels})
                all_test_acc_val.append(test_acc_val)
            test_acc = np.mean(all_test_acc_val)
            print('[Test ] Step: %d, acc: %4.5f' % (i + 1, test_acc))
        if (i + 1) % output_model_every_steps == 0:
            saver.save(sess, os.path.join(model_dir, 'ckp-%05d' % (i + 1)))
            print('model saved to ckp-%05d' % (i + 1))
```
"ckp" is short for checkpoint.
Here training is stopped at step 6000, where accuracy is roughly 65%-70%.
By default, TensorFlow keeps only the 5 most recent checkpoints and deletes the earlier ones.
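This retention behavior comes from the Saver's `max_to_keep` argument, whose default is 5. A small sketch of overriding it (the value 10 is only illustrative; the scripts in this post use the default `tf.train.Saver()`):

```python
# Keep the 10 most recent checkpoints instead of the default 5.
saver = tf.train.Saver(max_to_keep=10)
```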
- The .data file stores the parameter values
- The .index file stores the index information
- The .meta file stores the graph's meta information
Next, how to restore the model.
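A minimal restore sketch, assuming the `saver`, `model_dir`, and `sess` defined in the scripts of this post; `tf.train.latest_checkpoint` is an optional alternative to hard-coding a checkpoint name (the full script further below does the same thing inline):

```python
model_path = os.path.join(model_dir, 'ckp-06000')      # restore a specific step
# model_path = tf.train.latest_checkpoint(model_dir)   # or: the newest checkpoint
if model_path and os.path.exists(model_path + '.index'):
    saver.restore(sess, model_path)
    print('model restored from %s' % model_path)
else:
    print('model %s does not exist' % model_path)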
After running it, you can see that the accuracy is high right from the start; this is because the model starts from the weights trained earlier.
Step ③: keep some layers fixed (freeze the specified layers)
This is implemented in the model's computation graph.
`trainable` defaults to True; if it is set to False, the parameters of that layer are excluded from training.
The PyTorch equivalent is setting `requires_grad=False`, as in the sketch below.
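For comparison only, a hedged PyTorch sketch (not part of this TensorFlow project; the tiny stand-in model is hypothetical and just illustrates the freezing pattern):

```python
import torch
import torch.nn as nn

# A tiny stand-in model (hypothetical), just to show the freezing pattern.
model = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
    nn.Flatten(), nn.Linear(32 * 32 * 32, 10))

# Freeze the first two conv layers: requires_grad=False excludes them from training.
for param in list(model[0].parameters()) + list(model[2].parameters()):
    param.requires_grad = False

# Pass only the still-trainable parameters to the optimizer.
optimizer = torch.optim.Adam(
    [p for p in model.parameters() if p.requires_grad], lr=1e-3)
```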
Here we first set the first two conv blocks (conv1_* and conv2_*) to `trainable=False`.
The script still runs normally with these layers frozen.
```python
import tensorflow as tf
import os
import pickle
import numpy as np

CIFAR_DIR = "dataset/cifar-10-batches-py"
print(os.listdir(CIFAR_DIR))


def load_data(filename):
    """read data from data file."""
    with open(filename, 'rb') as f:
        data = pickle.load(f, encoding='bytes')
        return data[b'data'], data[b'labels']


# tensorflow.Dataset.
class CifarData:
    def __init__(self, filenames, need_shuffle):
        all_data = []
        all_labels = []
        for filename in filenames:
            data, labels = load_data(filename)
            all_data.append(data)
            all_labels.append(labels)
        self._data = np.vstack(all_data)
        self._data = self._data / 127.5 - 1  # scale pixels to [-1, 1]
        self._labels = np.hstack(all_labels)
        print(self._data.shape)
        print(self._labels.shape)
        self._num_examples = self._data.shape[0]
        self._need_shuffle = need_shuffle
        self._indicator = 0
        if self._need_shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        # [0,1,2,3,4,5] -> [5,3,2,4,0,1]
        p = np.random.permutation(self._num_examples)
        self._data = self._data[p]
        self._labels = self._labels[p]

    def next_batch(self, batch_size):
        """return batch_size examples as a batch."""
        end_indicator = self._indicator + batch_size
        if end_indicator > self._num_examples:
            if self._need_shuffle:
                self._shuffle_data()
                self._indicator = 0
                end_indicator = batch_size
            else:
                raise Exception("have no more examples")
        if end_indicator > self._num_examples:
            raise Exception("batch size is larger than all examples")
        batch_data = self._data[self._indicator: end_indicator]
        batch_labels = self._labels[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_data, batch_labels


train_filenames = [os.path.join(CIFAR_DIR, 'data_batch_%d' % i) for i in range(1, 6)]
test_filenames = [os.path.join(CIFAR_DIR, 'test_batch')]

train_data = CifarData(train_filenames, True)
test_data = CifarData(test_filenames, False)

x = tf.placeholder(tf.float32, [None, 3072])
y = tf.placeholder(tf.int64, [None])  # [None], eg: [0,5,6,3]
# 32*32
x_image = tf.reshape(x, [-1, 3, 32, 32])
x_image = tf.transpose(x_image, perm=[0, 2, 3, 1])

# conv1: feature maps / output images; trainable=False freezes these layers
conv1_1 = tf.layers.conv2d(x_image, 32, (3, 3),  # output channels, kernel size
                           padding='same', activation=tf.nn.relu,
                           trainable=False, name='conv1_1')
conv1_2 = tf.layers.conv2d(conv1_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu,
                           trainable=False, name='conv1_2')
# 16 * 16
pooling1 = tf.layers.max_pooling2d(conv1_2, (2, 2), (2, 2),  # kernel size, stride
                                   name='pool1')

conv2_1 = tf.layers.conv2d(pooling1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu,
                           trainable=False, name='conv2_1')
conv2_2 = tf.layers.conv2d(conv2_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu,
                           trainable=False, name='conv2_2')
# 8 * 8
pooling2 = tf.layers.max_pooling2d(conv2_2, (2, 2), (2, 2), name='pool2')

conv3_1 = tf.layers.conv2d(pooling2, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_1')
conv3_2 = tf.layers.conv2d(conv3_1, 32, (3, 3),
                           padding='same', activation=tf.nn.relu, name='conv3_2')
# 4 * 4 * 32
pooling3 = tf.layers.max_pooling2d(conv3_2, (2, 2), (2, 2), name='pool3')

# [None, 4 * 4 * 32]
flatten = tf.layers.flatten(pooling3)
y_ = tf.layers.dense(flatten, 10)

loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)
# y_ -> softmax
# y -> one_hot
# loss = ylogy_

# indices
predict = tf.argmax(y_, 1)
# [1,0,1,1,1,0,0,0]
correct_prediction = tf.equal(predict, y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))

with tf.name_scope('train_op'):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)


# `name` sets a namespace to avoid naming conflicts
def variable_summary(var, name):
    with tf.name_scope(name):
        mean = tf.reduce_mean(var)  # mean
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('mean', mean)
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.histogram('histogram', var)  # histogram


with tf.name_scope('summary'):
    variable_summary(conv1_1, 'conv1_1')
    variable_summary(conv1_2, 'conv1_2')
    variable_summary(conv2_1, 'conv2_1')
    variable_summary(conv2_2, 'conv2_2')
    variable_summary(conv3_1, 'conv3_1')
    variable_summary(conv3_2, 'conv3_2')

# merge_all below will collect all of the summaries defined above
loss_summary = tf.summary.scalar('loss', loss)
accuracy_summary = tf.summary.scalar('accuracy', accuracy)

# x_image has been normalized to (-1, 1), but tf.summary.image expects
# pixel values in 0-255, so de-normalize it first
source_image = (x_image + 1) * 127.5
inputs_summary = tf.summary.image('inputs_summary', source_image)

merged_summary = tf.summary.merge_all()
merged_summary_test = tf.summary.merge([loss_summary, accuracy_summary])

LOG_DIR = '.'
run_label = 'run_vgg_tensorboard'
run_dir = os.path.join(LOG_DIR, run_label)
if not os.path.exists(run_dir):
    os.mkdir(run_dir)
train_log_dir = os.path.join(run_dir, 'train')
test_log_dir = os.path.join(run_dir, 'test')
if not os.path.exists(train_log_dir):
    os.mkdir(train_log_dir)

# save the model to files
model_dir = os.path.join(run_dir, 'model')
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

# saver is a handle that saves a snapshot of the training process
# (all parameters and state) to files
saver = tf.train.Saver()
# name of the checkpoint to restore; here we restore the one from step 6000
model_name = 'ckp-06000'
model_path = os.path.join(model_dir, model_name)

init = tf.global_variables_initializer()
batch_size = 20
train_steps = 10000
test_steps = 100
output_summary_every_steps = 100
output_model_every_steps = 100

# train 10k: 73.4%
with tf.Session() as sess:
    sess.run(init)
    # write train and test summaries separately, so create two writers
    train_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
    test_writer = tf.summary.FileWriter(test_log_dir)

    fixed_test_batch_data, fixed_test_batch_labels = test_data.next_batch(batch_size)

    # check whether the checkpoint exists
    if os.path.exists(model_path + '.index'):
        saver.restore(sess, model_path)
        print('model restored from %s' % model_path)
    else:
        print('model %s does not exist' % model_path)

    for i in range(train_steps):
        batch_data, batch_labels = train_data.next_batch(batch_size)
        eval_ops = [loss, accuracy, train_op]
        should_output_summary = ((i + 1) % output_summary_every_steps == 0)
        if should_output_summary:
            eval_ops.append(merged_summary)
        eval_ops_results = sess.run(
            eval_ops, feed_dict={x: batch_data, y: batch_labels})
        loss_val, acc_val = eval_ops_results[0:2]
        if should_output_summary:
            train_summary_str = eval_ops_results[-1]
            train_writer.add_summary(train_summary_str, i + 1)
            test_summary_str = sess.run(
                [merged_summary_test],
                feed_dict={x: fixed_test_batch_data,
                           y: fixed_test_batch_labels})[0]
            test_writer.add_summary(test_summary_str, i + 1)
        if (i + 1) % 100 == 0:
            print('[Train] Step: %d, loss: %4.5f, acc: %4.5f'
                  % (i + 1, loss_val, acc_val))
        if (i + 1) % 1000 == 0:
            test_data = CifarData(test_filenames, False)
            all_test_acc_val = []
            for j in range(test_steps):
                test_batch_data, test_batch_labels = test_data.next_batch(batch_size)
                test_acc_val = sess.run(
                    [accuracy],
                    feed_dict={x: test_batch_data, y: test_batch_labels})
                all_test_acc_val.append(test_acc_val)
            test_acc = np.mean(all_test_acc_val)
            print('[Test ] Step: %d, acc: %4.5f' % (i + 1, test_acc))
        if (i + 1) % output_model_every_steps == 0:
            saver.save(sess, os.path.join(model_dir, 'ckp-%05d' % (i + 1)))
            print('model saved to ckp-%05d' % (i + 1))
```
To summarize:
Fine-tuning can be split into two cases depending on where the checkpoint comes from; in other words, it serves two purposes.
① The model and checkpoint come from someone else.
We build a network structure that matches theirs, initialize it with their parameters, then keep some of the values fixed and tune the rest.
This is fine-tuning a network that someone else has already trained; a sketch of restoring only part of a graph follows below.
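A hedged sketch of this case, assuming the layer names used in this post and a hypothetical path to the external checkpoint; `tf.train.Saver` accepts a `var_list` that restricts which variables it restores:

```python
# Collect only the variables of the lower conv blocks we want to reuse.
reuse_vars = []
for scope in ['conv1_1', 'conv1_2', 'conv2_1', 'conv2_2']:
    reuse_vars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

restore_saver = tf.train.Saver(var_list=reuse_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initialize everything first
    # Hypothetical path to someone else's checkpoint:
    restore_saver.restore(sess, 'their_model/ckp-xxxxx')
    # ...then train as usual; layers built with trainable=False keep these values.
```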
② The model is my own, but training stopped before it finished, so I resume from the checkpoint.
In some other implementations, fine-tuning may look much simpler because those frameworks add a layer of wrapping. Whatever the outer API looks like, the underlying implementation still boils down to the steps described here.