在三种框架的使用上,可能最难上手的就是TensorFlow,毕竟在没有Eager Execution时,静态图的特性常常让人对网络调试摸不着头脑。Keras相对来说最容易上手,固定版式的代码,封装性极高,但想要扩展对新手来说就有些难了。PyTorch在上手难易程度、扩展性方面都很棒,特别是 torch
往往我们的数据集是直接以 `jpg` 或者 `png` 图片的形式存储在磁盘上,可能几万、几十万张不等,标签信息可能也是图片,或者是存储在 `txt` 文档中的数据等等。当然,如果将这些零散的数据整合成类似于 `npz` 或者 `TFRecord`
TensorFlow在Dataset API之前,大多都是使用 `QueueRunner` 去搞定这件事。有兴趣可以去研究,这里随便贴一篇文章。老实说,这样的API有些难用,编码复杂性高,容易出错,至少我在平时的编码中确实会遇到数据读取队列出错的问题。相反,PyTorch的数据读取方式就显得非常简单,有面向对象编程的那种感觉。TensorFlow在1.3版本之后引入了全新的读取数据API,也就是Dataset API。总的来说,它更加简洁明了,编码难度降低了很多。同样,这里推荐一篇文章:TensorFlow全新的数据读取方式:Dataset API教程。可能需要着重关注磁盘大数据量的读取和对数据的处理。
贴一个我自己写的代码,用于读取磁盘上30万张 `jpg` 图片和对应的 `txt`
class XxxDataloader:
    """Build a ``tf.data`` input pipeline (TF 1.x graph mode) from image names.

    Constructing the object wires up the whole pipeline and exposes the
    tensors of the next batch as attributes:

    * ``img_raw_batch`` -- decoded and resized images,
    * ``img_aug_batch`` -- the same images after optional augmentation,
    * ``gt_batch`` -- the matching ground-truth labels.
    """

    def __init__(self, config):
        """Create the dataset, its iterator and the batch tensors.

        :param config: configuration object; this class reads ``mode``,
            ``img_path``, ``image_names_path``, ``gt_file``, ``shuffle``,
            ``buffersize``, ``batch_size``, ``train_epoch`` and ``aug_ratio``
        """
        self.config = config
        self.mode = config.mode
        # Data locations.
        self.img_path = config.img_path
        self.image_names_path = config.image_names_path
        self.gt_file = config.gt_file
        # Image tensors (filled in below).
        self.img_raw_batch = None
        self.img_aug_batch = None
        # Label tensor (ground truth).
        self.gt_batch = None
        # NOTE(review): _parse_function reads self.img_h / self.img_w, which are
        # not assigned anywhere in the class as shown -- confirm where the
        # target image size is supposed to come from.

        # Read the names of all training images and their labels.
        image_names, gt = self._read_img_and_gt(self.image_names_path, self.gt_file)
        # One dataset element is (image_name, gt); this must match the
        # two-argument signature of _parse_function.
        dataset = tf.data.Dataset.from_tensor_slices((image_names, gt))
        # Load each image by name and preprocess it.
        dataset = dataset.map(self._parse_function)
        # One dataset element is now (image, image_aug, gt).
        if config.shuffle:
            dataset = dataset.shuffle(config.buffersize)
        dataset = dataset.batch(config.batch_size).repeat(config.train_epoch)
        # Iterator that walks the dataset exactly once from start to end.
        iterator = dataset.make_one_shot_iterator()
        # Tensors holding the next batch drawn from the iterator.
        self.img_raw_batch, self.img_aug_batch, self.gt_batch = iterator.get_next()

    def _parse_function(self, image_name, gt):
        """Turn one ``(image_name, gt)`` element into ``(image, image_aug, gt)``.

        :param image_name: image file name, joined onto ``self.img_path``
        :param gt: ground-truth label, passed through unchanged
        :return: the loaded image, its (possibly) augmented copy, and ``gt``
        """
        # Build the full image path from the base directory and the name.
        image_path = tf.string_join([self.img_path, image_name])
        # Read the image with three (RGB) channels.
        image = self._read_image(image_path, [self.img_h, self.img_w], channels=3)
        # Augment only when a uniform random draw falls below aug_ratio.
        random_aug = tf.random_uniform([], 0, 1)
        image_aug = tf.cond(
            random_aug < self.config.aug_ratio,
            lambda: self._augment_image(image),
            lambda: image,
        )
        # Normalisation and other per-image steps would go here.
        return image, image_aug, gt

    def _read_img_and_gt(self, filenames_file, gt_file):
        """Read the image names, start coordinates and ground truth.

        :param filenames_file: file that stores the image names
        :param gt_file: label file
        :return: the image names and the labels
        """
        # NOTE(review): the loading code that builds img_array / gt_array is
        # not part of this excerpt; only the return statement is shown.
        return img_array, gt_array

    def _read_image(self, image_path, out_size, channels=3):
        """Decode a JPEG file, cast it to float32 and resize it.

        :param image_path: path of the image file
        :param out_size: output size passed to ``tf.image.resize_images``
        :param channels: number of colour channels to decode
        :return: the resized float32 image tensor
        """
        image = tf.image.decode_jpeg(tf.read_file(image_path), channels=channels)
        image = tf.cast(image, tf.float32)
        image = tf.image.resize_images(image, out_size, tf.image.ResizeMethod.AREA)
        return image

    def _augment_image(self, image, min_val=0, max_val=255):
        """Return an augmented version of ``image``.

        :param image: image tensor to augment
        :param min_val: not used by the code shown here
        :param max_val: not used by the code shown here
        :return: the augmented image
        """
        # NOTE(review): ``aug`` is defined outside this excerpt -- confirm what
        # augmentation it performs.
        image_aug = aug(image)
        return image_aug
其实搭建模型这一块,各个框架都有非常方便的高层API,也都有各种张量操作函数。但是我们希望能够自定义各种层,有时候还希望某几层权重共享等等。如果使用TensorFlow原生API,往往需要考虑各种 `namescope`
class XxxNet:
    """Keras functional-API network mapping a pair of patches to 8 values."""

    def __init__(self, config):
        """Store the configuration and build the Keras model.

        :param config: configuration object; this class reads ``patch_size``,
            ``mode``, ``dropout_rate`` and ``model_img_dir``
        """
        self.config = config
        self.model = self.build_model()

    def build_model(self):
        """Assemble the network and save a diagram of it.

        Two single-channel ``patch_size x patch_size`` inputs are concatenated
        on the channel axis and passed through four blocks of two 3x3
        convolutions each (the first three blocks end with 2x2 max-pooling),
        followed by dropout, a 1024-unit dense layer, dropout and a linear
        8-unit output layer.

        :return: the ``Model`` taking ``[left_input, right_input]``
        """
        patch_shape = (self.config.patch_size, self.config.patch_size, 1)
        left_input = Input(shape=patch_shape, name='left_input')
        right_input = Input(shape=patch_shape, name='right_input')
        # Stack both patches along the channel axis.
        features = Concatenate(axis=3)([left_input, right_input])

        def conv3x3(filters):
            # Every convolution in this network shares these settings.
            return Conv2D(filters, (3, 3), strides=(1, 1), padding='same', activation='relu')

        # (filters, ends_with_pooling) for block1 .. block4.
        for filters, with_pooling in ((64, True), (64, True), (128, True), (128, False)):
            features = conv3x3(filters)(features)
            features = conv3x3(filters)(features)
            if with_pooling:
                features = MaxPooling2D((2, 2), strides=(2, 2))(features)

        # Keras ``Dropout(rate)`` takes the fraction of units to DROP (unlike
        # the TF 1.x ``keep_prob`` argument), so disabling dropout for testing
        # means a rate of 0.0, not 1.0.  A local value is used so the shared
        # config object is not modified as a side effect.
        if self.config.mode == "test":
            dropout_rate = 0.0
        else:
            dropout_rate = self.config.dropout_rate

        dropout1 = Dropout(dropout_rate)(features)
        flatten = Flatten()(dropout1)
        # Fully connected head with a second dropout.
        fc1 = Dense(1024, activation='relu', kernel_initializer='random_uniform')(flatten)
        fc1_dropout = Dropout(dropout_rate)(fc1)
        output = Dense(8, activation=None, kernel_initializer='random_uniform')(fc1_dropout)

        model = Model([left_input, right_input], output)
        # Write an SVG diagram of the architecture next to the other artefacts.
        plot_model(model, to_file=os.path.join(self.config.model_img_dir, "_model.svg"), show_shapes=True)
        return model
其实通过前面Dataset API+Keras模型的方式,我们已经完全可以编写Keras方式的训练代码,直接使用 `model.fit()` 等。如果有兴趣,可以参考:tensorflow的keras实现搭配dataset 之二。
# Training entry point: load the settings, build the input pipeline, then
# hand the pipeline and a freshly built network to the training routine.
config = TrainConfig()
dataloader = XxxDataloader(config)
train(
    config,
    dataloader,
    XxxNet(config),
)
def train(config, dataloader, network):
    """Train ``network`` on batches from ``dataloader`` in a TF 1.x session.

    The loop runs until the input pipeline signals that it is exhausted
    (``tf.errors.OutOfRangeError``).  Losses are printed every step and the
    merged summaries are written to ``config.log_dir`` every 100 steps.

    :param config: configuration object; reads ``learning_rate`` and ``log_dir``
    :param dataloader: object exposing ``img_batch`` and ``img_gt_batch`` tensors
    :param network: object whose ``model`` attribute is a callable Keras model
    """
    # NOTE(review): the XxxDataloader shown in this article exposes
    # img_raw_batch / img_aug_batch / gt_batch rather than the attribute names
    # used here -- confirm which loader this function is meant to receive.
    gt_image = dataloader.img_gt_batch  # labels
    pred_image = network.model(dataloader.img_batch)  # predictions
    l1_loss = tf.reduce_mean(tf.abs(pred_image - gt_image))  # metric only
    dssim_loss = loss_mix_v3(gt_image, pred_image)  # custom loss being optimised
    op = tf.train.AdamOptimizer(learning_rate=config.learning_rate).minimize(dssim_loss)

    # Record the loss values and sample images for TensorBoard.
    with tf.device('/cpu:0'):
        with tf.name_scope('losses'):
            tf.summary.scalar('l1_loss', l1_loss)
            tf.summary.scalar('dssim_loss', dssim_loss)
        with tf.name_scope('images'):
            tf.summary.image('gt_image', gt_image, 1)
            tf.summary.image('pred_image', pred_image, 1)

    init = tf.global_variables_initializer()
    merged_summary_op = tf.summary.merge_all()
    total_step = 0
    with tf.Session() as sess:
        # Variables (model weights and Adam slots) must be initialised before
        # the first training step.
        sess.run(init)
        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)
        try:
            while True:
                total_step += 1
                write_summary = total_step % 100 == 0
                fetches = [op, l1_loss, dssim_loss]
                if write_summary:
                    # Fetch the summary in the same run() call: a separate
                    # call would pull (and waste) another batch from the
                    # iterator and log values for a different batch.
                    fetches.append(merged_summary_op)
                results = sess.run(fetches)
                l1_loss_output, dssim_loss_output = results[1], results[2]
                print("step: {:d}, l1_loss: {:.4f}, dssim_loss: {:.4f}".format(total_step, l1_loss_output, dssim_loss_output))
                if write_summary:
                    summary_writer.add_summary(results[3], total_step)
        except tf.errors.OutOfRangeError:
            # Raised by the one-shot iterator once every batch was consumed.
            print("training finished: input data exhausted")
        finally:
            summary_writer.close()
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import tensorflow as tf
from tensorflow.keras import Model, layers, Input
import numpy as np
# ---- Hyper-parameters ----
# Dataset: MNIST has ten classes, the digits 0 through 9.
num_classes = 10
# Optimisation settings.
learning_rate = 0.001
training_steps = 200
batch_size = 128
display_step = 10
# Layer sizes: filters of the two conv layers, units of the first dense layer.
conv1_filters = 32
conv2_filters = 64
fc1_units = 1024

# ---- Data ----
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Cast to float32, add a trailing channel axis, and rescale the pixel values
# from [0, 255] to [0, 1].
x_train = x_train.astype(np.float32).reshape(-1, 28, 28, 1) / 255.
x_test = x_test.astype(np.float32).reshape(-1, 28, 28, 1) / 255.

# ---- Input pipeline ----
# Repeat indefinitely, shuffle with a 5000-element buffer, batch, and keep one
# batch prefetched.
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(5000)
    .batch(batch_size)
    .prefetch(1)
)
iterator = train_data.make_one_shot_iterator()
batch_x, batch_y = iterator.get_next()
class ConvNetModel:
    """Small convolutional classifier for 28x28x1 images (Keras functional API).

    The model returns raw class scores (logits), one per class.  No softmax is
    applied inside the model because the training loss used with it,
    ``tf.nn.sparse_softmax_cross_entropy_with_logits``, applies softmax itself;
    a softmax layer here would result in softmax being applied twice.  Apply
    ``tf.nn.softmax`` to the output when probabilities are needed.
    """

    def __init__(self):
        """Build the Keras model and keep it in ``self.model``."""
        self.model = self.build_model()

    def build_model(self):
        """Assemble conv -> pool -> conv -> pool -> dense -> dropout -> dense.

        Layer sizes come from the module-level ``conv1_filters``,
        ``conv2_filters``, ``fc1_units`` and ``num_classes`` settings.

        :return: the ``Model`` mapping an image batch to per-class logits
        """
        # Named ``inputs`` to avoid shadowing the builtin ``input``.
        inputs = Input(shape=(28, 28, 1))
        conv1 = layers.Conv2D(conv1_filters, kernel_size=5, activation=tf.nn.relu)(inputs)
        maxpool1 = layers.MaxPool2D(2, strides=2)(conv1)
        conv2 = layers.Conv2D(conv2_filters, kernel_size=3, activation=tf.nn.relu)(maxpool1)
        maxpool2 = layers.MaxPool2D(2, strides=2)(conv2)
        flatten = layers.Flatten()(maxpool2)
        fc1 = layers.Dense(fc1_units)(flatten)
        dropout = layers.Dropout(rate=0.5)(fc1)
        # Linear output layer: one logit per class.
        logits = layers.Dense(num_classes)(dropout)
        return Model(inputs, logits)
def cross_entropy_loss(x, y):
    """Return the batch-mean sparse softmax cross-entropy.

    ``x`` is passed to the TensorFlow op as logits, so softmax is applied
    inside the op; ``y`` holds the class labels and is cast to int64 first.

    :param x: logits
    :param y: class labels
    :return: scalar tensor, the loss averaged over the batch
    """
    labels_int64 = tf.cast(y, tf.int64)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels_int64,
        logits=x,
    )
    return tf.reduce_mean(per_example_loss)
def accuracy(y_pred, y_true):
    """Return the fraction of predictions whose argmax equals the label.

    :param y_pred: per-class scores; the predicted class is the argmax on axis 1
    :param y_true: class labels, cast to int64 for the comparison
    :return: mean of the 0/1 match indicator, reduced over the last axis
    """
    predicted_class = tf.argmax(y_pred, 1)
    expected_class = tf.cast(y_true, tf.int64)
    is_match = tf.cast(tf.equal(predicted_class, expected_class), tf.float32)
    return tf.reduce_mean(is_match, axis=-1)
# Build the training graph: forward pass, loss, metric and optimiser step.
pred_x = ConvNetModel().model(batch_x)
loss = cross_entropy_loss(pred_x, batch_y)
acc = accuracy(pred_x, batch_y)
op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
init = tf.global_variables_initializer()

total_step = 0
with tf.Session() as sess:
    # Variables (model weights and Adam slots) must be initialised before the
    # first training step.
    sess.run(init)
    try:
        # The input dataset repeats forever, so the loop is bounded by
        # training_steps instead of waiting for the data to run out.
        while total_step < training_steps:
            total_step += 1
            _, loss_val, accu = sess.run([op, loss, acc])
            print("step: {:d}, loss: {:.4f}, accuracy: {:.4f}".format(total_step, loss_val, accu))
    except tf.errors.OutOfRangeError:
        # Only reachable when the input pipeline is finite.
        print("training finished: input data exhausted")
