On the same fashion_mnist dataset, the following experiment compares Batch Normalization (BN) and Layer Normalization (LN). The experiment code is as follows:
import tensorflow as tf
from tensorflow.keras import datasets, Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Flatten, Dense
import numpy as np
import matplotlib.pyplot as plt
# Model configuration
batch_size = 256
no_epochs = 15
no_classes = 10
validation_split = 0.2
verbosity = 1
# Load the fashion_mnist dataset
(x_train, y_train), (x_test, y_test) = datasets.fashion_mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
# Reshape the dataset to NHWC by adding a trailing channel axis
x_train = np.expand_dims(x_train, axis=-1) # NHWC
x_test = np.expand_dims(x_test, axis=-1) # NHWC
print(x_train.shape)
# Keras default data_format is channels_last: (batch_size, height, width, channels)
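# BN model: a BatchNormalization layer follows each conv/pool stage and the first dense layer.
# BN normalizes each channel with mean/variance computed over the (batch, H, W) axes; here it
# is applied after the ReLU activations, though placing BN before the activation is also common.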
batch_norm_model = Sequential(name='batch_norm')
batch_norm_model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28,28,1)))
batch_norm_model.add(BatchNormalization())
batch_norm_model.add(MaxPooling2D(pool_size=(2, 2)))
batch_norm_model.add(BatchNormalization())
batch_norm_model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
batch_norm_model.add(BatchNormalization())
batch_norm_model.add(MaxPooling2D(pool_size=(2, 2)))
batch_norm_model.add(BatchNormalization())
batch_norm_model.add(Flatten())
batch_norm_model.add(Dense(256, activation='relu'))
batch_norm_model.add(BatchNormalization())
batch_norm_model.add(Dense(no_classes, activation='softmax'))
batch_norm_model.compile(loss="sparse_categorical_crossentropy",
                         optimizer="adam",
                         metrics=['accuracy'])
batch_norm_hist = batch_norm_model.fit(x_train, y_train,
                                       batch_size=batch_size,
                                       epochs=no_epochs,
                                       verbose=verbosity,
                                       validation_split=validation_split)
loss, acc = batch_norm_model.evaluate(x_test, y_test, verbose=0)
print(f'Test loss: {loss} / Test accuracy: {acc}')
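# LN model: LayerNormalization computes statistics per sample (here over the channel axis),
# so it is independent of the mini-batch size and needs no moving averages at inference.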
layer_norm_model = tf.keras.models.Sequential([
    # The input is already NHWC (28, 28, 1) after np.expand_dims above, so no Reshape is needed.
    tf.keras.layers.Conv2D(filters=10, kernel_size=(3, 3), input_shape=(28, 28, 1)),
    # Normalize over the channel axis (axis=3), per sample and per spatial position
    tf.keras.layers.LayerNormalization(axis=3, center=True, scale=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])
layer_norm_model.compile(optimizer='adam',
                         loss='sparse_categorical_crossentropy',
                         metrics=['accuracy'])
layer_norm_hist = layer_norm_model.fit(x_train, y_train,
                                       batch_size=batch_size,
                                       epochs=no_epochs,
                                       verbose=verbosity,
                                       validation_split=validation_split)
loss, acc = layer_norm_model.evaluate(x_test, y_test, verbose=0)
print(f'Test loss: {loss} / Test accuracy: {acc}')
# Visualize training: plot the accuracy/loss curves of both models
acc = batch_norm_hist.history['accuracy']
val_acc = batch_norm_hist.history['val_accuracy']
loss = batch_norm_hist.history['loss']
val_loss = batch_norm_hist.history['val_loss']
acc1 = layer_norm_hist.history['accuracy']
val_acc1 = layer_norm_hist.history['val_accuracy']
loss1 = layer_norm_hist.history['loss']
val_loss1 = layer_norm_hist.history['val_loss']
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(acc, label='BN acc')
plt.plot(acc1, label='LN acc')
plt.plot(loss, label='BN loss')
plt.plot(loss1, label='LN loss')
plt.title('Training')
plt.xlabel('epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(val_acc, label='BN val_acc')
plt.plot(val_acc1, label='LN val_acc')
plt.plot(val_loss, label='BN val_loss')
plt.plot(val_loss1, label='LN val_loss')
plt.title('Validation')
plt.xlabel('epoch')
plt.legend()
plt.show()
Experimental results:
BN: Test loss: 0.4654472768306732 / Test accuracy: 0.8916000127792358
LN: Test loss: 0.4098794460296631 / Test accuracy: 0.8877000212669373
Conclusions:
- On this CNN experiment, Batch Normalization reaches slightly higher test accuracy than Layer Normalization (0.8916 vs. 0.8877); note, however, that the LN model's test loss is lower and the two architectures differ, so the comparison is indicative rather than strict.
- Batch Normalization depends on the mini-batch size: when the mini-batch is small, the batch statistics are noisy and performance degrades (see the sketch after this list).
- Layer Normalization does not depend on the mini-batch size, because it normalizes each sample independently.
- Batch Normalization works well for CNNs and fully connected networks, but poorly for RNNs.
- Layer Normalization usually performs worse than Batch Normalization on CNNs and fully connected networks, but better on RNNs.
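To make the mini-batch dependence concrete, here is a minimal sketch (an illustration assuming TensorFlow 2.x and random toy data, not part of the experiment above): LN produces the same output for a sample whether it is normalized alone or inside a batch, while BN in training mode does not.
import numpy as np
import tensorflow as tf

x = tf.random.normal((8, 4, 4, 3))  # toy NHWC batch of 8 samples

# Layer Normalization: statistics are per sample, so batch composition is irrelevant
ln = tf.keras.layers.LayerNormalization(axis=-1)
print(np.allclose(ln(x)[:1], ln(x[:1]), atol=1e-5))  # True: same output either way

# Batch Normalization (training mode): statistics are per mini-batch, so the same
# sample is normalized differently when the batch around it changes
bn = tf.keras.layers.BatchNormalization()
print(np.allclose(bn(x, training=True)[:1], bn(x[:1], training=True), atol=1e-5))  # False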