In the previous part we saw that severe overfitting left us with a low validation accuracy.
This time we will reduce the impact of overfitting by enlarging the dataset and by adding dropout regularization.
Let's first take a look at the dropout function:
import tensorflow as tf
import numpy as np
x = np.array([1,2,3,4,5]).astype('float32')
sess = tf.Session()
print(sess.run(tf.nn.dropout(x, 0.6)))
One possible run prints (which elements are dropped is random):
[0. 3.3333333 0. 6.6666665 8.333333 ]
Looking at this output, it is easy to see that every value that survives has been divided by keep_prob (the keep probability).
Dropout regularization is usually applied to the fully connected layers, mainly because that is where most of the neurons live; it effectively thins out the model.
Put another way, the convolutional part is like our eyes and the fully connected layers are like our brain. The brain does not fire all of its neurons at once, and dropout mimics that behavior.
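To make the scaling explicit, here is a minimal NumPy sketch of the same "inverted dropout" behavior (the mask and the division by keep_prob are my own illustration, not code taken from tf.nn.dropout itself):

import numpy as np

def inverted_dropout(x, keep_prob):
    # Keep each element with probability keep_prob, then scale the
    # survivors by 1/keep_prob so the expected output equals the input.
    mask = np.random.rand(*x.shape) < keep_prob
    return x * mask / keep_prob

x = np.array([1, 2, 3, 4, 5], dtype='float32')
print(inverted_dropout(x, 0.6))  # e.g. [0. 3.33 0. 6.67 8.33]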
The overfitting last time was mainly caused by our dataset being too small, so let's enlarge the dataset first.
In the previous CNN article we already resized the original images to 150*150 and created the corresponding folders, so I won't repeat that code here.
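For readers who do not have the previous article at hand, a minimal sketch of that resize step could look like the following (the folder layout matches what is used below; the exact code in the earlier article may differ):

import os
import cv2

base_dir = './dataset/cats_and_dogs_filtered'
train_cats_dir = os.path.join(base_dir, 'train', 'cats/')
target_cats = os.path.join(base_dir, 'train', 'resize_cats/')
os.makedirs(target_cats, exist_ok=True)

for fname in os.listdir(train_cats_dir):
    img = cv2.imread(train_cats_dir + fname, 1)
    # Resize every original image to 150x150 so all samples share one shape.
    cv2.imwrite(target_cats + fname, cv2.resize(img, dsize=(150, 150)))
# (repeat the same loop for the dogs folder)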
# coding=utf-8
import os
import cv2
import numpy as np
base_dir = './dataset/cats_and_dogs_filtered'
train_dir = os.path.join(base_dir, 'train')
train_cats_dir = os.path.join(train_dir, 'cats/')
train_dogs_dir = os.path.join(train_dir, 'dogs/')
train_cat_fnames = os.listdir(train_cats_dir)
train_dog_fnames = os.listdir(train_dogs_dir)
target_cats = os.path.join(train_dir, 'resize_cats/')
target_dogs = os.path.join(train_dir, 'resize_dogs/')
def tPerspectiveTransform(img, inputs_shape, transform_shape, outputs_shape):
    # Map the four source points (inputs_shape) onto the four target points
    # (transform_shape) and warp the image to outputs_shape.
    pts1 = np.float32(inputs_shape)
    pts2 = np.float32(transform_shape)
    M = cv2.getPerspectiveTransform(pts1, pts2)
    dst = cv2.warpPerspective(img, M, outputs_shape)
    return dst
for i in train_cat_fnames:
    img = cv2.imread(train_cats_dir + i, 1)
    rows, cols = img.shape[0:2]
    rows, cols = round(rows), round(cols)
    # Two perspective "crops": a wide one (1/6 to 5/6) and a tight one (1/4 to 3/4),
    # both warped to 150x150.
    dst_1 = tPerspectiveTransform(img, [[rows/6, cols/6], [rows*5/6, cols/6],
                                        [rows/6, cols*5/6], [rows*5/6, cols*5/6]],
                                  [[0, 0], [150, 0], [0, 150], [150, 150]], (150, 150))
    dst_2 = tPerspectiveTransform(img, [[rows/4, cols/4], [rows*3/4, cols/4],
                                        [rows/4, cols*3/4], [rows*3/4, cols*3/4]],
                                  [[0, 0], [150, 0], [0, 150], [150, 150]], (150, 150))
    cv2.imwrite(target_cats + 'pts_1_1' + i, dst_1)
    cv2.imwrite(target_cats + 'pts_1_2' + i, dst_2)
    # Two rotations (90 and 270 degrees), each resized back to 150x150.
    M_1 = cv2.getRotationMatrix2D((cols/2, rows/2), 90, 1)
    M_2 = cv2.getRotationMatrix2D((cols/2, rows/2), 270, 1)
    rotation_1 = cv2.warpAffine(img, M_1, (cols, rows))
    resize_ro1 = cv2.resize(rotation_1, dsize=(150, 150))
    rotation_2 = cv2.warpAffine(img, M_2, (cols, rows))
    resize_ro2 = cv2.resize(rotation_2, dsize=(150, 150))
    cv2.imwrite(target_cats + 'rotation_1' + i, resize_ro1)
    cv2.imwrite(target_cats + 'rotation_2' + i, resize_ro2)
for i in train_dog_fnames:
    img = cv2.imread(train_dogs_dir + i, 1)
    rows, cols = img.shape[0:2]
    rows, cols = round(rows), round(cols)
    dst_1 = tPerspectiveTransform(img, [[rows/6, cols/6], [rows*5/6, cols/6],
                                        [rows/6, cols*5/6], [rows*5/6, cols*5/6]],
                                  [[0, 0], [150, 0], [0, 150], [150, 150]], (150, 150))
    dst_2 = tPerspectiveTransform(img, [[rows/4, cols/4], [rows*3/4, cols/4],
                                        [rows/4, cols*3/4], [rows*3/4, cols*3/4]],
                                  [[0, 0], [150, 0], [0, 150], [150, 150]], (150, 150))
    cv2.imwrite(target_dogs + 'pts_1_1' + i, dst_1)
    cv2.imwrite(target_dogs + 'pts_1_2' + i, dst_2)
    M_1 = cv2.getRotationMatrix2D((cols/2, rows/2), 90, 1)
    M_2 = cv2.getRotationMatrix2D((cols/2, rows/2), 270, 1)
    rotation_1 = cv2.warpAffine(img, M_1, (cols, rows))
    resize_ro1 = cv2.resize(rotation_1, dsize=(150, 150))
    rotation_2 = cv2.warpAffine(img, M_2, (cols, rows))
    resize_ro2 = cv2.resize(rotation_2, dsize=(150, 150))
    cv2.imwrite(target_dogs + 'rotation_1' + i, resize_ro1)
    cv2.imwrite(target_dogs + 'rotation_2' + i, resize_ro2)
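Besides the rotations and perspective warps above, other augmentations are easy to drop into the same loops. As one hypothetical example (the 'flip_' filename prefix is my own choice, not part of the original pipeline), a horizontal flip:

# Inside the cat (or dog) loop above, write out a mirror image as well:
flipped = cv2.flip(img, 1)                           # flipCode=1 flips around the vertical axis
resize_flip = cv2.resize(flipped, dsize=(150, 150))
cv2.imwrite(target_cats + 'flip_' + i, resize_flip)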
Here I applied two rotations and two perspective transforms to each original image; feel free to try other approaches as well, such as the flip sketched above. Once the new images are generated, we build the dataset with the same tfrecords code we used before.
# coding=utf-8
import os
import tensorflow as tf
from PIL import Image
cwd = './dataset/cats_and_dogs_filtered/train/'
classes = ('resize_cats', 'resize_dogs')
writer = tf.python_io.TFRecordWriter('cats_and_dogs_train_onehot.tfrecords')
for index, name in enumerate(classes):
    class_path = cwd + name + '/'
    for img_name in os.listdir(class_path):
        img_path = class_path + img_name
        img = Image.open(img_path)
        img_raw = img.tobytes()
        # One-hot label: resize_cats -> [1, 0], resize_dogs -> [0, 1]
        example = tf.train.Example(features=tf.train.Features(feature={
            'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 0] if index == 0 else [0, 1])),
            'img_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw]))
        }))
        writer.write(example.SerializeToString())
writer.close()
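As a quick sanity check (my own addition, not part of the original post), you can count how many examples ended up in the file:

import tensorflow as tf

# Count the serialized examples in the freshly written tfrecords file.
count = sum(1 for _ in tf.python_io.tf_record_iterator('cats_and_dogs_train_onehot.tfrecords'))
print('examples written:', count)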
Next we train the network:
# coding=utf-8
import tensorflow as tf
def _parse_function(record):
    """Extracts features and labels.

    Args:
      record: a serialized tf.Example proto read from the TFRecord file
    Returns:
      A `tuple` `(img_raw, labels)`:
        img_raw: the decoded 150x150x3 image tensor
        labels: the corresponding one-hot label tensor
    """
    features = {
        "label": tf.FixedLenFeature([2], tf.int64),   # one-hot label: [1, 0] = cat, [0, 1] = dog
        "img_raw": tf.FixedLenFeature([], tf.string)  # raw image bytes
    }
    parsed_features = tf.parse_single_example(record, features)
    img_raw = parsed_features['img_raw']
    img_raw = tf.decode_raw(img_raw, tf.uint8)
    img_raw = tf.reshape(img_raw, [150, 150, 3])
    labels = parsed_features['label']
    return img_raw, labels
def my_input_fn(input_filenames, num_epochs=None, shuffle=True):
    # Create a dataset from the TFRecord file and map it to (image, label) pairs.
    ds = tf.data.TFRecordDataset(input_filenames)
    ds = ds.map(_parse_function)
    if shuffle:
        ds = ds.shuffle(10000)
    # Batch the data; every image already has the fixed shape 150x150x3,
    # so padded_batch simply groups 100 examples per batch here.
    ds = ds.padded_batch(100, ds.output_shapes)
    ds = ds.repeat(num_epochs)
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=.1)
    return tf.Variable(initial)

def biases_variable(shape):
    initial = tf.constant(.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def _loss(ys, pred):
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(ys * tf.log(tf.clip_by_value(pred, 1e-10, 1.0)), reduction_indices=[1]))
    return cross_entropy

def train_step(learning_rate, loss):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return optimizer

def accuracy(pred, ys):
    _bool = tf.equal(tf.argmax(pred, 1), tf.argmax(ys, 1))
    acc = tf.reduce_mean(tf.cast(_bool, tf.float32))
    return acc
train_path = my_input_fn('cats_and_dogs_train_onehot.tfrecords')
xs = train_path[0]
xs = tf.cast(xs, tf.float32)
x_input = xs / 255
ys = train_path[1]
y_input = tf.cast(ys, tf.float32)
w_conv1 = weight_variable([3, 3, 3, 16])
b_conv1 = biases_variable([16])
h_conv1 = tf.nn.relu(conv2d(x_input, w_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
w_conv2 = weight_variable([3, 3, 16, 32])
b_conv2 = biases_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
w_conv3 = weight_variable([3, 3, 32, 64])
b_conv3 = biases_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, w_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)
h_pool3_flat = tf.reshape(h_pool3, [-1, 19 * 19 * 64])  # 150 -> 75 -> 38 -> 19 after three 2x2 'SAME' poolings
w_fc1 = weight_variable([19 * 19 * 64, 512])
b_fc1 = biases_variable([512])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, w_fc1) + b_fc1)
h_fc1_dropout = tf.nn.dropout(h_fc1, 0.5)
w_fc2 = weight_variable([512, 2])
b_fc2 = biases_variable([2])
pred = tf.nn.softmax(tf.matmul(h_fc1_dropout, w_fc2) + b_fc2)
start_learning_rate = .0001
loss = _loss(y_input, pred)
train = train_step(start_learning_rate, loss)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
saver = tf.train.Saver()
for i in range(3000):
    sess.run(train)
    save_path = saver.save(sess, 'my_net/simple_cnn1.ckpt')
    if i % 50 == 0:
        acc = accuracy(pred, y_input)
        print('steps:', i, 'accuracy:', sess.run(acc))
Here batch_size = 100, the optimizer is Adam, and lr = .0001.
The input is divided by 255 to normalize it; dropout is added to the fully connected layer with a keep probability of 50%; training runs for 3000 steps.
With the network structure fixed, every one of the parameters above affects the result, and it is worth experimenting to see how.
batch_size: I tried 25/50/100; for the same number of steps, validation accuracy grew as batch_size grew.
Optimizer: I tried RMSProp and Adam.
The official documentation also uses RMSProp, and I personally lean toward RMSProp as well.
In this network Adam did not show its usual advantage in convergence speed, and it also needed a very low lr to keep gradient descent stable (RMSProp tolerates a higher learning rate); the one-line swap is sketched below.
(If the CNN starts classifying every image into a single class, or fails to converge at all, the cause is usually a learning rate that is too high; lower the lr first.)
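If you want to try RMSProp instead of Adam, only train_step changes (a minimal sketch; tf.train.RMSPropOptimizer is the TF 1.x API, and the larger learning-rate value is just an illustrative choice, not a tuned number):

def train_step(learning_rate, loss):
    # RMSProp in place of Adam; it usually tolerates a larger lr here.
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
    return optimizer

train = train_step(.001, loss)  # e.g. a higher lr than the 1e-4 used with Adam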
Keep probability: usually set to 50%. Too low a keep probability may keep the network from converging; too high and the regularization effect is weak.
Training steps: 3000 steps is actually not a great choice here.
Still, after 3000 steps I got 87.3% accuracy on the training set and 73% on the validation set. (Results may differ because of different random initializations.)
In other words, dropout alone does not fully solve overfitting in a CNN; we still need early stopping or a larger training set.
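Early stopping can be as simple as evaluating on the validation set every so often and only keeping the best checkpoint. A rough sketch of the idea (the validation accuracy op acc_val and the 50-step interval are my own assumptions, not part of the original script):

best_val_acc = 0.0
for i in range(3000):
    sess.run(train)
    if i % 50 == 0:
        val_acc = sess.run(acc_val)  # accuracy op built on the validation input pipeline (assumed)
        if val_acc > best_val_acc:
            # Only overwrite the checkpoint when validation accuracy improves.
            best_val_acc = val_acc
            saver.save(sess, 'my_net/simple_cnn1_best.ckpt')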
Finally, here is the network we use for validation:
# coding=utf-8
import tensorflow as tf
import numpy as np
def _parse_function(record):
    """Extracts features and labels.

    Args:
      record: a serialized tf.Example proto read from the TFRecord file
    Returns:
      A `tuple` `(img_raw, labels)`:
        img_raw: the decoded 150x150x3 image tensor
        labels: the corresponding one-hot label tensor
    """
    features = {
        "label": tf.FixedLenFeature([2], tf.int64),   # one-hot label: [1, 0] = cat, [0, 1] = dog
        "img_raw": tf.FixedLenFeature([], tf.string)  # raw image bytes
    }
    parsed_features = tf.parse_single_example(record, features)
    img_raw = parsed_features['img_raw']
    img_raw = tf.decode_raw(img_raw, tf.uint8)
    img_raw = tf.reshape(img_raw, [150, 150, 3])
    labels = parsed_features['label']
    return img_raw, labels
def my_input_fn(input_filenames, num_epochs=None, shuffle=False):
    # Create a dataset from the TFRecord file and map it to (image, label) pairs.
    ds = tf.data.TFRecordDataset(input_filenames)
    ds = ds.map(_parse_function)
    if shuffle:
        ds = ds.shuffle(10000)
    # Batch the data; every image already has the fixed shape 150x150x3,
    # so padded_batch simply groups 25 examples per batch here.
    ds = ds.padded_batch(25, ds.output_shapes)
    ds = ds.repeat(num_epochs)
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=.1)
    return tf.Variable(initial)

def biases_variable(shape):
    initial = tf.constant(.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def accuracy(pred, ys):
    _bool = tf.equal(tf.argmax(pred, 1), tf.argmax(ys, 1))
    acc = tf.reduce_mean(tf.cast(_bool, tf.float32))
    return acc
train_path = my_input_fn('cats_and_dogs_train_onehot.tfrecords')
# train_path = my_input_fn('cats_and_dogs_validation_onehot.tfrecords')  # switch to this line for the validation set
xs = train_path[0]
xs = tf.cast(xs, tf.float32)
x_input = xs / 255
ys = train_path[1]
y_input = tf.cast(ys, tf.float32)
w_conv1 = weight_variable([3, 3, 3, 16])
b_conv1 = biases_variable([16])
h_conv1 = tf.nn.relu(conv2d(x_input, w_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
w_conv2 = weight_variable([3, 3, 16, 32])
b_conv2 = biases_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
w_conv3 = weight_variable([3, 3, 32, 64])
b_conv3 = biases_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, w_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)
h_pool3_flat = tf.reshape(h_pool3, [-1, 19 * 19 * 64])
w_fc1 = weight_variable([19 * 19 * 64, 512])
b_fc1 = biases_variable([512])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, w_fc1) + b_fc1)
h_fc1_dropout = tf.nn.dropout(h_fc1, 1)  # keep_prob = 1: dropout is only used while training, never for validation or inference.
w_fc2 = weight_variable([512, 2])
b_fc2 = biases_variable([2])
pred = tf.nn.softmax(tf.matmul(h_fc1_dropout, w_fc2) + b_fc2)
sess = tf.Session()
saver = tf.train.Saver()
saver.restore(sess, 'my_net/simple_cnn1.ckpt')
lst = []
# The validation set is small; range(80) already covers the whole validation set.
for i in range(400):
    acc = accuracy(pred, y_input)
    lst.append(sess.run(acc))
print(lst)
print(np.mean(lst))
A few things to note:
Dropout is not used when measuring accuracy or when deploying the network; as a regularization method it only applies during training.
(I cut a corner here: in the training script it would be better to make keep_prob a placeholder so that the keep probability can be controlled through feed_dict; a sketch follows below. But the training accuracy is only there to check that the network converges, so its exact value does not matter much.)
The batch_size here is 25 and the validation set contains 2000 images, so 80 loop iterations cover every image.
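A minimal sketch of that placeholder approach (my own illustration of the idea mentioned above, not code from the original scripts):

keep_prob = tf.placeholder(tf.float32)             # keep probability fed at run time
h_fc1_dropout = tf.nn.dropout(h_fc1, keep_prob)    # replaces the hard-coded 0.5 / 1.0

# While training:
# sess.run(train, feed_dict={keep_prob: 0.5})
# While validating or predicting:
# sess.run(acc, feed_dict={keep_prob: 1.0})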