在用深度学习做图像分类任务的摸索过程中,我踩了很多坑,也总结出了一些经验。现在把其中自己觉得非常实用的模型训练经验写下来作为记录,也方便后来者借鉴和验证。
# 训练集/测试集数据生成器,替换flow_from_directory()
def flow_from_2DList(directory=None, target_size=(256, 256),
                     color_mode='rgb', classes=None, class_mode='categorical',
                     batch_size=1, shuffle=True, seed=None, save_to_dir=None,
                     save_prefix='', save_format='png', follow_links=False,
                     subset=None, interpolation='nearest'):
    """Yield one epoch of ``(images, [age, sex, sick])`` batches.

    Generator stand-in for ``flow_from_directory()`` that reads its samples
    from an in-memory list of records instead of scanning a directory tree.
    It is exhausted after one pass over the records; create a new generator
    for every epoch.

    Args:
        directory: List of records ``[file_path, age, sex, sick]``.
            ``None`` is treated as an empty list (no batches are produced).
            When ``shuffle`` is true the list is shuffled IN PLACE, so the
            caller's list is reordered.
        target_size: Passed unchanged to ``cv2.resize`` as ``dsize``, which
            OpenCV interprets as ``(width, height)``.
        batch_size: Number of records per batch. Trailing records that do
            not fill a complete batch are dropped.
        shuffle: Shuffle the records before batching.
        color_mode, classes, class_mode, seed, save_to_dir, save_prefix,
        save_format, follow_links, subset, interpolation: Accepted only for
            signature compatibility with ``flow_from_directory()``; ignored.

    Yields:
        ``(batch_images, [y_label_age, y_label_sex, y_label_sick])`` where
        ``batch_images`` is ``np.array`` of the resized images and the labels
        are one-hot arrays of shape ``(batch_size, 100)``, ``(batch_size, 2)``
        and ``(batch_size, 2)``.

    Raises:
        OSError: If ``cv2.imread`` cannot load a record's image file.
    """
    # Resolve the None default BEFORE touching the list: shuffling first
    # would raise TypeError on the default call.
    if directory is None:
        directory = []

    # Reshuffle per generator (i.e. per epoch), but only when requested.
    if shuffle:
        random.shuffle(directory)

    list_len = len(directory)
    print('\nlength of directory:', list_len, '\n\n')
    print('\nbatch_size:', batch_size, '\n\n')
    step = list_len // batch_size  # full batches per epoch (floor division)
    print('\nsetp:',step,'\n\n')

    for batch_index in range(step):
        start = batch_index * batch_size
        # Slicing already produces a new list; no extra copy is needed.
        batch_records = directory[start:start + batch_size]

        batch_images = []
        y_label_age = np.zeros((batch_size, 100))
        y_label_sex = np.zeros((batch_size, 2))
        y_label_sick = np.zeros((batch_size, 2))

        for row, record in enumerate(batch_records):
            file_path = record[0]
            age = record[1]
            sex = record[2]
            sick = record[3]

            image = cv2.imread(file_path)
            if image is None:
                # cv2.imread signals failure by returning None; fail here
                # with the offending path instead of inside cv2.resize.
                raise OSError(
                    'cannot read image file: {!r}'.format(file_path))
            batch_images.append(cv2.resize(image, target_size))

            # One-hot encode the labels. Non-zero ages are shifted down by
            # one (age 0 stays class 0); sex and sick are stored 1-based.
            if age != 0:
                age -= 1
            y_label_age[row, :] = to_categorical(age, num_classes=100)
            y_label_sex[row, :] = to_categorical(sex - 1, num_classes=2)
            y_label_sick[row, :] = to_categorical(sick - 1, num_classes=2)

        y_labels = [y_label_age, y_label_sex, y_label_sick]
        yield (np.array(batch_images), y_labels)