I've been learning TF 2.1 recently. I studied PyTorch before, but TensorFlow comes up so often in practice that I'm now working through version 2.1. As a beginner I'm recording the code from every session; this training run hit quite a few problems, so I'm writing them down here.
"""GPU训练"""
from myAlextNet_Module import AlexNet
import tensorflow as tf
import json
import os
import time
import glob
import random
import matplotlib.pyplot as plt
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        for gpu in gpus:
            # allocate GPU memory on demand instead of reserving it all up front
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
        exit(-1)
root = os.path.abspath(os.getcwd())  # get root path
image_path = root + '/' + 'flower_data/'  # dataset path
train_dir = image_path + 'train'
validation_dir = image_path + 'val'
# create a folder for saving the trained weights
if not os.path.exists('save_weights'):
    os.makedirs("save_weights")
#### hyperparameters
im_height = 224
im_width = 224
batch_size = 64
epoch = 10
# build the label dictionaries: {class_name: idx} for loading data and the
# inverse {idx: class_name} for inference
data_class = [cla for cla in os.listdir(train_dir)]
class_num = len(data_class)
class_dir = dict(zip(data_class, range(class_num)))
invs_class_dir = dict(zip(range(class_num), data_class))
# dump the inverse mapping to a json file
json_str = json.dumps(invs_class_dir, indent=4)  # encode the dict as json
with open("class_indices.json", "w") as json_file:
    json_file.write(json_str)
train_image_list = glob.glob(train_dir + '/*/*.jpg')
random.shuffle(train_image_list)
train_num = len(train_image_list)
# training-set label list
train_label_list = [class_dir[path.split(os.path.sep)[-2]] for path in train_image_list]
val_image_list = glob.glob(validation_dir + '/*/*.jpg')
random.shuffle(val_image_list)
val_num = len(val_image_list)
# validation-set label list
val_label_list = [class_dir[path.split(os.path.sep)[-2]] for path in val_image_list]
def process_path(img_path, label):
    label = tf.one_hot(label, depth=class_num)
    image = tf.io.read_file(img_path)
    image = tf.image.decode_jpeg(image, channels=3)  # force 3 channels so grayscale images don't break the input shape
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)  # scale to [0, 1]
    image = tf.image.resize(image, (im_height, im_width))
    return image, label
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_list, train_label_list))
train_dataset = train_dataset.shuffle(buffer_size=train_num)\
                             .map(process_path, num_parallel_calls=AUTOTUNE)\
                             .repeat()\
                             .batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((val_image_list, val_label_list))
val_dataset = val_dataset.shuffle(buffer_size=val_num)\
                         .map(process_path, num_parallel_calls=AUTOTUNE)\
                         .repeat()\
                         .batch(batch_size)
# instantiate the model
model = AlexNet(class_num=5)
model.build((batch_size, 224, 224, 3))
model.summary()
# use the Keras high-level API for training
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
def scheduler(epoch):
    if epoch < 10:
        return 0.01
    else:
        return 0.001 * tf.math.exp(0.1 * (10 - epoch))
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='./save_weights/myAlex.h5',
                                                save_best_only=True,
                                                save_weights_only=True,
                                                monitor='val_accuracy')]
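# Note: scheduler above is defined but never registered, so the learning rate
# stays at the 0.0005 passed to Adam. To actually apply it, one could add it
# as an extra callback (my sketch, not part of the original run):
# callbacks.append(tf.keras.callbacks.LearningRateScheduler(scheduler))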
history = model.fit(x=train_dataset,
                    steps_per_epoch=train_num // batch_size,
                    epochs=epoch,
                    validation_data=val_dataset,
                    validation_steps=val_num // batch_size,
                    callbacks=callbacks)
# plot the loss and accuracy curves
history_dict = history.history
train_loss = history_dict['loss']
train_accuracy = history_dict['accuracy']
val_loss = history_dict['val_loss']
val_accuracy = history_dict['val_accuracy']
# figure 1
plt.figure()
plt.plot(range(epoch), train_loss, label='train_loss')
plt.plot(range(epoch), val_loss, label='val_loss')
plt.legend()  # add the legend
plt.xlabel('epochs')
plt.ylabel('loss')
# figure 2
plt.figure()
plt.plot(range(epoch), train_accuracy, label='accuracy')
plt.plot(range(epoch), val_accuracy, label='val_accuracy')
plt.legend()
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.show()
Prediction: note that if you use a subclassed model, you must call build() after instantiating it. Only after build() are the weight variables actually created, so load_weights only works on a built model.
"""预测predict"""
from myAlextNet_Module import AlexNet
from PIL import Image
import numpy as np
import json
import matplotlib.pyplot as plt
import os
import tensorflow as tf
im_height = 224
im_width = 224
batch_size = 16
# load the image
img = Image.open('./159079265_d77a9ac920_n.jpg')
# resize image to 224x224
img = img.resize((im_width, im_height))
plt.imshow(img)
# scaling pixel value to (0-1)
img = np.array(img) / 255.
# Add the image to a batch where it's the only member.
img = (np.expand_dims(img, 0))
# load the {idx: class_name} dictionary from the json file
try:
    with open('./class_indices.json', 'r') as json_file:
        class_indict = json.load(json_file)
except Exception as e:
    print(e)
    exit(-1)
# load the model and its weights
model = AlexNet(class_num=5)
model.build((batch_size, 224, 224, 3))  # the weights only exist after build(), so build before loading
model.load_weights("./save_weights/myAlex.h5")
predict = np.squeeze(model.predict(img))
predict_class = np.argmax(predict)
print(class_indict[str(predict_class)], predict[predict_class])
plt.show()
Note:
For data loading this script uses the tf.data.Dataset pipeline instead of Keras's ImageDataGenerator; the former is faster because it can read data with multiple threads.
In terms of training time, the CPU handles image reading and preprocessing while the GPU trains the network. The Keras data generator cannot read data with multiple threads: the GPU finishes each step quickly and then spends most of its time waiting for data, which greatly reduces GPU utilization. With tf.data.Dataset, the CPU reads data in multiple threads, which speeds up input without stalling the GPU; a small sketch of pushing this one step further is given below.
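Building on that note, the input pipeline above could be pushed a bit further with prefetch, which overlaps the CPU preparing the next batch with the GPU training on the current one. A minimal sketch; the .prefetch(AUTOTUNE) call is my addition and is not part of the original training script:
# same training pipeline as above, plus prefetch so the CPU decodes and
# preprocesses the next batch while the GPU is still busy with the current one
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_list, train_label_list))
train_dataset = train_dataset.shuffle(buffer_size=train_num)\
                             .map(process_path, num_parallel_calls=AUTOTUNE)\
                             .repeat()\
                             .batch(batch_size)\
                             .prefetch(AUTOTUNE)  # overlap the input pipeline with training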