Most of the time, we want to train models on our own datasets. However, faced with a pile of raw data files in inconsistent formats, preprocessing them and reading them into the program is often tedious, sometimes taking more effort than designing the model itself. For example, to read in a batch of image files, we may have to wrestle with various Python image-processing packages (such as Pillow), design the batch generation ourselves, and still end up with disappointing runtime efficiency. For this, TensorFlow provides the tf.data module, a flexible set of dataset-building APIs that helps us build data input pipelines quickly and efficiently, and is especially suited to scenarios with very large amounts of data.
The core of tf.data is the tf.data.Dataset class, which provides a high-level encapsulation of a dataset.
A tf.data.Dataset consists of a series of iterable elements, each containing one or more tensors. For example, for a dataset made up of images, each element can be an image tensor of shape height × width × channels, or a tuple consisting of an image tensor and an image-label tensor.
1. tf.data.Dataset.from_tensor_slices()
The most basic way to build a tf.data.Dataset is with tf.data.Dataset.from_tensor_slices(), which is **suitable when the amount of data is small (the whole dataset fits in memory)**.
import tensorflow as tf
import numpy as np
X = tf.constant([2015, 2016, 2017, 2018, 2019])
Y = tf.constant([12000, 14000, 15000, 16500, 17500])
# NumPy arrays can also be used, with the same effect
# X = np.array([2015, 2016, 2017, 2018, 2019])
# Y = np.array([12000, 14000, 15000, 16500, 17500])
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
for x, y in dataset:
    print(x.numpy(), y.numpy())
Output:
2015 12000
2016 14000
2017 15000
2018 16500
2019 17500
import matplotlib.pyplot as plt
# MNIST dataset
(train_data, train_label), (_, _) = tf.keras.datasets.mnist.load_data()
# [60000, 28, 28, 1]
train_data = np.expand_dims(train_data.astype(np.float32) / 255.0, axis=-1)
mnist_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))
for image, label in mnist_dataset:
    print(label.numpy())
    print(image.numpy())
Using the datasets module
# reading h5 data
import numpy as np
import h5py
def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # your train set labels
    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # your test set labels
    classes = np.array(test_dataset["list_classes"][:])  # the list of classes
    print("shape of the raw training data x:", train_set_x_orig.shape)
    print("shape of the raw training data y:", train_set_y_orig.shape)
    print("shape of the raw test data x:", test_set_x_orig.shape)
    print("shape of the raw test data y:", test_set_y_orig.shape)
    # reshape the label vectors from columns into rows
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    print("shape of the training data y after reshaping:", train_set_y_orig.shape)
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
import tensorflow as tf
import os
# define a data-reading function that reads the data under a given directory and builds the sample dataset
def read_image_filename(data_dir):
    building_dir = data_dir + 'building/'  # directory of the building images
    cloud_dir = data_dir + 'cloud/'
    farmland_dir = data_dir + 'farmland/'
    health_dir = data_dir + 'health/'
    infected_dir = data_dir + 'infected/'
    water_dir = data_dir + 'water/'
    # build the feature dataset; the values are the corresponding image file names
    building_filenames = tf.constant([building_dir + fn for fn in os.listdir(building_dir)])
    cloud_filenames = tf.constant([cloud_dir + fn for fn in os.listdir(cloud_dir)])
    farmland_filenames = tf.constant([farmland_dir + fn for fn in os.listdir(farmland_dir)])
    health_filenames = tf.constant([health_dir + fn for fn in os.listdir(health_dir)])
    infected_filenames = tf.constant([infected_dir + fn for fn in os.listdir(infected_dir)])
    water_filenames = tf.constant([water_dir + fn for fn in os.listdir(water_dir)])
    filenames = tf.concat([building_filenames, cloud_filenames, farmland_filenames,
                           health_filenames, infected_filenames, water_filenames],
                          axis=-1)  # concatenate along the last axis
    # build the label dataset: building is 0, cloud is 1, farmland 2, health 3, infected 4, water 5
    labels = tf.concat([
        tf.zeros(building_filenames.shape, dtype=tf.int32),  # as many zeros as there are building images
        tf.ones(cloud_filenames.shape, dtype=tf.int32),
        tf.fill(farmland_filenames.shape, 2),
        tf.fill(health_filenames.shape, 3),
        tf.fill(infected_filenames.shape, 4),
        tf.fill(water_filenames.shape, 5)],
        axis=-1)  # concatenate along the last axis
    return filenames, labels
# decode an image file and resize the image
def decode_image_and_resize(filename, label):
    image_string = tf.io.read_file(filename)  # read the raw file
    # issue 1
    image_decoded = tf.image.decode_jpeg(image_string)  # decode the JPEG image
    # resize the image to match the model's expected input size, and normalize it
    image_resized = tf.image.resize(image_decoded, [224, 224]) / 255.0
    return image_resized, label
"""
train_data_dir = './smalldata/train/'  # data directory
filenames, labels = read_image_filename(train_data_dir)
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))  # build the dataset
# print(filenames, labels)
# print(dataset)
sub_dataset = dataset.take(3)  # take the first three elements
for x, y in sub_dataset:
    print('filename:', x.numpy(), 'label:', y.numpy())
"""
# preprocess the data and build the training dataset
def prepare_dataset(data_dir, buffer_size=2000, batch_size=16):
    filenames, labels = read_image_filename(data_dir)
    print(filenames.shape)
    print(labels.shape)
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    dataset = dataset.map(
        map_func=decode_image_and_resize,  # apply the same preprocessing to every element of the dataset
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    dataset = dataset.shuffle(buffer_size)  # shuffle
    dataset = dataset.batch(batch_size)     # batch
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
# VGG16-based model
def vgg16_model(input_shape=(224, 224, 3)):
    vgg16 = tf.keras.applications.vgg16.VGG16(include_top=False,
                                               weights='imagenet',
                                               input_shape=input_shape)
    for layer in vgg16.layers:
        layer.trainable = False  # freeze the pre-trained VGG16 layers
    last = vgg16.output
    # append new, trainable fully connected layers
    x = tf.keras.layers.Flatten()(last)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(6, activation='softmax')(x)
    # build the new model
    model = tf.keras.models.Model(inputs=vgg16.input, outputs=x)
    model.summary()
    return model
# build and configure the model
model = vgg16_model()
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# train the model
# the following four steps read the training data
# data directory
train_data_dir = './dataset/train/'
# buffer size used for shuffling
buffer_size = 10000
# batch size: number of samples per batch
batch_size = 16
# training dataset
dataset_train = prepare_dataset(train_data_dir, buffer_size, batch_size)
# hyperparameters
training_epochs = 4
# train: pass the training data and the number of epochs
train_history = model.fit(dataset_train, epochs=training_epochs, verbose=1)
###### save the model
# the model structure and the model weights are stored separately
# the model structure is stored in a .yaml file
yaml_string = model.to_yaml()
with open('./models/cat_dog.yaml', 'w') as model_file:
    model_file.write(yaml_string)
# the model weights are stored in an .h5 file
model.save_weights('./models/cat_dog.h5')
print("Model saved!")
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import time
"""
下面2行代码是解决这个问题的Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node model_1/model/block1_conv1/Conv2D (defined at G:/XiaoMa/Bursxylophilus/310FCN/app.py:44) ]] [Op:__inference_predict_function_1613]
"""
physical_device = tf.config.experimental.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(physical_device[0], True)
# 开始计时
time_start=time.time()
# get the paths of all images
images = glob.glob(r'G:\XiaoMa\08OwnWork\zhongxian\语义分割数据\images\*.jpg')
len(images)
anno = glob.glob(r'G:\XiaoMa\08OwnWork\zhongxian\语义分割数据\单波段处理后的png\*.png')
print(anno[-5:])
print(images[-5:])
# shuffle; images and anno must stay in one-to-one correspondence
np.random.seed(2019)  # random seed
index = np.random.permutation(len(images))  # a random permutation of the indices, https://blog.csdn.net/weixin_44188264/article/details/93752505
images = np.array(images)[index]
anno = np.array(anno)[index]
print(anno[-5:])
print(images[-5:])  # check whether labels and images still correspond one-to-one
# build a dataset from the image and annotation paths that were read
dataset = tf.data.Dataset.from_tensor_slices((images, anno))
test_count = int(len(images) * 0.2)      # a portion for the test set
train_count = len(images) - test_count   # the rest for the training set
print("number of test and training samples:")
print(test_count, train_count)
data_train = dataset.skip(test_count)  # skip the first test_count elements
data_test = dataset.take(test_count)   # take the first test_count elements
def read_jpg(path):
    """Read and decode a jpg image."""
    img_de = tf.io.read_file(path)
    img_de = tf.image.decode_jpeg(img_de, channels=3)
    return img_de

def read_png(path):
    """Read and decode a png image."""
    img_de_png = tf.io.read_file(path)
    img_de_png = tf.image.decode_png(img_de_png, channels=1)
    return img_de_png

def normal_img(input_images, input_anno):
    """Normalize the data."""
    input_images = tf.cast(input_images, tf.float32)
    input_images = input_images / 127.5 - 1
    input_anno = tf.cast(input_anno, tf.float32)
    input_anno = input_anno / 255.0
    return input_images, input_anno

def load_images(input_images_path, input_anno_path):
    """Load an image/label pair and resize both."""
    input_image = read_jpg(input_images_path)
    input_anno = read_png(input_anno_path)
    input_image = tf.image.resize(input_image, (608, 608))  # resize() interpolates the image to the target size
    input_anno = tf.image.resize(input_anno, (608, 608))
    return normal_img(input_image, input_anno)
# preprocess the images
data_train = data_train.map(load_images)  # map() applies the given function to every element
data_test = data_test.map(load_images)
BATCH_SIZE = 2
# repeat() repeats the dataset so that it is never exhausted during training, https://blog.csdn.net/seuzhouchenglong/article/details/104047784
# shuffle() shuffles the data
data_train = data_train.repeat().shuffle(30).batch(BATCH_SIZE)
data_test = data_test.batch(BATCH_SIZE)
#### the data above has already been preprocessed
## 1.1 Read the images, shuffle them, and build the dataset; here the validation set is chosen at random
imgs = glob.glob(r'H:\01HTutorWork\3GF2\2DataAndLabel\4Gaofen2\2arcgislLuotian\4zhencaise\finall_imgs_Positive_JPG\*.jpg')
labels = glob.glob(r'H:\01HTutorWork\3GF2\2DataAndLabel\4Gaofen2\2arcgislLuotian\4zhencaise\finall_labels_Positive_PNG\*.png')
# note: the images and labels must correspond one-to-one; in this example they already do,
# but in general you should re-sort both by file name to make sure they are aligned
# shuffle so that the images in a batch are not all of the same class during training;
# strictly speaking semantic segmentation does not need this, since every image already contains
# several classes, but the photos here come from 17 different cities, so they are shuffled anyway
print(len(imgs), len(labels))
index = np.random.permutation(len(imgs))
imgs = np.array(imgs)[index]
labels = np.array(labels)[index]
# check that images and labels are aligned
for i in range(len(imgs)):
    img = imgs[i].split("\\")[-1]
    label = labels[i].split("\\")[-1]
    if img[:-4] != label[:-4]:
        print("Mismatch!" * 1000)
train_number = int(len(imgs) * 0.8)
imgs_val = imgs[train_number:]
labels_val = labels[train_number:]
imgs = imgs[:train_number]
labels = labels[:train_number]
# check that validation images and labels are aligned
for i in range(len(imgs_val)):
    kk_img = imgs_val[i].split("\\")[-1]
    kk_label = labels_val[i].split("\\")[-1]
    if kk_img[:-4] != kk_label[:-4]:
        print("Mismatch!" * 1000)
imgs = glob.glob(r'H:\01HTutorWork\3GF2\3Code\PestDetect3\dataset\train\images\*.jpg')
labels = glob.glob(r'H:\01HTutorWork\3GF2\3Code\PestDetect3\dataset\train\labels\*.png')
## the validation set lives in its own folder
imgs_val = glob.glob(r'H:\01HTutorWork\3GF2\3Code\PestDetect3\dataset\valid\val_images\*.jpg')
labels_val = glob.glob(r'H:\01HTutorWork\3GF2\3Code\PestDetect3\dataset\valid\val_labels\*.png')
# note: the images and labels must correspond one-to-one; in this example they already do,
# but in general you should re-sort both by file name to make sure they are aligned
# shuffle so that the images in a batch are not all of the same class during training;
# strictly speaking semantic segmentation does not need this, since every image already contains
# several classes, but the photos here come from 17 different cities, so they are shuffled anyway
index = np.random.permutation(len(imgs))
imgs = np.array(imgs)[index]
labels = np.array(labels)[index]
# check that images and labels are aligned
for i in range(len(imgs)):
    img = imgs[i].split("\\")[-1]
    label = labels[i].split("\\")[-1]
    if img[:-4] != label[:-4]:
        print("Mismatch!" * 1000)
index = np.random.permutation(len(imgs_val))
imgs_val = np.array(imgs_val)[index]
labels_val = np.array(labels_val)[index]
# check that validation images and labels are aligned
for i in range(len(imgs_val)):
    kk_img = imgs_val[i].split("\\")[-1]
    kk_label = labels_val[i].split("\\")[-1]
    if kk_img[:-4] != kk_label[:-4]:
        print("Mismatch!" * 1000)
# the following two paths are directories
train_img_path = r"H:\01HTutorWork\finall_imgs_Positive_JPG"
train_label_path = r"H:\01HTutorWork\finall_imgs_Positive_PNG"
train_imgs = os.listdir(train_img_path)
train_labels = os.listdir(train_label_path)
train_image_paths = [os.path.join(train_img_path, imgname) for imgname in train_imgs]
train_label_paths = [os.path.join(train_label_path, labelname) for labelname in train_labels]
# set the random seed, then shuffle the dataset
np.random.seed(0)
index = np.random.permutation(len(train_image_paths))
train_image_paths = np.array(train_image_paths)[index].tolist()  # .tolist() converts the array back to a list
train_label_paths = np.array(train_label_paths)[index].tolist()
# if no validation set is given, take 20% of the training set as the validation set
if val_img_path == None and val_label_path == None:
    print("1 splitting a validation set from the training set")
    train_number = int(len(train_image_paths) * 0.8)
    val_image_paths = train_image_paths[train_number:]
    val_label_paths = train_label_paths[train_number:]
    train_image_paths = train_image_paths[:train_number]
    train_label_paths = train_label_paths[:train_number]
else:
    val_images = os.listdir(val_img_path)
    val_labels = os.listdir(val_label_path)
    val_image_paths = [os.path.join(val_img_path, imgname) for imgname in val_images]
    val_label_paths = [os.path.join(val_label_path, imgname) for imgname in val_labels]
# print("2 sizes:", len(train_image_paths), len(train_label_paths), len(val_image_paths), len(val_label_paths))
# check that images and labels are aligned one-to-one
for i in range(len(train_image_paths)):
    img = train_image_paths[i].split("\\")[-1]
    label = train_label_paths[i].split("\\")[-1]
    if img[:-4] != label[:-4]:
        print("Mismatch!" * 1000)
        return None
for i in range(len(val_image_paths)):
    img = val_image_paths[i].split("\\")[-1]
    label = val_label_paths[i].split("\\")[-1]
    if img[:-4] != label[:-4]:
        print("Mismatch!" * 1000)
        return None
Remote Sensing Recognition 10 — the full workflow for multi-band remote sensing image recognition (tf2.0), xiaotiig's blog on CSDN
https://blog.csdn.net/xiaotiig/article/details/122347683
(1) Obtain the data
(2) Check for missing values and replace them
(3) Find the minimum and maximum of the data
(4) Look at the data distribution
(5) Normalize the data
(6) Check that the data and labels correspond one-to-one
(7) Resize the images
(8) Apply image augmentation
(9) Randomly shuffle the images (otherwise a batch may contain only one class, which makes fitting harder); a minimal sketch of a few of these checks follows this list
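To make steps (2) through (6) concrete, here is a minimal sketch of these checks in NumPy. It assumes `images` is a float array of shape [N, H, W, C] and `labels` an integer array of length N; the name inspect_and_normalize is illustrative and does not come from the code above.
import numpy as np

def inspect_and_normalize(images, labels):
    # (2) check for missing values and replace them with 0
    print("number of NaN pixels:", np.isnan(images).sum())
    images = np.nan_to_num(images, nan=0.0)
    # (3) minimum and maximum of the data
    print("min:", images.min(), "max:", images.max())
    # (4) a rough look at the distribution of pixel values
    print("mean:", images.mean(), "std:", images.std())
    # (5) min-max normalization to [0, 1]
    images = (images - images.min()) / (images.max() - images.min() + 1e-8)
    # (6) the data and labels must correspond one-to-one
    assert len(images) == len(labels)
    return images, labels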
def rot90(image, label):
    image = tf.image.rot90(image)
    return image, label

mnist_dataset = mnist_dataset.map(rot90)
for image, label in mnist_dataset:
    plt.title(label.numpy())
    plt.imshow(image.numpy()[:, :, 0])
    plt.show()
mnist_dataset = mnist_dataset.batch(4)
for images, labels in mnist_dataset:
    fig, axs = plt.subplots(1, 4)
    for i in range(4):
        axs[i].set_title(labels.numpy()[i])
        axs[i].imshow(images.numpy()[i, :, :, 0])
    plt.show()
Dataset.shuffle(buffer_size) maintains a buffer of fixed size buffer_size: at initialization, the first buffer_size elements of the dataset are placed into the buffer;
each time an element is requested, one element is sampled uniformly at random from the buffer and returned, and the vacated slot is refilled with the next element from the rest of the dataset, so the buffer keeps its full size. A minimal sketch of this mechanism is shown below.
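The following is a minimal pure-Python sketch of this buffer-based shuffling; the function buffered_shuffle is illustrative only and is not the actual tf.data implementation.
import random

def buffered_shuffle(stream, buffer_size, seed=None):
    rng = random.Random(seed)
    it = iter(stream)
    buffer = []
    # fill the buffer with the first buffer_size elements
    for _ in range(buffer_size):
        try:
            buffer.append(next(it))
        except StopIteration:
            break
    # repeatedly emit a random buffer element and refill its slot from the stream
    while buffer:
        idx = rng.randrange(len(buffer))
        yield buffer[idx]
        try:
            buffer[idx] = next(it)
        except StopIteration:
            buffer.pop(idx)  # stream exhausted: shrink the buffer

# example: shuffle 0..9 with a buffer of size 3
print(list(buffered_shuffle(range(10), buffer_size=3, seed=0)))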
mnist_dataset = mnist_dataset.shuffle(buffer_size=10000).batch(4)
for images, labels in mnist_dataset:
    fig, axs = plt.subplots(1, 4)
    for i in range(4):
        axs[i].set_title(labels.numpy()[i])
        axs[i].imshow(images.numpy()[i, :, :, 0])
    plt.show()
Inspecting the image shape and the maximum and minimum pixel values
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
img = tf.io.read_file(r'H:\test.png')
img = tf.image.decode_png(img)  # decode the image
print("Format of the image from the course example:")
print(img.shape)
img = tf.squeeze(img)  # squeeze the 3-D image down to 2-D by removing the size-1 dimensions
print(img.shape)  # shape of the image
print(img.numpy().max())  # maximum pixel value
np.unique(img.numpy())  # which pixel values actually occur
plt.imshow(img)  # load the image
plt.show()  # display the image
img2 = tf.io.read_file(r'E:\XiaoMa\Bursxylophilus\dataset\SemSegdataset\ceshi\onepng\1_1zx53.png')
img2 = tf.image.decode_png(img2)  # decode the image
print("Format after converting to a grayscale image:")
print('converted to grayscale:', img2.shape)
img2 = tf.squeeze(img2)  # squeeze the 3-D image down to 2-D by removing the size-1 dimensions
print(img2.shape)  # shape of the image
print(img2.numpy().max())  # maximum pixel value
print("which pixel values occur:", np.unique(img2.numpy()))
# which pixel values actually occur
plt.imshow(img2)  # load the image
plt.show()  # display the image
img3 = tf.io.read_file(r'E:\XiaoMa\Bursxylophilus\dataset\SemSegdataset\pngs\1_1zx53.png')
img3 = tf.image.decode_png(img3)  # decode the image
print("Three-channel image:")
print('freshly annotated:', img3.shape)
img3 = tf.squeeze(img3)  # squeeze out any size-1 dimensions
print(img3.shape)  # shape of the image
print(img3.numpy().max())  # maximum pixel value
np.unique(img3.numpy())  # which pixel values actually occur
plt.imshow(img3)  # load the image
plt.show()  # display the image
"""
# img1 = tf.io.read_file(r'E:\XiaoMa\Bursxylophilus\dataset\SemSegdataset\images\1_1zx53.jpg')
img1 = tf.io.read_file(r'E:\XiaoMa\Bursxylophilus\dataset\SemSegdataset\ceshi\images\1_1zx53.jpg')
img1 = tf.image.decode_png(img1)  # decode the image
print(img1.shape)
img2 = tf.io.read_file(r'E:\XiaoMa\Bursxylophilus\dataset\SemSegdataset\ceshi\onepng\1_1zx53.png')
# img2 = tf.image.decode_png(img2)  # decode the image
# print('first decoding:', img2.shape)
img2 = tf.image.decode_png(img2, channels=3)
print('second decoding:', img2.shape)
input_anno = img2 / 128
input_anno = tf.image.resize(input_anno, (224, 224))
plt.figure()
plt.subplot(1, 2, 1)
plt.imshow(input_anno)
plt.subplot(1, 2, 2)
plt.imshow(img2)
plt.show()
"""