把一大堆不同格式的数据进行统一处理。
为了高效地读取数据,可以将数据进行序列化存储,这样也便于网络流式读取数据。TFRecord是一种比较常用的存储二进制序列数据的方法。
tf.Example类是一种将数据表示为{"string": value}形式的message类型,TensorFlow经常使用tf.Example来写入、读取TFRecord数据。
通常情况下,tf.Example中可以使用以下几种格式:
tf.train.BytesList: 可以使用的类型包括 string和byte
tf.train.FloatList: 可以使用的类型包括 float和double
tf.train.Int64List: 可以使用的类型包括 enum,bool, int32, uint32, int64
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``.

    Args:
        value: A bytes value, or a tensor of the type ``tf.constant``
            returns, in which case its ``.numpy()`` value is stored.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds the one value.
    """
    # BytesList won't unpack a string from an EagerTensor, so extract the
    # underlying value before wrapping it.
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)
def _float_feature(value):
    """Wrap a single float/double in a ``tf.train.Feature``.

    Args:
        value: The number to store.

    Returns:
        A ``tf.train.Feature`` whose ``float_list`` holds the one value.
    """
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)
def _int64_feature(value):
    """Wrap a single bool/enum/int/uint in a ``tf.train.Feature``.

    Args:
        value: The integer-like value to store.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds the one value.
    """
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)
# tf.train.BytesList: a bytes literal and an encoded str hold the same bytes.
for sample in (b'test_string', 'test_string'.encode('utf8')):
    print(_bytes_feature(sample))
# tf.train.FloatList
print(_float_feature(np.exp(1)))
# tf.train.Int64List: exercised with a bool and with an int.
for sample in (True, 1):
    print(_int64_feature(sample))
def serialize_example(feature0, feature1, feature2, feature3):
    """Build a ``tf.train.Example`` from four values and serialize it.

    Args:
        feature0: Stored as an int64 feature under the key ``'feature0'``.
        feature1: Stored as an int64 feature under the key ``'feature1'``.
        feature2: Stored as a bytes feature under the key ``'feature2'``.
        feature3: Stored as a float feature under the key ``'feature3'``.

    Returns:
        The binary string produced by ``Example.SerializeToString()``.
    """
    # Convert each value to the feature type that matches it.
    features = tf.train.Features(feature={
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    })
    # Wrap the features in an Example message and emit its binary form.
    return tf.train.Example(features=features).SerializeToString()
# Number of observations to generate.
n_observations = int(1e4)
# Boolean feature.
feature0 = np.random.choice([False, True], n_observations)
# Integer feature.
feature1 = np.random.randint(0, 5, n_observations)
# String feature, looked up from the integer feature.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]
# Float feature.
feature3 = np.random.randn(n_observations)
filename = 'tfrecord'  # file the serialized examples are written to
with tf.io.TFRecordWriter(filename) as writer:
    # Walk the four feature arrays in lockstep, one record per observation.
    for row in zip(feature0, feature1, feature2, feature3):
        writer.write(serialize_example(*row))
filenames = [filename]
# Read the records back as a dataset of serialized examples.
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset
import os
import glob
from datetime import datetime
import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
image_path = './数据增强/input/input2/'
images = glob.glob(image_path + '*.jpg')
# Display every JPEG found in the input directory, one figure per image.
# NOTE(review): the source lost its indentation; all five statements below
# are assumed to belong to the loop body -- confirm against the notebook.
for fname in images:
    image = mpimg.imread(fname)
    f, ax1 = plt.subplots(1, 1, figsize=(8, 8))
    f.subplots_adjust(hspace=.2, wspace=.05)
    ax1.imshow(image)
    ax1.set_title('Image', fontsize=20)
# Class name -> integer label.
image_labels = {
    'dog': 0,
    'pear': 1,
}
# Read the encoded image bytes in binary mode. The context manager closes
# the file handle, which the previous bare open(...).read() never did.
with open('./数据增强/input/input2/dog.jpg', 'rb') as image_file:
    image_string = image_file.read()
label = image_labels['dog']
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``.

    Args:
        value: A bytes value, or a tensor of the type ``tf.constant``
            returns, in which case its ``.numpy()`` value is stored.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds the one value.
    """
    # BytesList won't unpack a string from an EagerTensor, so extract the
    # underlying value before wrapping it.
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)
def _float_feature(value):
    """Wrap a single float/double in a ``tf.train.Feature``.

    Args:
        value: The number to store.

    Returns:
        A ``tf.train.Feature`` whose ``float_list`` holds the one value.
    """
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)
def _int64_feature(value):
    """Wrap a single bool/enum/int/uint in a ``tf.train.Feature``.

    Args:
        value: The integer-like value to store.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds the one value.
    """
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)
# Build the Example message for one image.
def image_example(image_string, label):
    """Create a ``tf.train.Example`` describing one encoded image.

    The image is decoded only to read its shape; the stored payload is
    the original encoded bytes.

    Args:
        image_string: Encoded image bytes, decoded with
            ``tf.image.decode_jpeg`` to obtain the shape.
        label: Integer label stored under the key ``'label'``.

    Returns:
        A ``tf.train.Example`` with the features ``'height'``, ``'width'``,
        ``'depth'``, ``'label'`` and ``'image_raw'``.
    """
    shape = tf.image.decode_jpeg(image_string).shape
    height, width, depth = shape[0], shape[1], shape[2]
    features = tf.train.Features(feature={
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image_string),
    })
    return tf.train.Example(features=features)
image_example_proto = image_example(image_string, label)
# Show only the first 15 lines of the text form of the message, followed
# by an ellipsis marker.
proto_lines = str(image_example_proto).split('\n')
for line in proto_lines[:15]:
    print(line)
print('...')
# Build `images.tfrecord` from every JPEG in the input directory.
image_path = './数据增强/input/input2/'
images = glob.glob(image_path + '*.jpg')
record_file = 'images.tfrecord'
counter = 0
with tf.io.TFRecordWriter(record_file) as writer:
    for fname in images:
        with open(fname, 'rb') as f:
            image_string = f.read()
        # The class name is the file name without its extension. splitext
        # strips only the final extension, whereas replace('.jpg', '')
        # removed every occurrence of that substring and left other
        # extension casings (e.g. '.JPG') in place.
        class_name = os.path.splitext(os.path.basename(fname))[0]
        # A file whose name is not a key of image_labels raises KeyError.
        label = image_labels[class_name]
        # Build the `tf.train.Example` for this image.
        tf_example = image_example(image_string, label)
        # Serialize the example to a binary string and append it to the file.
        writer.write(tf_example.SerializeToString())
        counter += 1
        print('Processed {:d} of {:d} images.'.format(
            counter, len(images)))
print(' Wrote {} images to {}'.format(counter, record_file))
# Load the TFRecord file written above.
raw_train_dataset = tf.data.TFRecordDataset('images.tfrecord')
raw_train_dataset
# The parsing schema must match the features used when the examples were
# created: four scalar int64 fields plus the encoded image bytes.
image_feature_description = {
    key: tf.io.FixedLenFeature([], tf.int64)
    for key in ('height', 'width', 'depth', 'label')
}
image_feature_description['image_raw'] = tf.io.FixedLenFeature([], tf.string)
def parse_tf_example(example_proto):
    """Parse one serialized example into an ``(image, label)`` pair.

    This handles a single example only. It is parsed with the
    module-level ``image_feature_description`` schema.

    Args:
        example_proto: One serialized ``tf.train.Example``.

    Returns:
        A tuple ``(x_train, y_train)``: the image decoded with 3 channels,
        resized to 416x416 and divided by 255, and the parsed ``'label'``
        value.
    """
    # Parse the record according to the schema.
    parsed = tf.io.parse_single_example(example_proto, image_feature_description)
    # Preprocess: decode, resize, then scale the pixel values.
    decoded = tf.image.decode_jpeg(parsed['image_raw'], channels=3)
    resized = tf.image.resize(decoded, (416, 416))
    return resized / 255., parsed['label']
# parse_tf_example handles a single example; Dataset.map takes a function
# and applies it to every element of the dataset.
train_dataset = raw_train_dataset.map(parse_tf_example)
train_dataset
num_epochs = 10
# Shuffle, group into batches of 2, then repeat the whole pipeline
# num_epochs times for a quick training run.
train_ds = (
    train_dataset
    .shuffle(buffer_size=10000)
    .batch(2)
    .repeat(num_epochs)
)
train_ds
# Print the index, image-batch shape and labels of every batch.
for batch, (x, y) in enumerate(train_ds):
    print(batch, x.shape, y)
# Start training: flatten each image and classify it with a 2-unit
# softmax layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
# NOTE(review): train_ds is already built with .repeat(num_epochs), so
# passing epochs=num_epochs here iterates that repeated dataset num_epochs
# times -- confirm this is intended.
model.fit(train_ds, epochs=num_epochs)