Data preprocessing
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import numpy as np

X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)  # create a dataset entirely in RAM, holding the tensors 0 through 9
for i in dataset:
    print(i)
print(dataset, '\n')
'''
Chaining transformations
'''
dataset = dataset.repeat(3).batch(7, drop_remainder=True)  # repeat the items 3 times, then group them into batches of 7 (dropping the final incomplete batch)
for i in dataset:
    print(i)
# Call the map() method to transform the items. For example, this creates a new
# dataset whose items are all doubled
dataset = dataset.map(lambda x: x * 2)  # Items: [0,2,4,6,8,10,12], ...
for i in dataset:
    print(i)
# Using multiple threads is as simple as setting the num_parallel_calls argument
# The map() method is applied to each item as a whole (here, to each batch)
dataset = dataset.apply(tf.data.experimental.unbatch())  # Items: 0,2,4,...
# (in recent TensorFlow versions you can call dataset.unbatch() directly)
for i in dataset:
    print(i)
# The filter() method makes it easy to filter the dataset
# To look at just a few items, use the take() method
dataset = dataset.filter(lambda x: x < 10)  # Items: 0 2 4 6 8 0 2 4 6...
for i in dataset.take(3):  # look at the first three items
    print(i)
# Shuffle, using a buffer of size 5 and a random seed of 42
dataset = tf.data.Dataset.range(10).repeat(3).shuffle(buffer_size=5, seed=42).batch(7)
for i in dataset:
    print(i)
# Preprocessing function
n_inputs = 8
# Placeholder statistics: in practice X_mean and X_std should hold the
# per-feature means and standard deviations, computed on the training set
X_mean, X_std = [[111222]], [[111222]]

# Perform feature scaling
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y
'''
Reads the CSV files, preprocesses each line, shuffles the data, optionally
repeats it, and batches the result
'''
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None,
                       shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)  # dataset of the file paths matching the pattern
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),  # skip each file's header row
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads)  # num_parallel_calls enables multithreaded reading
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)  # prefetch one batch ahead

if __name__ == '__main__':
    dataset = csv_reader_dataset('./housing.csv')
    print(dataset)
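A minimal sketch of feeding the resulting dataset to a Keras model, assuming housing.csv holds the 8 scaled input features plus the target in its last column (the layer sizes, step count, and epoch count are illustrative choices):
train_set = csv_reader_dataset('./housing.csv', repeat=None)  # repeat=None repeats indefinitely
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[n_inputs]),
    keras.layers.Dense(1),
])
model.compile(loss="mse", optimizer="sgd")
# with an infinite dataset, steps_per_epoch tells fit() where each epoch ends
model.fit(train_set, steps_per_epoch=100, epochs=10)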
Preprocessing the input features
'''
13.3 Preprocessing the input features
'''
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def test():
    fashion_mnist = keras.datasets.fashion_mnist
    (X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()
    print(X_train_full.shape, X_test.dtype)
    '''
    Create a validation set and scale the pixel intensities down to the 0-1 range
    '''
    X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
    y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
    class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
                   "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
    print(class_names[y_train[100]])
    # standardize the inputs with a Lambda layer, using statistics from the training set
    means = np.mean(X_train, axis=0, keepdims=True)
    stds = np.std(X_train, axis=0, keepdims=True)
    eps = keras.backend.epsilon()
    model = keras.models.Sequential([
        keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps))
    ])
'''
A custom standardization layer: call adapt() on a data sample before training
so the layer can learn the feature means and standard deviations
'''
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())
'''
One-hot encoding
'''
def onehot_encode():
    '''
    Map each category to its index (0 to 4)
    '''
    vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]  # define the vocabulary
    indices = tf.range(len(vocab), dtype=tf.int64)  # tensor of the corresponding indices (0 to 4)
    table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)  # initializer for the lookup table, given the categories and their indices
    num_oov_buckets = 2  # buckets for values that are not in the vocabulary
    table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
    # test it
    categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
    cat_indices = table.lookup(categories)
    cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)  # the depth must be the total number of indices: len(vocab) + num_oov_buckets
    print(cat_one_hot)
'''
As a rule of thumb, if there are fewer than 10 categories, one-hot encoding is
generally the way to go (but the exact numbers may vary!). If there are more
than 50 categories (in which case hash buckets are usually needed), embeddings
are usually preferable. With 10 to 50 categories, you may want to try both
approaches and see which works best for you.
'''
'''
Encoding categorical features with embeddings (a sketch follows the stub below)
'''
def Embedded_coding():
    pass
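'''
A minimal sketch of embedding-based encoding, reusing the vocabulary and lookup
table from onehot_encode(); the function name and the embedding dimension of 2
are illustrative choices, not fixed by the text
'''
def embedding_encode():
    vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
    indices = tf.range(len(vocab), dtype=tf.int64)
    table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
    num_oov_buckets = 2
    table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
    # one trainable embedding vector per index (including the OOV buckets)
    embedding = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                       output_dim=2)
    categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
    cat_indices = table.lookup(categories)  # look up each category's index
    print(embedding(cat_indices))  # one embedding vector per category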
if __name__ == '__main__':
    # fashion_mnist = keras.datasets.fashion_mnist
    # (X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()
    #
    # # Create a validation set and scale the pixel intensities down to the 0-1 range
    # X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
    # y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
    #
    # # Adapt the Standardization layer to a data sample before using it
    # data_sample = X_train
    # std_layer = Standardization()
    # std_layer.adapt(data_sample)
    #
    # model = keras.Sequential()
    # model.add(std_layer)
    # # [...] # create the rest of the model
    # # model.compile([...])
    # # model.fit([...])
    onehot_encode()
TFRecord files
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
# from tensorflow.train import BytesList, FloatList, Int64List
# from tensorflow.train import Feature, Features, Example
'''
The TFRecord format is TensorFlow's preferred format for storing large amounts
of data and reading it efficiently. It is a very simple binary format that just
contains a sequence of binary records of varying sizes (each record is composed
of a length, a CRC checksum to verify that the length was not corrupted, the
actual data, and finally a CRC checksum for the data).
'''
# Create a TFRecord file and read it back
def readTFRecord():
    with tf.io.TFRecordWriter("my_data.tfrecord") as f:
        f.write(b"This is the first record")
        f.write(b"And this is the second record")
    '''
    Read one or more TFRecord files (a parallel-reading sketch follows the function)
    '''
    filepaths = ["my_data.tfrecord"]
    dataset = tf.data.TFRecordDataset(filepaths)
    for item in dataset:
        print(item)
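'''
When there are several TFRecord files, TFRecordDataset can read them in
parallel and interleave their records via num_parallel_reads; a sketch,
assuming hypothetical files file1.tfrecord and file2.tfrecord exist:
'''
# dataset = tf.data.TFRecordDataset(["file1.tfrecord", "file2.tfrecord"],
#                                   num_parallel_reads=2)  # read 2 files at a time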
'''
Compressed TFRecord files
'''
def readTFRecord_zip():
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
        f.write(b"This is the first record")  # b: a bytes literal
        f.write(b"And this is the second record")
    dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
                                      compression_type="GZIP")
    for item in dataset:
        print(item)

# An Example protobuf representing a person; each Feature's value field holds the data
person_example = tf.train.Example(features=tf.train.Features(feature={
    "name": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"Alice"])),
    "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[123])),
    "emails": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"a@b.com",
                                                                    b"c@d.com"])),
}))
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())  # serialize the protobuf to a byte string
'''
Loading and parsing Examples
Fixed-length features are parsed as regular tensors, while variable-length
features are parsed as sparse tensors
'''
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}
for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example,
                                                feature_description)  # parse one Example at a time
    # convert the sparse tensor to a dense tensor
    tf.sparse.to_dense(parsed_example["emails"], default_value=b"")
    print(parsed_example)
'''
Parse the Examples one batch at a time
'''
dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(10)
for serialized_examples in dataset:
    parsed_examples = tf.io.parse_example(serialized_examples, feature_description)
    print(parsed_examples)
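'''
The batch parsing above can also live inside the pipeline itself; a minimal
sketch that maps tf.io.parse_example over each batch of serialized records:
'''
dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(10)
dataset = dataset.map(lambda serialized: tf.io.parse_example(serialized, feature_description))
for parsed in dataset:
    print(parsed["name"])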
'''
The SequenceExample protobuf handles lists of lists
'''
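'''
A minimal sketch of building and parsing a SequenceExample; the author_id
context feature and the content word lists are illustrative placeholders.
Context features are parsed like a regular Example, while each feature list
comes back as a sparse tensor that can be converted to a ragged tensor
'''
context = tf.train.Features(feature={
    "author_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[123])),
})
content = tf.train.FeatureList(feature=[
    tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"When", b"shall", b"we", b"meet"])),
    tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"again", b"?"])),
])
sequence_example = tf.train.SequenceExample(
    context=context,
    feature_lists=tf.train.FeatureLists(feature_list={"content": content}))
serialized = sequence_example.SerializeToString()

context_descriptions = {"author_id": tf.io.FixedLenFeature([], tf.int64, default_value=0)}
sequence_descriptions = {"content": tf.io.VarLenFeature(tf.string)}
parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized, context_descriptions, sequence_descriptions)
print(parsed_context["author_id"])
print(tf.RaggedTensor.from_sparse(parsed_feature_lists["content"]))  # the lists of lists, as a ragged tensor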