Hands-On Machine Learning, 2nd Edition --- Part 8: TensorFlow Data Preprocessing

Data Preprocessing

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K

import numpy as np

X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)  # create a dataset entirely in RAM, yielding the tensors 0 through 9
for i in dataset:
    print(i)
print(dataset,'\n')
'''
Chaining transformations
'''

dataset = dataset.repeat(3).batch(7, drop_remainder=True)  # repeat the items 3 times and group them into batches of 7, dropping the final incomplete batch
for i in dataset:
    print(i)


# Call the map() method to transform the items. For example, this creates a new
# dataset in which every element is doubled
dataset = dataset.map(lambda x: x * 2)  # first batch: [0, 2, 4, 6, 8, 10, 12]
for i in dataset:
    print(i)


# Enabling multithreading is as simple as setting the num_parallel_calls argument
# of map() (see the sketch below); apply() transforms the dataset as a whole
dataset = dataset.apply(tf.data.experimental.unbatch())  # Items: 0, 2, 4, ... (newer TF versions offer dataset.unbatch() directly)
for i in dataset:
    print(i)
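
To illustrate the num_parallel_calls argument mentioned above, here is a minimal sketch of a parallel map (parallel_dataset is just an illustrative name; AUTOTUNE lets TensorFlow pick the number of threads, and newer versions expose it as tf.data.AUTOTUNE):

parallel_dataset = dataset.map(lambda x: x + 1, num_parallel_calls=tf.data.experimental.AUTOTUNE)
for i in parallel_dataset.take(3):
    print(i)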


# Filter the dataset with the filter() method
# To look at just a few items, use the take() method
dataset = dataset.filter(lambda x: x < 10)  # Items: 0 2 4 6 8 0 2 4 6 ...
for i in dataset.take(3):  # look at the first three items
    print(i)


# Shuffle with a buffer of size 5 and a random seed of 42
dataset = tf.data.Dataset.range(10).repeat(3).shuffle(buffer_size=5, seed=42).batch(7)
for i in dataset:
    print(i)

# Preprocessing function for CSV lines
n_inputs = 8  # number of input features (California housing)
# Placeholder statistics: in practice, compute the per-feature means and standard
# deviations from the training set (e.g., sklearn's StandardScaler gives scaler.mean_ and scaler.scale_)
X_mean, X_std = np.zeros(n_inputs, dtype=np.float32), np.ones(n_inputs, dtype=np.float32)
# Parse one CSV line and scale the input features
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]  # per-column defaults; the empty constant makes the target column required
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y
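
A quick sanity check of preprocess() on a single made-up CSV line (8 feature values plus the target in the last column; the numbers are illustrative):

print(preprocess(b'4.2083,44.0,5.3275,0.98,1537.0,3.19,37.47,-122.2,2.782'))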

'''
Build a dataset from CSV files: per-line preprocessing, shuffling,
optional repetition, and batching
'''
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None,
                       shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)  # dataset of the matching file paths
    # Interleave lines from n_readers files at a time; skip(1) drops each file's
    # header row, and num_parallel_calls enables multithreaded reading
    dataset = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                 cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)  # prefetch one batch while the current one is being consumed



if __name__ == '__main__':
    dataset = csv_reader_dataset('./housing.csv')  # assumes housing.csv is all-numeric with a header row
    print(dataset)
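
A sketch of how such a dataset plugs into Keras training. The file patterns train_*.csv and valid_*.csv and steps_per_epoch=100 are hypothetical placeholders; repeat=None makes the training set repeat indefinitely:

train_set = csv_reader_dataset('./train_*.csv', repeat=None)
valid_set = csv_reader_dataset('./valid_*.csv')
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[n_inputs]),
    keras.layers.Dense(1),
])
model.compile(loss="mse", optimizer="sgd")
model.fit(train_set, steps_per_epoch=100, epochs=10, validation_data=valid_set)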

Preprocessing the Input Features

# !/usr/bin/python
# -*- coding: utf-8 -*-
# @Time    : 2021/9/12 20:31
# @Author  : 郑浩鑫
# @Email   : [email protected]
# @File    : preprocess.py
# @Software: PyCharm
'''
13.3 Preprocessing the input features
'''
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def test():
    fashion_mnist = keras.datasets.fashion_mnist
    (X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()
    print(X_train_full.shape,X_test.dtype)
    '''
    Create a validation set and scale pixel intensities down to the 0-1 range
    '''
    X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
    y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

    class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
                   "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
    print(class_names[y_train[100]])

    # Standardize the inputs with a Lambda layer: subtract the feature means and
    # divide by the standard deviations (epsilon avoids division by zero)
    means = np.mean(X_train, axis=0, keepdims=True)
    stds = np.std(X_train, axis=0, keepdims=True)
    eps = keras.backend.epsilon()
    model = keras.models.Sequential([keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps))])

'''
Data preprocessing: a custom standardization layer
'''
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        # Learn the per-feature means and standard deviations from a data sample
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        # Standardize using the statistics learned in adapt()
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())
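
A minimal usage sketch of this layer (random data stands in for a real training sample; the commented-out block in __main__ below does the same on Fashion MNIST):

std_layer = Standardization()
std_layer.adapt(np.random.rand(100, 8).astype(np.float32))  # stand-in for a real data sample
model = keras.Sequential()
model.add(std_layer)
# ... then add the rest of the model, compile, and fit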



'''
One-hot encoding
'''
def onehot_encode():
    '''
    Map each category to its index (0 to 4)
    :return:
    '''
    vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]  # define the vocabulary
    indices = tf.range(len(vocab), dtype=tf.int64)  # tensor with the corresponding indices (0 to 4)
    table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)  # initializer for the lookup table, mapping each category to its index
    num_oov_buckets = 2  # out-of-vocabulary buckets for categories not in the list
    table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

    # Test the table on a few categories, including an unknown one ("DESERT")
    categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
    cat_indices = table.lookup(categories)
    cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)  # depth must be the total number of indices: len(vocab) + num_oov_buckets
    print(cat_one_hot)
'''
As a rule of thumb, if there are fewer than 10 categories, one-hot encoding is
usually the way to go (but the exact numbers may vary!). If there are more than
50 categories (which is typically the case when hash buckets are used), it is
usually preferable to use embeddings. Between 10 and 50 categories, you may
want to experiment with both approaches and see which works best for you.
'''

'''
Encoding categorical features with embeddings (a sketch follows below)
'''
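
A minimal sketch of trainable embeddings, reusing the same vocabulary and lookup table as in the one-hot section; embedding_dim = 2 is an arbitrary choice for illustration:

def embedding_encoding():
    vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
    indices = tf.range(len(vocab), dtype=tf.int64)
    table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
    num_oov_buckets = 2
    table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

    embedding_dim = 2  # dimensionality of each embedding vector
    # One trainable row per known category plus one per OOV bucket
    embedding_matrix = tf.Variable(
        tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim]))

    categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
    cat_indices = table.lookup(categories)
    print(tf.nn.embedding_lookup(embedding_matrix, cat_indices))  # one vector per category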


if __name__ == '__main__':
    # fashion_mnist = keras.datasets.fashion_mnist
    # (X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()
    #
    # '''
    # Create a validation set and scale pixel intensities to the 0-1 range
    # '''
    # X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
    # y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
    # '''
    # Data preprocessing
    # '''
    # data_sample = X_train
    # std_layer = Standardization()
    # std_layer.adapt(data_sample)
    #
    # model = keras.Sequential()
    # model.add(std_layer)
    # # [...]  # create the rest of the model
    # # model.compile([...])
    # # model.fit([...])

    onehot_encode()
    embedding_encoding()




TFRecord Files

# !/usr/bin/python
# -*- coding: utf-8 -*-
# @Time    : 2021/9/12 17:25
# @Author  : 郑浩鑫
# @Email   : [email protected]
# @File    : TFrecorda.py
# @Software: PyCharm

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K

# from tensorflow.train import BytesList, FloatList, Int64List
# from tensorflow.train import Feature, Features, Example

'''
The TFRecord format is TensorFlow's preferred format for storing large amounts
of data and reading it efficiently. It is a very simple binary format that just
contains a sequence of binary records of varying sizes (each record is composed
of a length, a CRC checksum to verify that the length was not corrupted, the
actual data, and finally a CRC checksum for the data).
'''
# Write a TFRecord file, then read it back as a dataset
def readTFRecord():
    with tf.io.TFRecordWriter("my_data.tfrecord") as f:
        f.write(b"This is the first record")
        f.write(b"And this is the second record")


    '''
    Read one or more TFRecord files
    '''
    filepaths = ["my_data.tfrecord"]
    dataset = tf.data.TFRecordDataset(filepaths)
    for item in dataset:
        print(item)


'''
Compressed TFRecord files
'''
def readTFRecord_zip():
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
        f.write(b"This is the first record")#b字节
        f.write(b"And this is the second record")


    dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],compression_type="GZIP")
    for item in dataset:
        print(item)


# Create an Example protobuf representing one person ("value" holds the data for each feature)
person_example = tf.train.Example(features=tf.train.Features(feature={
    "name": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"Alice"])),
    "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[123])),
    "emails": tf.train.Feature(bytes_list=tf.train.BytesList(
        value=[b"[email protected]", b"[email protected]"])),
}))

with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())  # serialize the protobuf to a byte string

'''
Loading and parsing Examples.
Fixed-length features are parsed as regular tensors, while variable-length
features are parsed as sparse tensors.
'''
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}
for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)  # parse each Example
    # Convert the sparse "emails" tensor to a dense one
    emails = tf.sparse.to_dense(parsed_example["emails"], default_value=b"")
    print(parsed_example, emails)


'''
Parse the records one batch at a time
'''
dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(10)
for serialized_examples in dataset:
    parsed_examples = tf.io.parse_example(serialized_examples, feature_description)
    print(parsed_examples)


'''
The SequenceExample protobuf handles lists of lists
'''
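
A minimal sketch of building and parsing one (the author_id context feature and the "content" feature list are illustrative names, not fixed by the API):

context = tf.train.Features(feature={
    "author_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[123])),
})
content = [["When", "shall", "we", "three", "meet", "again", "?"],
           ["In", "thunder", ",", "lightning", ",", "or", "in", "rain", "?"]]
def words_to_feature(words):
    # One Feature holding all the words of a sentence as byte strings
    return tf.train.Feature(bytes_list=tf.train.BytesList(
        value=[w.encode("utf-8") for w in words]))
sequence_example = tf.train.SequenceExample(
    context=context,
    feature_lists=tf.train.FeatureLists(feature_list={
        "content": tf.train.FeatureList(
            feature=[words_to_feature(s) for s in content]),
    }))
serialized = sequence_example.SerializeToString()

# Context features and feature lists get separate descriptions; each
# variable-length sentence is parsed as a sparse tensor
context_desc = {"author_id": tf.io.FixedLenFeature([], tf.int64, default_value=0)}
sequence_desc = {"content": tf.io.VarLenFeature(tf.string)}
parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized, context_desc, sequence_desc)
print(parsed_context["author_id"])
print(tf.RaggedTensor.from_sparse(parsed_feature_lists["content"]))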

