TFT(tensorflow_transform)中使用tensorflow_transform.beam的预处理操作

目录

  • 下载数据集
  • 定义数据集常量
  • 转换example函数
  • 预处理常量
  • 预处理
  • 查看转换后数据
  • 模型建立、训练、评估
  • 测试预处理函数
  • 测试tft函数(可直接使用的)

import math
import os
import pprint
import tempfile
import pathlib

import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import apache_beam as beam
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

from tfx_bsl.public import tfxio
from tfx_bsl.coders.example_coder import RecordBatchToExamplesEncoder
2023-06-26 23:55:49.730378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 23:55:50.561378: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:55:50.561477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:55:50.561486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.

下载数据集

# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before calling `urllib.request.urlretrieve`.
import urllib.request

train_url_path = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets/census/adult.data'
test_url_path = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets/census/adult.test'
train_path = os.path.join('./test3','adult.data')
test_path  = os.path.join('./test3/','adult.test')

# urlretrieve does not create missing parent directories; make sure the
# download folder exists so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(train_path), exist_ok=True)

urllib.request.urlretrieve(train_url_path,train_path)
urllib.request.urlretrieve(test_url_path,test_path)
('./test3/adult.test', )

定义数据集常量

# Columns parsed as strings (tf.string) in the raw feature spec.
CATEGORICAL_FEATURE_KEYS = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns parsed as floats (tf.float32) in the raw feature spec.
NUMERIC_FEATURE_KEYS = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'education-num'
]

# Column order of the CSV files. 'fnlwgt' appears here but in neither feature
# list above, so it is read from the CSV but not used as a feature.
ORDERED_CSV_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label'
]

# Name of the target column.
LABEL_KEY = 'label'
# The training CSV has no header row, so column names are supplied explicitly.
pandas_train = pd.read_csv(train_path,header=None,names=ORDERED_CSV_COLUMNS)
pandas_train.head(5)
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country label
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
# One default per CSV column, derived from a sample training row:
# string cells get '' and every other cell gets 0.0.
COLUMN_DEFAULTS = list(map(
    lambda cell: '' if isinstance(cell, str) else 0.0,
    pandas_train.loc[1],
))
COLUMN_DEFAULTS
[0.0, '', 0.0, '', 0.0, '', '', '', '', '', 0.0, 0.0, 0.0, '', '']
# Load the test split with the same column names as the training split.
# NOTE(review): with `names=` supplied, `header=1` makes pandas treat line
# index 1 as a header row and start the data at line index 2, so the first
# real record after the junk first line is presumably dropped as well.
# `skiprows=1, header=None` would keep it -- confirm before changing, since
# NUM_TEST_INSTANCES is derived from len(pandas_test).
pandas_test = pd.read_csv(test_path, header=1, names=ORDERED_CSV_COLUMNS)

pandas_test.head(5)
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country label
0 38 Private 89814 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K.
1 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K.
2 44 Private 160323 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K.
3 18 ? 103497 Some-college 10 Never-married ? Own-child White Female 0 0 30 United-States <=50K.
4 34 Private 198693 10th 6 Never-married Other-service Not-in-family White Male 0 0 30 United-States <=50K.
# Parsing spec for the raw (untransformed) data: categorical columns and the
# label are scalar strings, numeric columns are scalar float32 values.
RAW_DATA_FEATURE_SPEC = {
    **{key: tf.io.FixedLenFeature([], tf.string)
       for key in CATEGORICAL_FEATURE_KEYS},
    **{key: tf.io.FixedLenFeature([], tf.float32)
       for key in NUMERIC_FEATURE_KEYS},
    LABEL_KEY: tf.io.FixedLenFeature([], tf.string),
}
# The Schema is obtained here from the hand-written feature spec; it could
# alternatively be inferred automatically with tfdv.infer_schema.
SCHEMA = tft.DatasetMetadata.from_feature_spec(RAW_DATA_FEATURE_SPEC).schema

转换example函数

from typing import List,Union,Optional
from tensorflow_metadata.proto.v0 import schema_pb2
import pandas as pd
import tensorflow_data_validation as tfdv
import numpy as np
def create_example_by_schema_from_dataframe(row:pd.Series,column_names:List[str],schema_or_schemapath:Union[str,schema_pb2.Schema]):
    """Convert one row of data into a ``tf.train.Example`` guided by a Schema.

    Each requested column is looked up in the Schema and stored as a
    single-element bytes / int64 / float list according to its declared
    feature type. Missing cells (``pd.isna``) are replaced by ``b''``, ``0``
    or ``0.0`` respectively. Columns whose Schema type is none of
    BYTES / INT / FLOAT are left out of the Example.

    Args:
        row: One row of data as a ``pd.Series`` indexed by column name.
        column_names: Names of the columns to convert.
        schema_or_schemapath: A Schema instance, or the path to a text-format
            schema file (i.e. the ``schema.pbtxt`` file itself).

    Returns:
        The ``tf.train.Example`` proto (not serialized; call
        ``SerializeToString()`` on it when bytes are needed).
    """
    if isinstance(schema_or_schemapath, str):
        schema_or_schemapath = tfdv.load_schema_text(schema_or_schemapath)

    features = {}
    for column_name in column_names:
        type_code = tfdv.get_feature(schema_or_schemapath, column_name).type
        if type_code == schema_pb2.BYTES:
            cell = row[column_name]
            if pd.isna(cell):
                value = b''
            elif isinstance(cell, bytes):
                # Rows built from decoded tensors already hold bytes, which
                # have no .encode(); pass them through unchanged.
                value = cell
            else:
                value = cell.encode()
            features[column_name] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[value]))
        elif type_code == schema_pb2.INT:
            cell = row[column_name]
            value = 0 if pd.isna(cell) else int(cell)
            features[column_name] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[value]))
        elif type_code == schema_pb2.FLOAT:
            cell = row[column_name]
            value = 0.0 if pd.isna(cell) else float(cell)
            features[column_name] = tf.train.Feature(
                float_list=tf.train.FloatList(value=[value]))
    return tf.train.Example(features=tf.train.Features(feature=features))

def create_feature_spec_by_schema_from_columnnames(column_names:List[str],schema_or_schemapath:Union[str,schema_pb2.Schema],all_num_to_float:bool=False):
    """Build a parsing feature spec for the requested columns from a Schema.

    Every supported column becomes a scalar ``tf.io.FixedLenFeature``:
    BYTES -> ``tf.string``, FLOAT -> ``tf.float32`` and INT -> ``tf.int64``
    (or ``tf.float32`` when ``all_num_to_float`` is set). Columns of any other
    Schema type are omitted from the result.

    Args:
        column_names: Names of the columns to include.
        schema_or_schemapath: A Schema instance, or the path to a text-format
            schema file (i.e. the ``schema.pbtxt`` file itself).
        all_num_to_float: Whether integer columns should be declared as
            ``tf.float32`` instead of ``tf.int64``.

    Returns:
        A dict mapping column name to its ``FixedLenFeature``.
    """
    schema = schema_or_schemapath
    if isinstance(schema, str):
        schema = tfdv.load_schema_text(schema)

    # Dtype chosen for each supported Schema feature type.
    dtype_for_type = {
        schema_pb2.BYTES: tf.string,
        schema_pb2.INT: tf.float32 if all_num_to_float else tf.int64,
        schema_pb2.FLOAT: tf.float32,
    }

    output_dict = {}
    for name in column_names:
        dtype = dtype_for_type.get(tfdv.get_feature(schema, name).type)
        if dtype is not None:
            output_dict[name] = tf.io.FixedLenFeature([], dtype)
    return output_dict
# Build an Example from the first training row (numeric + categorical columns
# only) and inspect how the 'age' value was stored.
tf_example = create_example_by_schema_from_dataframe(pandas_train.loc[0],NUMERIC_FEATURE_KEYS+CATEGORICAL_FEATURE_KEYS,SCHEMA)
tf_example.features.feature['age']
float_list {
  value: 39.0
}
# Derive the matching parsing spec for the same columns from the Schema.
created_feature_spec=create_feature_spec_by_schema_from_columnnames(NUMERIC_FEATURE_KEYS+CATEGORICAL_FEATURE_KEYS,SCHEMA)
created_feature_spec
{'age': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-gain': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-loss': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'hours-per-week': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'education-num': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'workclass': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'education': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'marital-status': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'occupation': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'relationship': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'race': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'sex': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'native-country': FixedLenFeature(shape=[], dtype=tf.string, default_value=None)}
# Round-trip check: serialize the Example and parse it back with the
# generated feature spec.
decoded_tensor = tf.io.parse_single_example(
    tf_example.SerializeToString(),
    features=created_feature_spec
)
decoded_tensor
2023-06-26 23:55:59.068527: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 23:55:59.099139: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:55:59.099171: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-26 23:55:59.099783: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.





{'age': ,
 'capital-gain': ,
 'capital-loss': ,
 'education': ,
 'education-num': ,
 'hours-per-week': ,
 'marital-status': ,
 'native-country': ,
 'occupation': ,
 'race': ,
 'relationship': ,
 'sex': ,
 'workclass': }
'marital-status' in tf_example.features.feature.keys()
True

预处理常量

# Number of out-of-vocabulary buckets passed to the vocabulary lookups.
NUM_OOV_BUCKETS = 1

# Each pass over the training data is split into EPOCH_SPLITS Keras "epochs".
EPOCH_SPLITS = 10
TRAIN_NUM_EPOCHS = 2*EPOCH_SPLITS
NUM_TRAIN_INSTANCES = len(pandas_train)
NUM_TEST_INSTANCES = len(pandas_test)

BATCH_SIZE = 128

# Step counts are plain Python ints: math.ceil returns an int, whereas
# tf.math.ceil would produce a float32 tensor for these arguments.
STEPS_PER_TRAIN_EPOCH = math.ceil(NUM_TRAIN_INSTANCES/BATCH_SIZE/EPOCH_SPLITS)
EVALUATION_STEPS = math.ceil(NUM_TEST_INSTANCES/BATCH_SIZE)

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
EXPORTED_MODEL_DIR = 'exported_model_dir'

预处理

def preprocessing_fn(inputs):
    """tf.Transform preprocessing function for the census data.

    Starts from a copy of ``inputs`` and overwrites:
      * every key in NUMERIC_FEATURE_KEYS with ``tft.scale_to_0_1`` of it;
      * every key in CATEGORICAL_FEATURE_KEYS with the vocabulary index of
        the whitespace-stripped string (vocabulary file named after the key);
      * LABEL_KEY with a one-hot float vector of depth 2, reshaped to
        ``[-1, 2]``, where index 0 is '>50K' and index 1 is '<=50K'.

    Args:
        inputs: Dict of raw feature tensors keyed by column name.

    Returns:
        Dict of transformed tensors with the same keys as ``inputs``.
    """
    outputs = inputs.copy()

    for numeric_key in NUMERIC_FEATURE_KEYS:
        outputs[numeric_key] = tft.scale_to_0_1(inputs[numeric_key])

    for categorical_key in CATEGORICAL_FEATURE_KEYS:
        stripped = tf.strings.strip(inputs[categorical_key])
        outputs[categorical_key] = tft.compute_and_apply_vocabulary(
            stripped,
            num_oov_buckets=NUM_OOV_BUCKETS,
            vocab_filename=categorical_key,
        )

    # Static string -> class-id table for the label; labels not in the table
    # map to the default value -1.
    label_classes = ['>50K','<=50K']
    num_classes = len(label_classes)
    with tf.init_scope():
        label_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                keys=label_classes,
                values=tf.cast(tf.range(num_classes),tf.int64),
                key_dtype=tf.string,
                value_dtype=tf.int64,
            ),
            default_value=-1,
        )

    # Drop a trailing '.' first, then strip surrounding whitespace.
    cleaned_label = tf.strings.strip(
        tf.strings.regex_replace(inputs[LABEL_KEY],r'\.$',''))
    class_ids = label_table.lookup(cleaned_label)
    one_hot_label = tf.one_hot(
        indices=class_ids,depth=num_classes,on_value=1.0,off_value=0.0)
    outputs[LABEL_KEY] = tf.reshape(one_hot_label, [-1, num_classes])

    return outputs
    
def transform_data(train_data_file,test_data_file,working_dir):
    """Run the tf.Transform Beam pipeline over the train and test CSV files.

    The training CSV is analyzed and transformed with ``preprocessing_fn``;
    the resulting transform function is then applied to the test CSV. Both
    transformed datasets are written as TFRecord files under ``working_dir``
    (prefixed with TRANSFORMED_TRAIN_DATA_FILEBASE and
    TRANSFORMED_TEST_DATA_FILEBASE), and the transform function itself is
    written to ``working_dir`` as well.

    Args:
        train_data_file: File pattern of the training CSV.
        test_data_file: File pattern of the test CSV (read with
            ``skip_header_lines=1``).
        working_dir: Directory receiving the transformed data and the
            transform function.
    """
    with beam.Pipeline() as pipeline:
        # NOTE(review): the directory created by tempfile.mkdtemp() is not
        # removed by this function.
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):

            # Read the training data.
            train_csv_tfxio = tfxio.CsvTFXIO(
                file_pattern=train_data_file,
                telemetry_descriptors=[],
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA
            )
            raw_data = (
                pipeline |
                'ReadTrainCsv' >> train_csv_tfxio.BeamSource()
            )

            # Attach the metadata (tensor adapter config) to the raw data.
            cfg = train_csv_tfxio.TensorAdapterConfig()
            raw_dataset = (raw_data,cfg)

            # Analyze the training data and transform it.
            transformed_dataset, transform_fn = (
                raw_dataset | tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn,output_record_batches=True)
            )

            # Write the transformed training data out as examples.
            transformed_data, _ = transformed_dataset
            coder = RecordBatchToExamplesEncoder()
            _ = (
                transformed_data
                | 'EncodeTrainData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))  # encode the transformed batches as examples
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(           # write the examples
                    os.path.join(working_dir,TRANSFORMED_TRAIN_DATA_FILEBASE))
            )

            # Same steps for the test data.
            test_csv_tfxio = tfxio.CsvTFXIO(
                file_pattern=test_data_file,
                skip_header_lines=1,
                telemetry_descriptors=[],
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA
            )
            raw_test_data = (
                pipeline | 'ReadTestCsv' >> test_csv_tfxio.BeamSource()
            )
            raw_test_dataset = (raw_test_data,test_csv_tfxio.TensorAdapterConfig())

            # Apply the transform obtained above (no re-analysis on test data).
            transformed_test_dataset = (
                (raw_test_dataset, transform_fn)
                | tft_beam.TransformDataset(output_record_batches=True)
            )
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _ :coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir,TRANSFORMED_TEST_DATA_FILEBASE))
            )

            # Persist the transform function next to the transformed data.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)
            )

查看转换后数据

# Run the preprocessing pipeline, writing everything into a 'keras'
# subdirectory of a fresh temporary directory.
output_dir = os.path.join(tempfile.mkdtemp(), 'keras')
transform_data(train_path, test_path, output_dir)
WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.




WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/tf_utils.py:324: Tensor.experimental_ref (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use ref() instead.


WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/tf_utils.py:324: Tensor.experimental_ref (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use ref() instead.


INFO:tensorflow:Assets written to: /tmp/tmpg4n8yfpg/tftransform_tmp/631aea06548b4fb38aaa5e855fd97e70/assets


INFO:tensorflow:Assets written to: /tmp/tmpg4n8yfpg/tftransform_tmp/631aea06548b4fb38aaa5e855fd97e70/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: /tmp/tmpg4n8yfpg/tftransform_tmp/2c585badca494ed8939abc241f42206a/assets


INFO:tensorflow:Assets written to: /tmp/tmpg4n8yfpg/tftransform_tmp/2c585badca494ed8939abc241f42206a/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.
WARNING:apache_beam.io.tfrecordio:Couldn't find python-snappy so the implementation of _TFRecordUtil._masked_crc32c is not as fast as it could be.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.
!ls {output_dir}
test_transformed-00000-of-00001   transform_fn
train_transformed-00000-of-00001  transformed_metadata

a

# Note: in the transformed feature spec the label now has shape [2].
tf_transform_output = tft.TFTransformOutput(output_dir)
tf_transform_output.transformed_feature_spec()
{'age': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-gain': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-loss': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'education': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'education-num': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'hours-per-week': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'label': FixedLenFeature(shape=[2], dtype=tf.float32, default_value=None),
 'marital-status': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'native-country': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'occupation': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'race': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'relationship': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'sex': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'workclass': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None)}
def _make_training_input_fn(tf_transform_output, train_file_pattern,
                            batch_size):
  """Create a zero-argument input function over transformed TFRecord files.

  Args:
    tf_transform_output: Wrapper around output of tf.Transform; supplies the
      transformed feature spec used to parse the records.
    train_file_pattern: File pattern of the transformed examples to read.
    batch_size: Batch size.

  Returns:
    A callable that, when invoked, returns a shuffled batched dataset built
    with `label_key=LABEL_KEY`, i.e. the label is separated from the
    dictionary of input features.
  """
  def input_fn():
    dataset_options = dict(
        file_pattern=train_file_pattern,
        batch_size=batch_size,
        features=tf_transform_output.transformed_feature_spec(),
        reader=tf.data.TFRecordDataset,
        label_key=LABEL_KEY,
        shuffle=True,
    )
    return tf.data.experimental.make_batched_features_dataset(
        **dataset_options)

  return input_fn
# Glob pattern matching the transformed training TFRecord shards.
train_file_pattern = pathlib.Path(output_dir)/f'{TRANSFORMED_TRAIN_DATA_FILEBASE}*'

input_fn = _make_training_input_fn(
    tf_transform_output=tf_transform_output,
    train_file_pattern = str(train_file_pattern),
    batch_size = 10
)
# Only the first batch is needed: break right after binding example/label.
for example, label in input_fn().take(5):
  break

# Show the first batch of transformed features as a table.
pd.DataFrame(example)
age capital-gain capital-loss education education-num hours-per-week marital-status native-country occupation race relationship sex workclass
0 0.150685 0.000000 0.000000 2 0.800000 0.500000 0 0 4 0 0 0 0
1 0.000000 0.010550 0.000000 10 0.266667 0.234694 1 0 5 0 2 0 0
2 0.397260 0.000000 0.000000 0 0.533333 0.377551 2 0 6 0 1 1 0
3 0.068493 0.000000 0.000000 1 0.600000 0.397959 1 0 8 0 1 0 0
4 0.287671 0.000000 0.453857 9 0.933333 0.500000 0 0 0 0 0 0 6
5 0.027397 0.000000 0.000000 0 0.533333 0.397959 1 0 6 1 2 1 0
6 0.191781 0.000000 0.000000 2 0.800000 0.397959 0 0 0 0 4 1 0
7 0.657534 0.200512 0.000000 3 0.866667 0.397959 0 0 4 0 0 0 0
8 0.534247 0.000000 0.000000 2 0.800000 0.397959 4 0 5 0 1 1 0
9 0.315068 0.000000 0.000000 2 0.800000 0.367347 1 0 5 0 1 0 0
label

模型建立、训练、评估

def build_keras_inputs(working_dir):
    """Create one Keras Input per transformed feature, excluding the label.

    Args:
        working_dir: Directory holding the tf.Transform output.

    Returns:
        Dict mapping feature name to its ``tf.keras.layers.Input``.

    Raises:
        ValueError: If a feature spec is neither VarLenFeature nor
            FixedLenFeature.
    """
    spec_by_name = tft.TFTransformOutput(working_dir).transformed_feature_spec().copy()
    del spec_by_name[LABEL_KEY]

    def _input_for(key, spec):
        # Variable-length features become sparse inputs of unknown length.
        if isinstance(spec,tf.io.VarLenFeature):
            return tf.keras.layers.Input(
                shape=[None],name=key,dtype=spec.dtype,sparse=True)
        if isinstance(spec,tf.io.FixedLenFeature):
            return tf.keras.layers.Input(
                shape=spec.shape,name=key,dtype=spec.dtype)
        raise ValueError('Spec type is not supported:',key,spec)

    return {key: _input_for(key, spec) for key, spec in spec_by_name.items()}

def encode_inputs(inputs, transform_output=None):
    """Expand every input by one trailing axis and encode categorical ones.

    Features listed in CATEGORICAL_FEATURE_KEYS are passed through a
    ``CategoryEncoding`` layer sized by the number of buckets recorded for
    that feature in the tf.Transform output; all other features are only
    expanded.

    Args:
        inputs: Dict mapping feature name to its Keras input tensor.
        transform_output: Optional ``tft.TFTransformOutput`` used to look up
            the bucket counts. Defaults to the module-level
            ``tf_transform_output`` so existing callers keep working.

    Returns:
        Dict mapping feature name to the encoded tensor.
    """
    if transform_output is None:
        transform_output = tf_transform_output

    encoded_inputs = {}
    for key in inputs:
        feature = tf.expand_dims(inputs[key],-1)
        if key in CATEGORICAL_FEATURE_KEYS:
            num_buckets = transform_output.num_buckets_for_transformed_feature(key)
            # 'multi_hot' is the current name of the mode formerly spelled
            # 'binary'; the old spelling is only a deprecated alias.
            encoding_layer = tf.keras.layers.CategoryEncoding(
                num_tokens=num_buckets,output_mode='multi_hot',sparse=False)
            encoded_inputs[key] = encoding_layer(feature)
        else:
            encoded_inputs[key] = feature
    return encoded_inputs

def build_keras_model(working_dir):
    """Build the classifier over the transformed features.

    The encoded inputs are concatenated along axis 1 and fed through two
    ReLU dense layers (100 and 50 units) followed by a 2-unit dense layer
    with no activation.

    Args:
        working_dir: Directory holding the tf.Transform output.

    Returns:
        The (uncompiled) ``tf.keras.Model``.
    """
    model_inputs = build_keras_inputs(working_dir)
    flat_features = tf.nest.flatten(encode_inputs(model_inputs))

    hidden = tf.concat(flat_features,axis=1)
    for units in (100, 50):
        hidden = tf.keras.layers.Dense(units,activation='relu')(hidden)
    logits = tf.keras.layers.Dense(2)(hidden)

    return tf.keras.Model(inputs=model_inputs,outputs=logits)
# Build the model from the transform output and draw its layer graph
# left-to-right with tensor shapes shown.
model = build_keras_model(output_dir)

tf.keras.utils.plot_model(model,rankdir='LR', show_shapes=True)

图1:模型结构图(tf.keras.utils.plot_model 的输出)

def get_dataset(working_dir, filebase):
  """Load the transformed dataset whose files start with `filebase`.

  Args:
    working_dir: Directory holding the tf.Transform output and the data files.
    filebase: Base filename of the transformed examples; every file matching
      `filebase + '*'` inside `working_dir` is read.

  Returns:
    The dataset produced by the input function, batched with BATCH_SIZE.
  """
  make_dataset = _make_training_input_fn(
      tft.TFTransformOutput(working_dir),
      os.path.join(working_dir, filebase + '*'),
      batch_size=BATCH_SIZE)
  return make_dataset()
def train_model(model, train_dataset, validation_dataset):
  """Compile `model` and fit it on the training dataset.

  The model is compiled with the Adam optimizer, categorical cross-entropy
  on logits and the accuracy metric, then trained for TRAIN_NUM_EPOCHS epochs
  of STEPS_PER_TRAIN_EPOCH steps, validating for EVALUATION_STEPS steps.

  Args:
    model: The Keras model to train (compiled in place).
    train_dataset: Dataset of training batches.
    validation_dataset: Dataset of validation batches.

  Returns:
    The history object returned by `model.fit`.
  """
  loss_fn = tf.losses.CategoricalCrossentropy(from_logits=True)
  model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

  return model.fit(
      train_dataset,
      validation_data=validation_dataset,
      epochs=TRAIN_NUM_EPOCHS,
      steps_per_epoch=STEPS_PER_TRAIN_EPOCH,
      validation_steps=EVALUATION_STEPS)

def train_and_evaluate(
    model,
    working_dir):
  """Train the model on training data and evaluate on test data.

  Args:
    model: The Keras model to train. It is used as given instead of being
      silently replaced; pass None to have a fresh model built from
      `working_dir`.
    working_dir: The location of the Transform output.

  Returns:
    A `(model, history, metric_values)` tuple: the trained model, the history
    returned by `model.fit`, and the dict of metrics from `model.evaluate`
    on the test data.
  """
  train_dataset = get_dataset(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)
  validation_dataset = get_dataset(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)

  # Only build a model when the caller did not supply one; previously the
  # `model` argument was always discarded and rebuilt.
  if model is None:
    model = build_keras_model(working_dir)

  history = train_model(model, train_dataset, validation_dataset)

  metric_values = model.evaluate(validation_dataset,
                                 steps=EVALUATION_STEPS,
                                 return_dict=True)
  return model, history, metric_values
# Train on the transformed data and keep the model, fit history and metrics.
model, history, metric_values = train_and_evaluate(model, output_dir)
Epoch 1/20
26/26 [==============================] - 2s 30ms/step - loss: 0.5085 - accuracy: 0.7623 - val_loss: 0.4191 - val_accuracy: 0.7864
Epoch 2/20
26/26 [==============================] - 0s 18ms/step - loss: 0.4029 - accuracy: 0.8092 - val_loss: 0.3704 - val_accuracy: 0.8270
Epoch 3/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3721 - accuracy: 0.8302 - val_loss: 0.3596 - val_accuracy: 0.8329
Epoch 4/20
26/26 [==============================] - 1s 21ms/step - loss: 0.3517 - accuracy: 0.8332 - val_loss: 0.3527 - val_accuracy: 0.8349
Epoch 5/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3494 - accuracy: 0.8389 - val_loss: 0.3467 - val_accuracy: 0.8388
Epoch 6/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3401 - accuracy: 0.8425 - val_loss: 0.3477 - val_accuracy: 0.8396
Epoch 7/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3461 - accuracy: 0.8419 - val_loss: 0.3444 - val_accuracy: 0.8395
Epoch 8/20
26/26 [==============================] - 0s 20ms/step - loss: 0.3570 - accuracy: 0.8305 - val_loss: 0.3427 - val_accuracy: 0.8417
Epoch 9/20
26/26 [==============================] - 1s 24ms/step - loss: 0.3387 - accuracy: 0.8447 - val_loss: 0.3406 - val_accuracy: 0.8420
Epoch 10/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3419 - accuracy: 0.8431 - val_loss: 0.3386 - val_accuracy: 0.8433
Epoch 11/20
26/26 [==============================] - 1s 23ms/step - loss: 0.3423 - accuracy: 0.8401 - val_loss: 0.3391 - val_accuracy: 0.8422
Epoch 12/20
26/26 [==============================] - 1s 25ms/step - loss: 0.3364 - accuracy: 0.8459 - val_loss: 0.3360 - val_accuracy: 0.8442
Epoch 13/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3325 - accuracy: 0.8543 - val_loss: 0.3350 - val_accuracy: 0.8436
Epoch 14/20
26/26 [==============================] - 1s 23ms/step - loss: 0.3287 - accuracy: 0.8474 - val_loss: 0.3334 - val_accuracy: 0.8441
Epoch 15/20
26/26 [==============================] - 0s 20ms/step - loss: 0.3395 - accuracy: 0.8404 - val_loss: 0.3348 - val_accuracy: 0.8434
Epoch 16/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3307 - accuracy: 0.8537 - val_loss: 0.3352 - val_accuracy: 0.8435
Epoch 17/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3197 - accuracy: 0.8483 - val_loss: 0.3336 - val_accuracy: 0.8443
Epoch 18/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3479 - accuracy: 0.8332 - val_loss: 0.3303 - val_accuracy: 0.8455
Epoch 19/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3300 - accuracy: 0.8459 - val_loss: 0.3343 - val_accuracy: 0.8411
Epoch 20/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3393 - accuracy: 0.8374 - val_loss: 0.3311 - val_accuracy: 0.8438
128/128 [==============================] - 1s 2ms/step - loss: 0.3304 - accuracy: 0.8441
# Plot training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Eval')
# Anchor the y-axis at 0 while keeping the automatically chosen upper limit.
plt.ylim(0,max(plt.ylim()))
plt.legend()
plt.title('Loss');

图2:训练集与验证集的损失曲线(Loss)

测试预处理函数

def read_csv(file_name, batch_size):
  """Read a raw census CSV file as a batched dataset of feature dicts.

  Args:
    file_name: File pattern of the CSV file(s) to read.
    batch_size: Number of rows per batch.

  Returns:
    The dataset created by `tf.data.experimental.make_csv_dataset`, using
    ORDERED_CSV_COLUMNS as column names and COLUMN_DEFAULTS as defaults,
    with `ignore_errors=True` and no prefetching.
  """
  csv_options = {
      'file_pattern': file_name,
      'batch_size': batch_size,
      'column_names': ORDERED_CSV_COLUMNS,
      'column_defaults': COLUMN_DEFAULTS,
      'prefetch_buffer_size': 0,
      'ignore_errors': True,
  }
  return tf.data.experimental.make_csv_dataset(**csv_options)
# Grab a single batch of 5 raw rows from the test CSV.
for ex in read_csv(test_path, batch_size=5):
  break

# Work on a copy and drop 'fnlwgt', which is not part of the raw feature spec.
ex2 = ex.copy()
ex2.pop('fnlwgt')

# Apply the saved transform to the raw batch through a Keras layer.
tft_layer = tf_transform_output.transform_features_layer()
t_ex = tft_layer(ex2)

# Separate the label so that only the features are tabulated.
label = t_ex.pop(LABEL_KEY)
pd.DataFrame(t_ex)
WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/data/experimental/ops/readers.py:572: ignore_errors (from tensorflow.python.data.experimental.ops.error_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/data/experimental/ops/readers.py:572: ignore_errors (from tensorflow.python.data.experimental.ops.error_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.
sex marital-status occupation native-country relationship education workclass capital-gain education-num hours-per-week capital-loss race age
0 1 0 3 0 4 2 0 0.0 0.800000 0.397959 0.0 0 0.109589
1 1 1 5 0 3 4 0 0.0 0.666667 0.091837 0.0 0 0.219178
2 0 1 7 1 1 13 3 0.0 0.133333 0.244898 0.0 0 0.041096
3 1 1 0 0 1 3 0 0.0 0.866667 0.397959 0.0 0 0.520548
4 0 1 7 0 1 2 3 0.0 0.800000 0.346939 0.0 0 0.082192
pdex2=pd.DataFrame(ex2)
pdex2
age workclass education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country label
0 25.0 b' Private' b' Bachelors' 13.0 b' Married-civ-spouse' b' Adm-clerical' b' Wife' b' White' b' Female' 0.0 0.0 40.0 b' United-States' b' <=50K.'
1 33.0 b' Private' b' Assoc-voc' 11.0 b' Never-married' b' Other-service' b' Unmarried' b' White' b' Female' 0.0 0.0 10.0 b' United-States' b' <=50K.'
2 20.0 b' ?' b' 5th-6th' 3.0 b' Never-married' b' ?' b' Not-in-family' b' White' b' Male' 0.0 0.0 25.0 b' Mexico' b' <=50K.'
3 55.0 b' Private' b' Masters' 14.0 b' Never-married' b' Prof-specialty' b' Not-in-family' b' White' b' Female' 0.0 0.0 40.0 b' United-States' b' <=50K.'
4 23.0 b' ?' b' Bachelors' 13.0 b' Never-married' b' ?' b' Not-in-family' b' White' b' Male' 0.0 0.0 35.0 b' United-States' b' <=50K.'
pdex2['age']
0    25.0
1    33.0
2    20.0
3    55.0
4    23.0
Name: age, dtype: float32

测试tft函数(可直接使用的)

# Bucketize 'age' with boundaries 10..60.
# NOTE(review): this call passes the pandas Series directly, while the next
# call wraps the same column in tf.constant first -- confirm the Series form
# is accepted.
tft.apply_buckets(pdex2['age'],[[10,20,30,40,50,60]])

# Same boundaries, interpolated variant, on an explicit float32 tensor.
tft.apply_buckets_with_interpolation(tf.constant(pdex2['age'],dtype=tf.float32),[[10,20,30,40,50,60]])

# Small 4x4 sparse string tensor reused by the text-function examples below.
sparse = tf.SparseTensor(indices=[[0, 0], [0, 1], [2, 2]],
                         values=['a', 'b', 'c'], dense_shape=(4, 4))
sparse
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [2 2]], shape=(3, 2), dtype=int64), values=tf.Tensor([b'a' b'b' b'c'], shape=(3,), dtype=string), dense_shape=tf.Tensor([4 4], shape=(2,), dtype=int64))
tft.bag_of_words(sparse,(1,2),' ')
WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/mappers.py:1396: calling while_loop_v2 (from tensorflow.python.ops.control_flow_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/mappers.py:1396: calling while_loop_v2 (from tensorflow.python.ops.control_flow_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))





SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [0 2]
 [2 0]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a' b'a b' b'b' b'c'], shape=(4,), dtype=string), dense_shape=tf.Tensor([4 3], shape=(2,), dtype=int64))
tft.ngrams(sparse,(1,2),' ')
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [0 2]
 [2 0]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a' b'a b' b'b' b'c'], shape=(4,), dtype=string), dense_shape=tf.Tensor([4 3], shape=(2,), dtype=int64))
tft.deduplicate_tensor_per_row(tf.constant([[1,1,2],[2,3,2]])) #tf.sparse.to_dense(tft.deduplicate_tensor_per_row(tf.constant([[1,1,2],[2,3,2]])))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([1 2 2 3], shape=(4,), dtype=int32), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64))
# Hash the 'sex' strings into 3 buckets.
tft.hash_strings(tf.constant(pdex2['sex'],dtype=tf.string),3)

# Count the tokens of the sparse string tensor.
tft.word_count(sparse)

你可能感兴趣的:(tfx,tensorflow,python,人工智能)