

  • Download example data
  • ExampleGen
  • StatisticsGen
  • SchemaGen
  • ExampleValidator
  • Transform


!pip uninstall shapely -y
WARNING: Skipping shapely as it is not installed
import os
import pprint
import tempfile
import shutil 
import urllib

import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip
2023-06-27 11:00:21.593290: I tensorflow/core/platform/] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 11:00:22.444429: W tensorflow/compiler/tf2tensorrt/utils/] TF-TRT Warning: Could not find TensorRT
WARNING:absl:Failed to import tensorflow serving protos. It can fail if the TF version doesn't match with the TF Serving version. We will try importing again with a workaround:module 'tensorflow.core.protobuf.error_codes_pb2' has no attribute '_CODE'
# Report the library versions this notebook was executed against.
print(f'TensorFlow version: {tf.__version__}')
print(f'TFX version: {tfx.__version__}')
TensorFlow version: 2.12.0
TFX version: 1.13.0
# This is the root directory for your TFX pip package installation.
_tfx_root = tfx.__path__[0]

# This is the directory containing the TFX Chicago Taxi Pipeline example.
_taxi_root = os.path.join(_tfx_root, 'examples/chicago_taxi_pipeline')

# This is the path where your model will be pushed for serving.
_serving_model_dir = os.path.join(
    '.', 'serving_model/taxi_simple')

PIPELINE_NAME = "chicago_pipeline"
# Output directory to store artifacts generated from the pipeline.
PIPELINE_ROOT = os.path.join('./pipelines', PIPELINE_NAME)
# Path to a SQLite DB file to use as an MLMD storage.
METADATA_PATH = os.path.join('./metadata', PIPELINE_NAME, 'metadata.db')

# Set up logging.
# INFO verbosity matches the `INFO:absl:` lines in the recorded cell outputs.
absl.logging.set_verbosity(absl.logging.INFO)

Download example data

# Directory holding the input CSV consumed by CsvExampleGen.
_data_root = './tfx-data'
_data_filepath = os.path.join(_data_root, "data.csv")
#urllib.request.urlretrieve(DATA_PATH, _data_filepath)

# Root directory for every artifact (and the SQLite MLMD store) produced by
# the interactive context.
pipeline_output_root = './pipeline_output_root'
# NOTE(review): the body of this `if` was lost in the export. Clearing stale
# artifacts is assumed because the recorded run starts at execution_id 1,
# i.e. with a fresh metadata store -- confirm before relying on the delete.
if os.path.isdir(pipeline_output_root) and os.listdir(pipeline_output_root):
  shutil.rmtree(pipeline_output_root)
# Guard against a missing directory (os.listdir would raise on it).
os.makedirs(pipeline_output_root, exist_ok=True)
context = InteractiveContext(pipeline_root=pipeline_output_root)
WARNING:absl:InteractiveContext metadata_connection_config not provided: using SQLite ML Metadata database at ./pipeline_output_root/metadata.sqlite.


# Ingest every CSV under `_data_root` and split it into train/eval examples.
example_gen = tfx.components.CsvExampleGen(input_base=_data_root)
# Execute the component in the interactive context; the log that follows
# ("Running driver for CsvExampleGen" ...) is the output of this call.
context.run(example_gen)
INFO:absl:Running driver for CsvExampleGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:select span and version = (0, None)
INFO:absl:latest span and version = (0, None)
INFO:absl:Running executor for CsvExampleGen
INFO:absl:Generating examples.
WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.

INFO:absl:Processing input csv data ./tfx-data/* to TFExample.
WARNING:apache_beam.io.tfrecordio:Couldn't find python-snappy so the implementation of _TFRecordUtil._masked_crc32c is not as fast as it could be.
INFO:absl:Examples generated.
INFO:absl:Running publisher for CsvExampleGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f030868e310
.execution_id 1

We can also take a look at the first three training examples:

# Get the URI of the output artifact representing the training examples, which is a directory
train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  # Deserialize the raw bytes into the proto, then pretty-print it; this is
  # what produces the `features { ... }` dumps shown in the cell output.
  example.ParseFromString(serialized_example)
  pp.pprint(example)
features {
  feature {
    key: "company"
    value {
      bytes_list {
        value: "Chicago Elite Cab Corp. (Chicago Carriag"
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
  feature {
    key: "fare"
    value {
      float_list {
        value: 12.449999809265137
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Credit Card"
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
  feature {
    key: "pickup_latitude"
    value {
      float_list {
  feature {
    key: "pickup_longitude"
    value {
      float_list {
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 0.0
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 0
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 6
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 19
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 5
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1400269500

features {
  feature {
    key: "company"
    value {
      bytes_list {
        value: "Taxi Affiliation Services"
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
  feature {
    key: "fare"
    value {
      float_list {
        value: 27.049999237060547
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Cash"
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
        value: 60
  feature {
    key: "pickup_latitude"
    value {
      float_list {
        value: 41.836151123046875
  feature {
    key: "pickup_longitude"
    value {
      float_list {
        value: -87.64878845214844
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 12.600000381469727
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 1380
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 3
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 2
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 10
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1380593700

features {
  feature {
    key: "company"
    value {
      bytes_list {
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
  feature {
    key: "fare"
    value {
      float_list {
        value: 16.450000762939453
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Cash"
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
        value: 13
  feature {
    key: "pickup_latitude"
    value {
      float_list {
        value: 41.98363494873047
  feature {
    key: "pickup_longitude"
    value {
      float_list {
        value: -87.72357940673828
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 6.900000095367432
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 780
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 3
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 12
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 11
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1446554700

2023-06-27 11:00:33.035908: I tensorflow/compiler/xla/stream_executor/cuda/] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-27 11:00:33.068969: W tensorflow/core/common_runtime/gpu/] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-27 11:00:33.108659: I tensorflow/core/common_runtime/] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


# Compute dataset statistics over the examples emitted by ExampleGen.
statistics_gen = tfx.components.StatisticsGen(
    examples=example_gen.outputs['examples'])
# Run the component; the log below reports statistics for the train and eval
# splits.
context.run(statistics_gen)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for StatisticsGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for StatisticsGen
INFO:absl:Generating statistics for split train.
INFO:absl:Statistics for split train written to ./pipeline_output_root/StatisticsGen/statistics/2/Split-train.
INFO:absl:Generating statistics for split eval.
INFO:absl:Statistics for split eval written to ./pipeline_output_root/StatisticsGen/statistics/2/Split-eval.
INFO:absl:Running publisher for StatisticsGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f0224424fa0
.execution_id 2

Artifact at ./pipeline_output_root/StatisticsGen/statistics/2



# Infer a schema (types, presence, domains) from the computed statistics.
# NOTE(review): `infer_feature_shape=False` is assumed from the upstream TFX
# tutorial and from the transform code, which handles SparseTensor inputs --
# confirm against the original notebook.
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
context.run(schema_gen)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for SchemaGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for SchemaGen
INFO:absl:Processing schema from statistics for split train.
INFO:absl:Processing schema from statistics for split eval.
INFO:absl:Schema written to ./pipeline_output_root/SchemaGen/schema/3/schema.pbtxt.
INFO:absl:Running publisher for SchemaGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f025352d0d0
.execution_id 3

Artifact at ./pipeline_output_root/SchemaGen/schema/3

Type Presence Valency Domain
Feature name
'company' STRING required 'company'
'dropoff_census_tract' INT required -
'dropoff_community_area' INT required -
'dropoff_latitude' FLOAT required -
'dropoff_longitude' FLOAT required -
'fare' FLOAT required single -
'payment_type' STRING required single 'payment_type'
'pickup_census_tract' INT required -
'pickup_community_area' INT required -
'pickup_latitude' FLOAT required -
'pickup_longitude' FLOAT required -
'tips' FLOAT required single -
'trip_miles' FLOAT required single -
'trip_seconds' INT required -
'trip_start_day' INT required single -
'trip_start_hour' INT required single -
'trip_start_month' INT required single -
'trip_start_timestamp' INT required single -
'company' '0118 - 42111 Godfrey S.Awir', '1085 - 72312 N and W Cab Co', '2192 - 73487 Zeymane Corp', '2733 - 74600 Benny Jona', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3319 - CD Cab Co', '3385 - 23210 Eman Cab', '3385 - Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - 57856 Ilie Malec', '4053 - 40193 Adwar H. Nikola', '4197 - 41842 Royal Star', '4197 - Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '585 - 88805 Valley Cab Co', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5874 - Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '6488 - 83287 Zuha Taxi', '6574 - Babylon Express Inc.', '6742 - 83735 Tasha ride inc', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation', '0694 - 59280 Chinesco Trans Inc', '2092 - 61288 Sbeih company', '2192 - Zeymane Corp', '2809 - 95474 C & D Cab Co Inc.', '2823 - 73307 Seung Lee', '3094 - 24059 G.L.B. Cab Co', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '5006 - Salifu Bawa', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5997 - AW Services Inc.', '6057 - 24657 Richard Addo', '6743 - Luhak Corp'
'payment_type' 'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown', 'Prcard'


# Validate the computed statistics against the inferred schema; anomalies are
# written per split (see the log below).
example_validator = tfx.components.ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],
    schema=schema_gen.outputs['schema'])
context.run(example_validator)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for ExampleValidator
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for ExampleValidator
INFO:absl:Validating schema against the computed statistics for split train.
INFO:absl:Validation complete for split train. Anomalies written to ./pipeline_output_root/ExampleValidator/anomalies/4/Split-train.
INFO:absl:Validating schema against the computed statistics for split eval.
INFO:absl:Validation complete for split eval. Anomalies written to ./pipeline_output_root/ExampleValidator/anomalies/4/Split-eval.
INFO:absl:Running publisher for ExampleValidator
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f02534bf2b0
.execution_id 4

Artifact at ./pipeline_output_root/ExampleValidator/anomalies/4

'train' split:

No anomalies found.

'eval' split:

No anomalies found.


# File the next cell writes via `%%writefile`; the name must match the
# `import taxi_constants` statement used by the transform module.
_taxi_constants_module_file = 'taxi_constants.py'
%%writefile {_taxi_constants_module_file}

# Features that are z-score normalized.
NUMERICAL_FEATURES = ['trip_miles', 'fare', 'trip_seconds']

# Continuous features that are bucketized.
BUCKET_FEATURES = [
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
    'dropoff_longitude'
]
# Number of buckets used by tf.transform for encoding each feature.
# NOTE(review): value taken from the upstream TFX taxi example -- confirm.
FEATURE_BUCKET_COUNT = 10

# Integer-valued features that are treated as categories.
CATEGORICAL_NUMERICAL_FEATURES = [
    'trip_start_hour', 'trip_start_day', 'trip_start_month',
    'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
    'dropoff_community_area'
]

# String-valued categorical features (the STRING columns of the schema).
CATEGORICAL_STRING_FEATURES = [
    'payment_type',
    'company',
]

# Number of vocabulary terms used for encoding categorical features.
# NOTE(review): value taken from the upstream TFX taxi example -- confirm.
VOCAB_SIZE = 1000

# Count of out-of-vocab buckets in which unrecognized categorical are hashed.
# NOTE(review): value taken from the upstream TFX taxi example -- confirm.
OOV_SIZE = 10

# Keys
LABEL_KEY = 'tips'
FARE_KEY = 'fare'

def t_name(key):
  """Return the transformed-feature name for a raw feature key.

  Rename the feature keys so that they don't clash with the raw keys when
  running the Evaluator component.

  Args:
    key: The original feature key

  Returns:
    key with '_xf' appended
  """
  return key + '_xf'
# File the next cell writes via `%%writefile`; the Transform log reports the
# target user module as 'taxi_transform'.
_taxi_transform_module_file = 'taxi_transform.py'
%%writefile {_taxi_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

# Imported files such as taxi_constants are normally cached, so changes are
# not honored after the first import.  Normally this is good for efficiency, but
# during development when we may be iterating code it can be a problem. To
# avoid this problem during development, reload the file.
import taxi_constants
import sys
if 'google.colab' in sys.modules:  # Testing to see if we're doing development
  import importlib
  importlib.reload(taxi_constants)

# Module-private aliases for the shared constants used by the preprocessing
# code in this module.
_NUMERICAL_FEATURES = taxi_constants.NUMERICAL_FEATURES
_BUCKET_FEATURES = taxi_constants.BUCKET_FEATURES
_FEATURE_BUCKET_COUNT = taxi_constants.FEATURE_BUCKET_COUNT
_CATEGORICAL_NUMERICAL_FEATURES = taxi_constants.CATEGORICAL_NUMERICAL_FEATURES
_CATEGORICAL_STRING_FEATURES = taxi_constants.CATEGORICAL_STRING_FEATURES
_VOCAB_SIZE = taxi_constants.VOCAB_SIZE
_OOV_SIZE = taxi_constants.OOV_SIZE
_FARE_KEY = taxi_constants.FARE_KEY
_LABEL_KEY = taxi_constants.LABEL_KEY

def _make_one_hot(x, key):
  """Make a one-hot tensor to encode categorical features.

  Args:
    x: A dense tensor
    key: A string key for the feature in the input

  Returns:
    A dense one-hot tensor as a float list
  """
  # Map each value to an integer id; the vocabulary is stored under `key` so
  # its size can be looked up by name below.
  integerized = tft.compute_and_apply_vocabulary(x,
          top_k=_VOCAB_SIZE,
          num_oov_buckets=_OOV_SIZE,
          vocab_filename=key, name=key)
  # One slot per vocabulary term plus one per out-of-vocabulary bucket.
  depth = (
      tft.experimental.get_vocabulary_size_by_name(key) + _OOV_SIZE)
  one_hot_encoded = tf.one_hot(
      integerized,
      depth=tf.cast(depth, tf.int32),
      on_value=1.0,
      off_value=0.0)
  return tf.reshape(one_hot_encoded, [-1, depth])

def _fill_in_missing(x):
  """Replace missing values in a SparseTensor.

  Fills in missing values of `x` with '' or 0, and converts to a dense tensor.

  Args:
    x: A `SparseTensor` of rank 2.  Its dense shape should have size at most 1
      in the second dimension.

  Returns:
    A rank 1 tensor where missing values of `x` have been filled in.
  """
  # Anything that is not sparse is passed through unchanged.
  if not isinstance(x, tf.sparse.SparseTensor):
    return x

  default_value = '' if x.dtype == tf.string else 0
  # Rebuild the tensor with a dense shape of [batch, 1], densify it with the
  # default, then drop the second dimension to obtain a rank 1 result.
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _NUMERICAL_FEATURES:
    # If sparse make it dense, setting nan's to 0 or '', and apply zscore.
    outputs[taxi_constants.t_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]), name=key)

  for key in _BUCKET_FEATURES:
    # Bucket ids are cast to float.
    outputs[taxi_constants.t_name(key)] = tf.cast(tft.bucketize(
            _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT, name=key),
            dtype=tf.float32)

  for key in _CATEGORICAL_STRING_FEATURES:
    outputs[taxi_constants.t_name(key)] = _make_one_hot(_fill_in_missing(inputs[key]), key)

  for key in _CATEGORICAL_NUMERICAL_FEATURES:
    # Integer categories are stringified first so they go through the same
    # one-hot helper as the string features.
    outputs[taxi_constants.t_name(key)] = _make_one_hot(tf.strings.strip(
        tf.strings.as_string(_fill_in_missing(inputs[key]))), key)

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_LABEL_KEY] = tf.where(
      # A NaN fare yields label 0.
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs
# Apply `preprocessing_fn` from the transform module to the examples, using
# the inferred schema to parse them.
transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_taxi_transform_module_file))
# Run the component; the log below shows the user module being packaged into
# a wheel before the driver starts.
context.run(transform)
INFO:absl:Generating ephemeral wheel package for '/mnt/c/Users/DELL/jupyter_notebook_code/chicago_taxi_pipeline/' (including modules: ['taxi_constants', 'taxi_trainer', 'taxi_transform']).
INFO:absl:User module package has hash fingerprint version 79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a.
INFO:absl:Executing: ['/home/xzy/anaconda3/envs/tfx/bin/python', '/tmp/tmpnf6nwm4u/', 'bdist_wheel', '--bdist-dir', '/tmp/tmpsb53mzqc', '--dist-dir', '/tmp/tmpoct7sroz']
/home/xzy/anaconda3/envs/tfx/lib/python3.9/site-packages/setuptools/_distutils/ SetuptoolsDeprecationWarning: install is deprecated.

        Please avoid running ```` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See for details.

INFO:absl:Successfully built user code wheel distribution at './pipeline_output_root/_wheels/tfx_user_code_Transform-0.0+79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a-py3-none-any.whl'; target user module is 'taxi_transform'.
INFO:absl:Full user module path is 'taxi_transform@./pipeline_output_root/_wheels/tfx_user_code_Transform-0.0+79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a-py3-none-any.whl'
INFO:absl:Running driver for Transform
INFO:absl:MetadataStore with DB connection initialized

running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying -> build/lib
copying -> build/lib
copying -> build/lib
installing to /tmp/tmpsb53mzqc
running install
running install_lib
copying build/lib/ -> /tmp/tmpsb53mzqc
copying build/lib/ -> /tmp/tmpsb53mzqc
copying build/lib/ -> /tmp/tmpsb53mzqc
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
ExecutionResult at 0x7f0234132430
.execution_id 5

We can also take a look at the first three transformed examples:

# Get the URI of the output artifact representing the transformed examples, which is a directory
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
# The output is very long.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  # Deserialize the raw bytes into the proto, then pretty-print it; this is
  # what produces the `features { ... }` dump shown in the cell output.
  example.ParseFromString(serialized_example)
  pp.pprint(example)
features {
  feature {
    key: "company_xf"
    value {
      float_list {
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0

2023-06-27 11:01:39.933543: I tensorflow/core/common_runtime/] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
