Dataset API: reading CSV files in an input pipeline

python - Reading CSV files in Tensorflow 1.2.0 - Stack Overflow

TensorFlow 1.2 introduces the new Dataset API, which is the recommended way to build input pipelines for feeding data into a model (see for example this answer).


import tensorflow as tf


def read_row(csv_row):
    # One default value per CSV column; the type of each default determines
    # the dtype of the parsed column (int, float, or string).
    record_defaults = [[0], [0.0], [0.0], [0.0], [""], [0], [0.0], [0.0], [0], [0]]
    row = tf.decode_csv(csv_row, record_defaults=record_defaults)
    # All columns except the last are features; the last column is the label.
    return row[:-1], row[-1]


def input_pipeline(filenames, batch_size):
    # Define a `tf.contrib.data.Dataset` for iterating over one epoch of the data.
    dataset = (tf.contrib.data.TextLineDataset(filenames)
               .skip(1)
               .map(lambda line: read_row(line))
               .shuffle(buffer_size=10)  # Equivalent to min_after_dequeue=10.
               .batch(batch_size))

    # Return an *initializable* iterator over the dataset, which will allow us to
    # re-initialize it at the beginning of each epoch.
    return dataset.make_initializable_iterator()

batch_size = 3
iterator = input_pipeline(['heart.csv'], batch_size)
features, labels = iterator.get_next()

nof_examples = 10
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    sess.run(iterator.initializer)
    while nof_examples > 0:
        nof_examples -= 1
        try:
            data_features, data_labels = sess.run([features, labels])
            print(data_features)
        except tf.errors.OutOfRangeError:
            # The iterator has been exhausted; stop reading.
            break
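
Note: from TensorFlow 1.4 onwards, tf.contrib.data was promoted into core as tf.data with the same Dataset/Iterator interface, so only the dataset construction changes. A minimal sketch, reusing read_row() above and assuming the same heart.csv layout:

# Same pipeline against the core tf.data API (TensorFlow >= 1.4).
dataset = (tf.data.TextLineDataset(['heart.csv'])
           .skip(1)                  # Skip the CSV header row.
           .map(read_row)            # Reuse read_row() defined above.
           .shuffle(buffer_size=10)
           .batch(batch_size))
iterator = dataset.make_initializable_iterator()
features, labels = iterator.get_next()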

neural network - How to use TensorFlow tf.train.string_input_producer to produce several epochs data? - Stack Overflow

As Nicolas observes, the tf.train.string_input_producer() API does not give you the ability to detect when the end of an epoch is reached; instead it concatenates together all epochs into one long batch. For this reason, we recently added (in TensorFlow 1.2) the tf.contrib.data API, which makes it possible to express more sophisticated pipelines, including your use case.

The following code snippet shows how you would write your program using tf.contrib.data:

def input_pipeline(filenames, batch_size):
    # Define a `tf.contrib.data.Dataset` for iterating over one epoch of the data.
    dataset = (tf.contrib.data.TextLineDataset(filenames)
               .map(lambda line: tf.decode_csv(
                    line, record_defaults=[['1'], ['1'], ['1']], field_delim='-'))
               .shuffle(buffer_size=10)  # Equivalent to min_after_dequeue=10.
               .batch(batch_size))

    # Return an *initializable* iterator over the dataset, which will allow us to
    # re-initialize it at the beginning of each epoch.
    return dataset.make_initializable_iterator() 

filenames = ['1.txt']
batch_size = 3
num_epochs = 10
iterator = input_pipeline(filenames, batch_size)

# `a1`, `a2`, and `a3` represent the next element to be retrieved from the iterator.    
a1, a2, a3 = iterator.get_next()

with tf.Session() as sess:
    for _ in range(num_epochs):
        # Resets the iterator at the beginning of an epoch.
        sess.run(iterator.initializer)

        try:
            while True:
                a, b, c = sess.run([a1, a2, a3])
                print(a, b, c)
        except tf.errors.OutOfRangeError:
            # This will be raised when you reach the end of an epoch (i.e. the
            # iterator has no more elements).
            pass                 

        # Perform any end-of-epoch computation here.
        print('Done training, epoch reached')
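
If you do not need to run per-epoch computation, a simpler alternative is Dataset.repeat(), which cycles over the data a fixed number of times so the iterator only needs to be initialized once. A minimal sketch, using the same toy record_defaults as above:

# Alternative: let the dataset itself iterate over `num_epochs` epochs.
dataset = (tf.contrib.data.TextLineDataset(filenames)
           .map(lambda line: tf.decode_csv(
                line, record_defaults=[['1'], ['1'], ['1']], field_delim='-'))
           .repeat(num_epochs)       # Concatenate `num_epochs` passes over the data.
           .shuffle(buffer_size=10)
           .batch(batch_size))

With this, tf.errors.OutOfRangeError is raised only once, at the very end of training, which mirrors the behaviour of tf.train.string_input_producer(num_epochs=...) described above.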
