“TensorFlow - Importing data”

Basic

Data can be fed into TensorFlow using an iterator.

import tensorflow as tf

# Build a dataset over range(10) and read it element by element.
dataset = tf.data.Dataset.range(10)

# The one-shot iterator is used below without running any initializer op.
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(10):
      # Each run of next_element yields the next value of the dataset.
      value = sess.run(next_element)
      print(f"{value} ", end=" ")    # 0 1 2 ... 9

The datatype and the shape of the dataset can be retrieved by:

# Inspect the element dtype and the static element shape of the dataset.
print(dataset.output_types)   # <dtype: 'int64'>
print(dataset.output_shapes)  # () - scalar

Out of range

An iterator can run out of values. Handling iterator’s out of range:

import tensorflow as tf

# A dataset with only three elements: 0, 1, 2.
dataset = tf.data.Dataset.range(3)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# Both operands are the same get_next() tensor, so each run consumes one element.
result = tf.add(next_element, next_element)

with tf.Session() as sess:
    print(sess.run(result))  # "0"
    print(sess.run(result))  # "2"
    print(sess.run(result))  # "4"
    # The fourth run has no element left; the iterator signals this by raising.
    try:
      sess.run(result)
    except tf.errors.OutOfRangeError:
      print("End of dataset")  # "End of dataset"

If we want the iterator to keep repeating the data, we can call repeat so that the dataset starts over when it reaches the end.

dataset = tf.data.Dataset.range(3)
# repeat() is called without a count here, so the dataset keeps starting over.
dataset = dataset.repeat()

Create an iterator

One-shot

As demonstrated before:

# The dataset size (10) is fixed here, when the graph is built.
dataset = tf.data.Dataset.range(10)

iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # Read all 10 elements; no initializer op is run for a one-shot iterator.
    for i in range(10):
      value = sess.run(next_element)

Note: The max range is pre-determined when building the iterator.

initializable iterator

In the example below, we allow the max range of the iterator to be supplied at runtime using a placeholder.

import tensorflow as tf

# Scalar int64 placeholder holding the dataset's upper bound; fed at runtime.
max_value = tf.placeholder(tf.int64, shape=[])
dataset = tf.data.Dataset.range(max_value)    # Take a placeholder to create a dataset
iterator = dataset.make_initializable_iterator()      # Create an initializable iterator
next_element = iterator.get_next()

with tf.Session() as sess:
    # Initialize an iterator over a dataset with 10 elements using placeholder.
    # The initializer must run before next_element is evaluated.
    sess.run(iterator.initializer, feed_dict={max_value: 10})

    for i in range(10):
        value = sess.run(next_element)
        print(f"{value} ", end=" ")    # 0 1 2 3 ... 9

reinitializable iterator

We can create a single iterator that works with different datasets. For example, we initialize the iterator with the training dataset for training and with the validation dataset for validation. For a reinitializable iterator, both datasets must have the same datatype and shape.

import tensorflow as tf


# The training elements get random integer noise added; the validation
# elements are the plain range values.
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_dataset = tf.data.Dataset.range(50)

# Build an iterator that can take different datasets with the same type and shape.
# The iterator class lives in the tf.data namespace (tf.data.Iterator).
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)
next_element = iterator.get_next()

# Get 2 init ops, one per dataset. Running one points the iterator at that dataset.
training_init_op = iterator.make_initializer(training_dataset)
validation_init_op = iterator.make_initializer(validation_dataset)

with tf.Session() as sess:
    # 20 epochs: one full pass over the training data, then one over validation.
    for _ in range(20):
        sess.run(training_init_op)
        for _ in range(100):
            sess.run(next_element)

        sess.run(validation_init_op)
        for _ in range(50):
            sess.run(next_element)

Feedable iterator

With a reinitializable iterator, we reinitialize the iterator every time we switch datasets. With a feedable iterator, the dataset to read from is selected through the feed_dict passed to tf.Session.run, without reinitializing.

import tensorflow as tf


# Create 2 datasets with the same datatype and shape
training_dataset = tf.data.Dataset.range(300).map(lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_dataset = tf.data.Dataset.range(50)

# Create a feedable iterator that uses a string placeholder to switch between datasets
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, training_dataset.output_types, training_dataset.output_shapes)
next_element = iterator.get_next()

# Create 2 iterators, one per dataset; their handles are fed into `handle` below
training_iterator = training_dataset.make_one_shot_iterator()
validation_iterator = validation_dataset.make_initializable_iterator()

with tf.Session() as sess:
    # Return handles that can be fed as the iterator in sess.run
    training_handle = sess.run(training_iterator.string_handle())
    validation_handle = sess.run(validation_iterator.string_handle())

    # 3 rounds of 100 training reads consume exactly the 300 training elements.
    for _ in range(3):
        for _ in range(100):
            sess.run(next_element, feed_dict={handle: training_handle})

        # The validation iterator is re-initialized each round so that it
        # starts from its first element again.
        sess.run(validation_iterator.initializer)
        for _ in range(50):
            sess.run(next_element, feed_dict={handle: validation_handle})

Dataset shape

Produce data with shape (10,)

import tensorflow as tf

# Slicing a [4, 10] tensor along its first axis gives 4 elements of shape (10,).
dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))

iterator = dataset.make_initializable_iterator()      # Create an initializable iterator
next_element = iterator.get_next()

with tf.Session() as sess:
    # The initializer must run before the first element is read.
    sess.run(iterator.initializer)

    for i in range(4):
      value = sess.run(next_element)
      print(f"{value} ")        # Print out an array with 10 random numbers

Produce data with shape ((), (100,)): a tuple with the first element as a float32 scalar and the second element as an array with 100 int32.

# Both tensors are sliced along their first axis (size 4), so every element is
# a (scalar, vector-of-100) pair.
dataset = tf.data.Dataset.from_tensor_slices(
   (tf.random_uniform([4]),              # (tf.float32, tf.int32)
    tf.random_uniform([4, 100], maxval=100, dtype=tf.int32))) # ((), (100,))

Zip 2 dataset

# NOTE(review): dataset1 and dataset2 are not defined in this snippet; they are
# presumably the (10,)-shaped dataset and the ((), (100,)) tuple dataset from
# the "Dataset shape" examples -- confirm.
# The zipped result is bound to dataset3, the name the prints below read.
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
print(dataset3.output_types)  # ==> (tf.float32, (tf.float32, tf.int32))
print(dataset3.output_shapes)  # ==> (10, ((), (100,)))

Data can be read as

# Unpack one element following its nested (a, (b, c)) structure.
# NOTE(review): presumably `iterator` was created from the zipped dataset -- confirm.
next1, (next21, next22) = iterator.get_next()

Giving labels:

# Passing a dict gives every component a name; the element structure mirrors the keys.
dataset = tf.data.Dataset.from_tensor_slices(
   {"a": tf.random_uniform([4]),
    "b": tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)})
print(dataset.output_types)  # ==> "{'a': tf.float32, 'b': tf.int32}"
print(dataset.output_shapes)  # ==> "{'a': (), 'b': (100,)}"

Consuming Numpy array as data

Saving data in Numpy

import numpy as np
import tensorflow as tf

# Structured record type: a 2-float 'features' vector and an integer 'label'.
dt = np.dtype([('features', float, (2,)), ('label', int)])

# Two zero-initialized records, filled column by column.
x = np.zeros((2,), dtype=dt)
x['features'] = [[3.0, 2.5], [1.4, 2.1]]
x['label'] = [2, 1]

# Persist the structured array in NumPy's .npy format.
np.save('in.npy', x)

Reading Numpy data as TensorFlow dataset.

# np.load on a .npy file returns a plain ndarray (only .npz archives give a
# closeable, context-manager object), so it is loaded directly.
data = np.load("in.npy")
features = data["features"]
labels = data["label"]   # the structured dtype names this field 'label'

# Placeholders keep the (potentially large) arrays out of the graph definition;
# the actual data is supplied when the iterator is initialized.
features_placeholder = tf.placeholder(features.dtype, features.shape)
labels_placeholder = tf.placeholder(labels.dtype, labels.shape)

dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
iterator = dataset.make_initializable_iterator()

with tf.Session() as sess:
    # Feed the NumPy arrays while running the iterator's initializer.
    sess.run(iterator.initializer,
             feed_dict={features_placeholder: features, labels_placeholder: labels})

Consuming TFRecord

To create a dataset from TFRecord and have the iteration keep repeating.

# NOTE(review): get_filenames() is not defined in this snippet; it stands for
# any helper that returns the list of TFRecord paths.
filenames = get_filenames()   # Array of filename paths as string
                              # ["/data/f1.tfrecord", "/data/f2.tfrecord"]
# repeat() is called without a count so that iteration keeps repeating.
dataset = tf.data.TFRecordDataset(filenames).repeat()

Create the operators to parse the dataset.

def parser(serialized_example):
    """Parses a single tf.Example into image and label tensors."""
    ...
    return image, label

# tf.data.Dataset.map takes num_parallel_calls for parallelism; output
# buffering is done with a separate prefetch() transformation.
dataset = dataset.map(parser, num_parallel_calls=batch_size)
dataset = dataset.prefetch(2 * batch_size)

# Batch it up.
dataset = dataset.batch(batch_size)

Create the iterator operators:

iterator = dataset.make_one_shot_iterator()
# Each element is unpacked into the (image, label) pair returned by the parser.
image_batch, label_batch = iterator.get_next()

The full source:

import tensorflow as tf
import os

# Image dimensions; parser() reshapes each decoded record to
# [DEPTH, HEIGHT, WIDTH] using these values.
HEIGHT = 32
WIDTH = 32
DEPTH = 3

# Used by make_batch() to size the shuffle buffer (40% of this value plus slack).
NUM_PER_EPOCH = 50000

def parser(serialized_example):
    """Decode one serialized tf.Example into an (image, label) tensor pair.

    The 'image' feature is read as raw uint8 bytes, reshaped from a flat
    [DEPTH * HEIGHT * WIDTH] vector to [HEIGHT, WIDTH, DEPTH] and cast to
    float32. The 'label' feature is read as int64 and cast to int32.
    """
    feature_spec = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)

    flat_pixels = tf.decode_raw(parsed['image'], tf.uint8)
    flat_pixels.set_shape([DEPTH * HEIGHT * WIDTH])

    # Stored depth-major: reshape to [depth, height, width], then move the
    # depth axis last to get [height, width, depth].
    depth_major = tf.reshape(flat_pixels, [DEPTH, HEIGHT, WIDTH])
    depth_last = tf.transpose(depth_major, [1, 2, 0])
    image = tf.cast(depth_last, tf.float32)
    label = tf.cast(parsed['label'], tf.int32)

    return image, label


def make_batch(batch_size):
    """Read the images and labels from 'filenames'.

    Args:
        batch_size: Number of examples per batch; also used to size the
            parallel map, the prefetch buffer and the shuffle buffer.

    Returns:
        A tuple (image_batch, label_batch) of tensors produced by a one-shot
        iterator over the repeated, parsed, shuffled and batched dataset.
    """
    filenames = [os.path.join(".", 'f1.tfrecords'), os.path.join(".", 'f2.tfrecords')]

    # Repeat infinitely.
    dataset = tf.data.TFRecordDataset(filenames).repeat()

    # Parse records. tf.data.Dataset.map takes num_parallel_calls; output
    # buffering is done with a separate prefetch() transformation.
    dataset = dataset.map(parser, num_parallel_calls=batch_size)
    dataset = dataset.prefetch(2 * batch_size)

    # Shuffle records using a buffer of 40% of an epoch plus a few batches.
    min_queue_examples = int(NUM_PER_EPOCH * 0.4)
    dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)

    # Batch it up.
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return image_batch, label_batch

The shuffle transformation maintains a fixed-size buffer and chooses the next element randomly from that buffer.

# Shuffle using a buffer of 10000 elements.
dataset = dataset.shuffle(buffer_size=10000)

Parsing

Parsing each tfrecords

def _parse(example_proto):
  """Parse one serialized tf.Example into its (image, label) features.

  Args:
    example_proto: A scalar string tensor holding a serialized tf.Example.

  Returns:
    A tuple (image, label): the raw 'image' string and the int64 'label'.
  """
  # FixedLenFeature supports only float32, int64 and string, so the integer
  # label is declared as int64 (cast afterwards if int32 is needed).
  features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
              "label": tf.FixedLenFeature((), tf.int64, default_value=0)}
  parsed_features = tf.parse_single_example(example_proto, features)
  return parsed_features["image"], parsed_features["label"]

filenames = [os.path.join(".", 'f1.tfrecords'), os.path.join(".", 'f2.tfrecords')]
dataset = tf.data.TFRecordDataset(filenames)
# Apply the parser to every serialized record.
dataset = dataset.map(_parse)

Parsing images

def _parse(filename, label):
  """Read a JPEG file, decode it and resize it to 28x28.

  Args:
    filename: A scalar string tensor with the path of the image file.
    label: The label associated with the image; passed through unchanged.

  Returns:
    A tuple (image_resized, label).
  """
  image_string = tf.read_file(filename)
  # decode_jpeg is used instead of decode_image because decode_image returns
  # a tensor with no static shape, which resize_images rejects.
  image_decoded = tf.image.decode_jpeg(image_string)
  image_resized = tf.image.resize_images(image_decoded, [28, 28])
  return image_resized, label

# A vector of filenames.
filenames = tf.constant([os.path.join(".", 'f1.jpg'), os.path.join(".", 'f2.jpg')])
labels = tf.constant([2, 5])

# Each element is a (filename, label) pair, mapped to (resized image, label).
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(_parse)

Using the TF library is preferred over external libraries for performance reasons. Nevertheless, if calling an external library is needed, use tf.py_func.

import cv2

def _read_py_function(filename, label):
  """Read an image with OpenCV; runs as plain Python inside tf.py_func.

  Args:
    filename: The image path as passed in by tf.py_func (a bytes object).
    label: The label for the image; passed through unchanged.

  Returns:
    A tuple (image_decoded, label).
  """
  # tf.py_func delivers string tensors as bytes, so decode to str for OpenCV.
  image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
  return image_decoded, label

def _resize_function(image_decoded, label):
  """Restore static shape information and resize the image to 28x28."""
  # Outputs of tf.py_func carry no static shape; declare a rank-3 shape here.
  # NOTE(review): IMREAD_GRAYSCALE typically yields a 2-D (H, W) array, which
  # may conflict with the rank-3 shape declared here -- confirm and add a
  # channel axis in _read_py_function if needed.
  image_decoded.set_shape([None, None, None])
  image_resized = tf.image.resize_images(image_decoded, [28, 28])
  return image_resized, label

filenames = tf.constant([os.path.join(".", 'f1.jpg'), os.path.join(".", 'f2.jpg')])
labels = tf.constant([2, 5])

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
# Wrap the Python reader in tf.py_func, declaring the output dtypes explicitly.
dataset = dataset.map(
    lambda filename, label: tf.py_func(
        _read_py_function, [filename, label], [tf.uint8, label.dtype]))
dataset = dataset.map(_resize_function)

Writing tfrecords

Example code to write data into tfrecords

import tensorflow as tf
from PIL import Image
import numpy as np
import os

def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature holding a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a single integer value in a tf.train.Feature holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

filename = os.path.join(".", 'f1.tfrecords')

# Load the image into an array so that its dimensions and raw bytes are available.
img = np.array(Image.open(os.path.join(".", 'f1.jpg')))

height = img.shape[0]
width = img.shape[1]

# tobytes() replaces the deprecated tostring() alias (removed in NumPy 2.0).
img_raw = img.tobytes()

# One tf.train.Example holding the image dimensions and its raw bytes.
example = tf.train.Example(features=tf.train.Features(feature={
    'height': _int64_feature(height),
    'width': _int64_feature(width),
    'image_raw': _bytes_feature(img_raw)}))

# The context manager closes the writer even if write() raises.
with tf.python_io.TFRecordWriter(filename) as writer:
    writer.write(example.SerializeToString())

Reading from text lines

Create a text line dataset

# Build a TextLineDataset over the given text files.
filenames = [os.path.join(".", 'f1.txt'), os.path.join(".", 'f2.txt')]
dataset = tf.data.TextLineDataset(filenames)

Filter out first line and comments

filenames = [os.path.join(".", 'f1.txt'), os.path.join(".", 'f2.txt')]

# Start from a dataset of filenames so that skip/filter apply per file.
dataset = tf.data.Dataset.from_tensor_slices(filenames)

# flat_map turns each filename into its own line dataset and concatenates the results.
dataset = dataset.flat_map(
    lambda filename: (
        tf.data.TextLineDataset(filename)
        .skip(1)       # Skip first line
        .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#")))) # Skip comment line

Batching

To create a mini-batch:

dataset = tf.data.Dataset.range(100)
# Group consecutive elements into batches of 4.
batched_dataset = dataset.batch(4)

iterator = batched_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # One run returns a whole batch.
    print(sess.run(next_element))  # [0 1 2 3]

Use padded_batch for padding batches.

dataset = tf.data.Dataset.range(13)

# Each x becomes a vector of length x filled with x:
# x=0 -> [], x=1 -> [1], x=2 -> [2, 2], x=3 -> [3, 3, 3]
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))

# Create a mini-batch of size 4. Pad 0 if needed.
# padded_shapes=[None] leaves the padded length open; the outputs below show
# each batch padded to the length of its longest element.
dataset = dataset.padded_batch(4, padded_shapes=[None])

iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    print(sess.run(next_element))  # ==> [[0, 0, 0], [1, 0, 0], [2, 2, 0], [3, 3, 3]]
    print(sess.run(next_element))  # ==> [[4, 4, 4, 4, 0, 0, 0],
                                   #      [5, 5, 5, 5, 5, 0, 0],
                                   #      [6, 6, 6, 6, 6, 6, 0],
                                   #      [7, 7, 7, 7, 7, 7, 7]]

To run 10 epochs:

# NOTE(review): assumes `sess` is an open session and `iterator` exposes an
# initializer op (e.g. an initializable iterator) -- neither is defined in this snippet.
for _ in range(10):
  # Re-initialize at the start of every epoch so the data is read from the beginning.
  sess.run(iterator.initializer)
  # Consume elements until the dataset is exhausted, which ends the epoch.
  while True:
    try:
      sess.run(next_element)
    except tf.errors.OutOfRangeError:
      break

MonitoredTrainingSession

MonitoredTrainingSession uses OutOfRangeError to signal that training has completed. It is recommended to use make_one_shot_iterator with it.

iterator = dataset.make_one_shot_iterator()
...

# The `...` arguments and `training_op` are placeholders for code not shown here.
with tf.train.MonitoredTrainingSession(...) as sess:
  # Keep running training steps until the session reports that it should stop.
  while not sess.should_stop():
    sess.run(training_op)

Estimator with an input function

Create an input function in iris_data:

def train_input_fn(features, labels, batch_size):
    """Build the training dataset from in-memory features and labels.

    Args:
        features: A mapping (or anything accepted by dict()) of feature
            names to feature values.
        labels: The labels matching the features.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset of (features, labels) batches that is shuffled
        with a buffer of 1000 and repeated.
    """
    # Pair the feature dict with the labels and slice them example by example.
    examples = (dict(features), labels)
    dataset = tf.data.Dataset.from_tensor_slices(examples)

    # Same pipeline order as before: shuffle, then repeat, then batch.
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    return dataset

Pass the function into the estimator:

...
# NOTE(review): my_model, my_feature_columns, iris_data, train_x, train_y and
# args are defined outside this snippet.
classifier = tf.estimator.Estimator(
     model_fn=my_model,
     params={
         'feature_columns': my_feature_columns,
         # Two hidden layers of 10 nodes each.
         'hidden_units': [10, 10],
         # The model must choose between 3 classes.
         'n_classes': 3,
     })

# Train the Model.
# input_fn is a zero-argument callable, so the arguments are bound with a lambda.
classifier.train(
     input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size),
     steps=args.train_steps)
...	 

Estimator with a one-shot iterator

Use make_one_shot_iterator with the Estimator.

import tensorflow as tf
from PIL import Image
import numpy as np
import os

def train_input_fn():
  """Input function reading TFRecord files for an Estimator.

  Returns:
    A tuple (features, labels). `features` is a dict with the keys
    "image_data" and "date_time"; `labels` is an int32 tensor. Both come from
    a one-shot iterator over the parsed, shuffled, batched and repeated
    dataset.
  """
  filenames = ["./file1.tfrecord", "./file2.tfrecord"]
  dataset = tf.data.TFRecordDataset(filenames)

  def parser(record):
    """Parse one serialized tf.Example into ({features}, label)."""
    keys_to_features = {
        "image_data": tf.FixedLenFeature((), tf.string, default_value=""),
        # An int64 feature needs an integer default, not an empty string.
        "date_time": tf.FixedLenFeature((), tf.int64,
                                        default_value=tf.zeros([], dtype=tf.int64)),
        "label": tf.FixedLenFeature((), tf.int64,
                                    default_value=tf.zeros([], dtype=tf.int64)),
    }
    parsed = tf.parse_single_example(record, keys_to_features)

    # The JPEG decoder lives in tf.image.
    image = tf.image.decode_jpeg(parsed["image_data"])
    image = tf.reshape(image, [299, 299, 1])
    label = tf.cast(parsed["label"], tf.int32)

    return {"image_data": image, "date_time": parsed["date_time"]}, label

  dataset = dataset.map(parser)
  dataset = dataset.shuffle(buffer_size=10000)
  dataset = dataset.batch(32)
  # Repeat the batched dataset 10 times.
  dataset = dataset.repeat(10)
  iterator = dataset.make_one_shot_iterator()

  features, labels = iterator.get_next()
  return features, labels

The train, evaluate, and predict methods of the Estimator require input functions to return a tuple of TensorFlow tensors: (features, labels).