I set up my pipeline starting with a filename queue, as in the following pseudocode:

filename_queue = tf.train.string_input_producer(["file0.pd", "file1.pd"])

pointing to TFRecord files containing multiple serialized tf.train.Example images.
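For reference, a file with that structure can be produced with something like the following sketch (the write_examples helper and the "image"/"label" feature keys are my assumptions, chosen to match the parsing code further down):

import tensorflow as tf

def write_examples(filename, image_names, labels):
    # Hypothetical helper: serialize one tf.train.Example per image
    # into a single TFRecord file.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for name, label in zip(image_names, labels):
            example = tf.train.Example(features=tf.train.Features(feature={
                "image": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[name.encode()])),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
            }))
            writer.write(example.SerializeToString())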
Following the TensorFlow guide, here is a function that reads one example:
def read_my_file_format(filename_queue):
    reader = tf.SomeReader()
    key, record_string = reader.read(filename_queue)
    example, label = tf.some_decoder(record_string)
    processed_example = some_processing(example)
    return processed_example, label
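For TFRecord files specifically, a concrete version of that pseudocode could look like this (a sketch assuming each record holds an "image" string and an int64 "label", matching the feature keys used further down):

def read_my_file_format(filename_queue):
    # Concrete reader: one serialized tf.train.Example per record.
    reader = tf.TFRecordReader()
    key, record_string = reader.read(filename_queue)
    features = tf.parse_single_example(record_string, {
        "image": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64),
    })
    # some_processing(example) would go here, e.g. decoding and resizing.
    return features["image"], features["label"]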
This function is then used to build a batch queue:
def input_pipeline(filenames, batch_size):
    filename_queue = tf.train.string_input_producer(filenames)
    example, label = read_my_file_format(filename_queue)
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=100,
        min_after_dequeue=10)
    return example_batch, label_batch
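To actually pull batches out of this queue-based pipeline, the session needs queue runners; a minimal usage sketch (batch size and filenames are placeholders):

example_batch, label_batch = input_pipeline(["file0.pd", "file1.pd"], batch_size=32)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        images, labels = sess.run([example_batch, label_batch])
    finally:
        coord.request_stop()
        coord.join(threads)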
I am looking for a way to split the data randomly into training and test sets. I don't want to save the training and test sets to separate files; instead, the images should be randomly assigned to the training or the test set independently of the file they are read from. Ideally I would like to split the input pipeline into a training and a test queue.
Here is what I normally do in numpy when I have to split a huge dataset:

import numpy as np
from numpy.random import choice
from numpy.random import RandomState

queue = range(10)
weights = (.8, .2)  # create 2 partitions with these weights

def sampler(partition, seed=0):
    rng = RandomState(seed)
    return lambda x: rng.choice(np.arange(len(weights)), p=weights) == partition

def split(queue, weights):
    # filter the queue for each partition
    return [filter(sampler(partition), queue) for partition in range(len(weights))]

(train, test) = split(queue, weights)

print(list(train))  # [0, 1, 2, 3, 4, 5, 6, 9]
print(list(test))   # [7, 8]
Suggestion, using the TensorFlow Dataset API (map(), interleave(), filter()):
import tensorflow as tf
import numpy as np

def _parse_function(example_proto):
    """ Parse TFRecord data """
    features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
                "label": tf.FixedLenFeature((), tf.int64, default_value=0)}
    parsed_features = tf.parse_single_example(example_proto, features)
    return parsed_features

def split_train_test(parsed_features, train_rate=0.8, seed=11):
    """ Randomly classify samples into training or testing split """
    # Snippet by Igor Gadelha Pereira (https://stackoverflow.com/a/49825457/624547)
    parsed_features['is_train'] = tf.gather(tf.random_uniform([1], seed=seed) < train_rate, 0)
    return parsed_features

def filter_per_split(parsed_features, train=True):
    """ Filter samples depending on their split """
    return parsed_features['is_train'] if train else ~parsed_features['is_train']

def select_features(parsed_features, keys=["image", "label"]):
    """ Return the list of features selected by key """
    selected_features = [parsed_features[key] for key in keys]
    return selected_features
weights = (.8, .2)
num_files = 3
file_block_length = 1
files = ["/tmp/file{}.tfrecords".format(i) for i in range(num_files)]
# ... where file{i}.tfrecords contains:
# [{"label": i, "image": "class_{}/img_{}.png".format(i, k)} for k in range(10)]

# Create TFRecord file list:
files = tf.data.Dataset.from_tensor_slices(files)

# Interleave all records:
dataset = files.interleave(lambda x: tf.data.TFRecordDataset(x),
                           cycle_length=num_files, block_length=file_block_length)
# ^ dataset containing:
# [rec0@file0, rec0@file1, rec0@file2, rec1@file0, rec1@file1, rec1@file2, ...]

# Parse TFRecord samples:
dataset = dataset.map(_parse_function)

# Randomly classify samples between training or testing:
dataset = dataset.map(lambda x: split_train_test(x, train_rate=weights[0]))

# Split into 2 datasets accordingly:
dataset_train = dataset.filter(lambda x: filter_per_split(x, train=True))
dataset_test = dataset.filter(lambda x: filter_per_split(x, train=False))

# Optionally, remove the "is_train" key, keeping only the original features:
dataset_train = dataset_train.map(select_features)
dataset_test = dataset_test.map(select_features)

# Use:
iterator_train = dataset_train.make_one_shot_iterator()
iterator_test = dataset_test.make_one_shot_iterator()
with tf.Session() as sess:
    for it, name in zip([iterator_train, iterator_test], ["Training", "Testing"]):
        x = it.get_next()
        count = 0
        print("{} Split:".format(name))
        try:
            while True:
                print(sess.run(x))
                count += 1
        except tf.errors.OutOfRangeError:
            print("- End of Split ({} / {})".format(count, num_files * 10))
Output:
Training Split:
(b'class_0/img_0.png', 0)
(b'class_1/img_0.png', 1)
(b'class_2/img_0.png', 2)
(b'class_0/img_1.png', 0)
(b'class_1/img_1.png', 1)
(b'class_1/img_2.png', 1)
(b'class_2/img_2.png', 2)
(b'class_0/img_3.png', 0)
(b'class_1/img_3.png', 1)
(b'class_2/img_3.png', 2)
(b'class_1/img_4.png', 1)
(b'class_2/img_4.png', 2)
(b'class_0/img_5.png', 0)
(b'class_1/img_5.png', 1)
(b'class_2/img_5.png', 2)
(b'class_0/img_6.png', 0)
(b'class_1/img_6.png', 1)
(b'class_2/img_6.png', 2)
(b'class_0/img_7.png', 0)
(b'class_1/img_7.png', 1)
(b'class_2/img_7.png', 2)
(b'class_0/img_8.png', 0)
(b'class_1/img_8.png', 1)
(b'class_2/img_8.png', 2)
(b'class_0/img_9.png', 0)
(b'class_1/img_9.png', 1)
(b'class_2/img_9.png', 2)
- End of Split (27 / 30)
Testing Split:
(b'class_2/img_1.png', 2)
(b'class_0/img_2.png', 0)
(b'class_0/img_4.png', 0)
- End of Split (3 / 30)
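Note that split_train_test draws its assignment from a random op, so a given sample only lands in the same split as long as the seeded op runs over the data in the same order; re-iterating or reshuffling the dataset can move samples between splits. If a stable split is needed, a deterministic variant can hash a per-sample key instead (a sketch, assuming the "image" feature holds a unique string per sample):

def split_train_test_deterministic(parsed_features, train_rate=0.8, num_buckets=100):
    """ Deterministically assign samples to a split by hashing their key """
    # Hash the (assumed unique) image string into num_buckets buckets;
    # the first train_rate fraction of buckets becomes the training split.
    bucket = tf.string_to_hash_bucket_fast(parsed_features["image"], num_buckets)
    parsed_features['is_train'] = bucket < int(train_rate * num_buckets)
    return parsed_features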