Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Split queue into train/test set

I set up my pipeline starting with a filename queue as in the following pseudocode:

# Queue of the input file names that the readers consume.
filename_queue = tf.train.string_input_producer(
    ["file0.pd", "file1.pd"]
)

pointing to TFRecords files that contain multiple serialized tf.train.Example images. Following the TensorFlow guide, I have a function which reads one example:

def read_my_file_format(filename_queue):
  """Read, decode and preprocess a single example from the filename queue.

  Pseudocode: `tf.SomeReader`, `tf.some_decoder` and `some_processing` stand
  in for the concrete reader, decoder and preprocessing step.

  Args:
    filename_queue: queue of input file names handed to the reader.

  Returns:
    A `(processed_example, label)` pair.
  """
  # The key returned alongside the record is not used.
  _, serialized_record = tf.SomeReader().read(filename_queue)
  raw_example, label = tf.some_decoder(serialized_record)
  return some_processing(raw_example), label

which is used for a batch queue:

def input_pipeline(filenames, batch_size):
  """Build shuffled batches from the examples stored in `filenames`.

  Args:
    filenames: file names fed to `tf.train.string_input_producer`.
    batch_size: number of examples per batch.

  Returns:
    An `(example_batch, label_batch)` pair produced by
    `tf.train.shuffle_batch`.
  """
  single_example, single_label = read_my_file_format(
      tf.train.string_input_producer(filenames))

  batches = tf.train.shuffle_batch(
      [single_example, single_label],
      batch_size=batch_size,
      capacity=100,
      min_after_dequeue=10)
  example_batch, label_batch = batches
  return example_batch, label_batch

I am looking for a way to split the data randomly into training and test sets. I don't want to save the training and test sets into different files; instead, the images should be randomly assigned to the training or the test set, independent of the file they are read from. Ideally I would like to split the input pipeline into a training queue and a test queue.

Here is what I normally do in numpy when I have to split a huge dataset:

import numpy as np
from numpy.random import choice
from numpy.random import RandomState

queue = range(0, 10)
# Create 2 partitions with these weights.
weights = (0.8, 0.2)

def sampler(partition, seed=0, p=None):
    """Build a predicate that randomly assigns items to `partition`.

    Each call of the returned predicate draws one partition index from a
    private `RandomState(seed)` and reports whether it equals `partition`.
    Samplers built with the same seed and probabilities make the same
    sequence of draws, so filtering one queue with each of them (in the
    same item order) puts every item into exactly one partition.

    Args:
        partition: index of the partition this predicate selects.
        seed: seed of the predicate's private random number generator.
        p: probability of each partition. When omitted, the module-level
            `weights` is used, as before this parameter existed.

    Returns:
        A one-argument function whose result is truthy when the item it is
        called with was drawn into `partition`. The item's value is ignored;
        only the number of calls made so far matters.
    """
    probabilities = weights if p is None else p
    rng = RandomState(seed)
    return lambda x: rng.choice(np.arange(len(probabilities)), p=probabilities) == partition

def split(queue, weights):
    """Randomly split `queue` into `len(weights)` disjoint partitions.

    Args:
        queue: iterable of items to distribute.
        weights: probability of each partition; must sum to 1.

    Returns:
        A list with one `filter(...)` result per weight (a lazy iterator
        under Python 3), in the order of `weights`.
    """
    # Hand the caller's weights to the sampler; without `p` it would fall back
    # to the module-level `weights` and ignore this function's argument.
    return [filter(sampler(partition, p=weights), queue)
            for partition in range(len(weights))]

train, test = split(queue, weights)

# Expected output is shown next to each call.
print(list(train))  # [0, 1, 2, 3, 4, 5, 6, 9]
print(list(test))   # [7, 8]
like image 644
Manuel Schmidt Avatar asked Apr 04 '17 17:04

Manuel Schmidt


1 Answer

Suggestion: use the TensorFlow Dataset API (map(), interleave(), filter()):

import tensorflow as tf
import numpy as np

def _parse_function(example_proto):
    """Parse one serialized TFRecord example into a dict of features.

    Args:
        example_proto: serialized example passed to
            `tf.parse_single_example`.

    Returns:
        The parsed features: a string "image" (default "") and an int64
        "label" (default 0).
    """
    feature_spec = {
        "image": tf.FixedLenFeature((), tf.string, default_value=""),
        "label": tf.FixedLenFeature((), tf.int64, default_value=0),
    }
    return tf.parse_single_example(example_proto, feature_spec)

def split_train_test(parsed_features, train_rate=0.8, seed=11):
    """Randomly tag a sample as belonging to the training or testing split.

    Adds a boolean "is_train" entry to `parsed_features` (the dict is
    modified in place) and returns it.

    Args:
        parsed_features: dict of features of one sample.
        train_rate: threshold in [0, 1]; the sample is tagged as training
            when the uniform random draw falls below it.
        seed: op-level seed passed to `tf.random_uniform`.

    Returns:
        `parsed_features`, with the extra "is_train" entry.
    """
    # Snippet by Igor Gadelha Pereira (https://stackoverflow.com/a/49825457/624547)
    draw = tf.random_uniform([1], seed=seed)
    below_rate = draw < train_rate
    # The draw has shape [1]; gather element 0 to obtain a scalar flag.
    parsed_features['is_train'] = tf.gather(below_rate, 0)
    return parsed_features

def filter_per_split(parsed_features, train=True):
    """Tell whether a sample belongs to the requested split.

    Args:
        parsed_features: dict of features holding an "is_train" flag.
        train: True to select training samples, False to select the others.

    Returns:
        The "is_train" flag itself when `train` is true, otherwise its
        inversion (`~flag`).
    """
    is_train = parsed_features['is_train']
    if train:
        return is_train
    return ~is_train

def select_features(parsed_features, keys=("image", "label")):
    """Return the features selected by `keys`, in that order.

    Args:
        parsed_features: dict of features of one sample.
        keys: iterable of feature names to keep. The default is an immutable
            tuple so that it cannot be altered between calls.

    Returns:
        A list holding `parsed_features[key]` for each key in `keys`.

    Raises:
        KeyError: if a requested key is missing from `parsed_features`.
    """
    return [parsed_features[key] for key in keys]

# weights[0] is used as the training rate when samples are tagged.
weights = (0.8, 0.2)
num_files = 3
file_block_length = 1
files = list(map("/tmp/file{}.tfrecords".format, range(num_files)))
# ... where file{i}.tfrecords contains:
# [{"label": i, "image": "class_{}/img_{}.png".format(i, k)} for k in range(10)]

# Dataset of the TFRecord file names:
files = tf.data.Dataset.from_tensor_slices(files)
# Interleave the records of all files:
dataset = files.interleave(
    lambda path: tf.data.TFRecordDataset(path),
    cycle_length=num_files,
    block_length=file_block_length,
)
# ^ dataset containing:
# [rec0@file0, rec0@file1, rec0@file2, rec1@file0, rec1@file1, rec1@file2, ...]

# Parse the TFRecord samples:
dataset = dataset.map(_parse_function)

# Randomly tag each sample as training or testing:
dataset = dataset.map(
    lambda sample: split_train_test(sample, train_rate=weights[0]))

# Split into 2 datasets according to that tag:
dataset_train = dataset.filter(
    lambda sample: filter_per_split(sample, train=True))
dataset_test = dataset.filter(
    lambda sample: filter_per_split(sample, train=False))

# Optional: drop the "is_train" key, keeping only the original features:
dataset_train = dataset_train.map(select_features)
dataset_test = dataset_test.map(select_features)

# Use: print every sample of each split, followed by the split's size.
iterator_train = dataset_train.make_one_shot_iterator()
iterator_test = dataset_test.make_one_shot_iterator()
with tf.Session() as sess:
    for it, name in zip([iterator_train, iterator_test], ["Training", "Testing"]):
        # Build the fetch op once per iterator; each sess.run() yields the next sample.
        x = it.get_next()
        count = 0
        print("{} Split:".format(name))
        try:
            while True:
                print(sess.run(x))
                count += 1
        except tf.errors.OutOfRangeError:
            # Raised by sess.run() once the iterator is exhausted. Catching only
            # this error lets real failures (and Ctrl-C) propagate instead of
            # being reported as the end of the split.
            # The total assumes each file holds 10 records.
            print("- End of Split ({} / {}".format(count, num_files * 10))

Output:

Training Split:
(b'class_0/img_0.png', 0)
(b'class_1/img_0.png', 1)
(b'class_2/img_0.png', 2)
(b'class_0/img_1.png', 0)
(b'class_1/img_1.png', 1)
(b'class_1/img_2.png', 1)
(b'class_2/img_2.png', 2)
(b'class_0/img_3.png', 0)
(b'class_1/img_3.png', 1)
(b'class_2/img_3.png', 2)
(b'class_1/img_4.png', 1)
(b'class_2/img_4.png', 2)
(b'class_0/img_5.png', 0)
(b'class_1/img_5.png', 1)
(b'class_2/img_5.png', 2)
(b'class_0/img_6.png', 0)
(b'class_1/img_6.png', 1)
(b'class_2/img_6.png', 2)
(b'class_0/img_7.png', 0)
(b'class_1/img_7.png', 1)
(b'class_2/img_7.png', 2)
(b'class_0/img_8.png', 0)
(b'class_1/img_8.png', 1)
(b'class_2/img_8.png', 2)
(b'class_0/img_9.png', 0)
(b'class_1/img_9.png', 1)
(b'class_2/img_9.png', 2)
- End of Split (27 / 30
Testing Split:
(b'class_2/img_1.png', 2)
(b'class_0/img_2.png', 0)
(b'class_0/img_4.png', 0)
- End of Split (3 / 30
like image 146
benjaminplanche Avatar answered Oct 30 '22 18:10

benjaminplanche