建议,使用 TensorFlow Dataset API ( map()
, interleave()
, filter()
):
import tensorflow as tf
import numpy as np
def _parse_function(example_proto):
""" Parse TFRecord data """
features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
"label": tf.FixedLenFeature((), tf.int64, default_value=0)}
parsed_features = tf.parse_single_example(example_proto, features)
return parsed_features
def split_train_test(parsed_features, train_rate=0.8, seed=11):
""" Randomly classify samples into training or testing split """
# Snippet by Igor Gadelha Pereira (https://stackoverflow.com/a/49825457/624547)
parsed_features['is_train'] = tf.gather(tf.random_uniform([1], seed=seed) < train_rate, 0)
return parsed_features
def filter_per_split(parsed_features, train=True):
""" Filter samples depending on their split """
return parsed_features['is_train'] if train else ~parsed_features['is_train']
def select_features(parsed_features, keys=["image", "label"]):
""" Return array of features selected by key """
selected_features = [parsed_features[key] for key in keys]
return selected_features
weights = (.8,.2)
num_files = 3
file_block_length = 1
files = ["/tmp/file{}.tfrecords".format(i) for i in range(num_files)]
# ... where file{i}.tfrecords contains:
# [{"label": i, "image": "class_{}/img_{}.png".format(i, k)} for k in range(10)]
# Create TFRecord file list list:
files = tf.data.Dataset.from_tensor_slices(files)
# Interleave all records:
dataset = files.interleave(lambda x: tf.data.TFRecordDataset(x),
cycle_length=num_files, block_length=file_block_length)
# ^ dataset containing:
# [rec0@file0, rec0@file1, rec0@file2, rec1@file0, rec1@file1, rec1@file2, ...]
# Parse TFRecord samples:
dataset = dataset.map(_parse_function)
# Randomly classify samples between training or testing:
dataset = dataset.map(lambda x: split_train_test(x, train_rate=weights[0]))
# Split into 2 datasets accordingly:
dataset_train = dataset.filter(lambda x: filter_per_split(x, train=True))
dataset_test = dataset.filter(lambda x: filter_per_split(x, train=False))
# Opt. remove "is_train" key, keeping only the original features:
dataset_train = dataset_train.map(select_features)
dataset_test = dataset_test.map(select_features)
# Use:
iterator_train = dataset_train.make_one_shot_iterator()
iterator_test = dataset_test.make_one_shot_iterator()
with tf.Session() as sess:
for it, name in zip([iterator_train, iterator_test], ["Training", "Testing"]):
x = it.get_next()
count = 0
print("{} Split:".format(name))
try:
while True:
print(sess.run(x))
count += 1
except:
print("- End of Split ({} / {}".format(count, num_files * 10))
输出:
Training Split:
(b'class_0/img_0.png', 0)
(b'class_1/img_0.png', 1)
(b'class_2/img_0.png', 2)
(b'class_0/img_1.png', 0)
(b'class_1/img_1.png', 1)
(b'class_1/img_2.png', 1)
(b'class_2/img_2.png', 2)
(b'class_0/img_3.png', 0)
(b'class_1/img_3.png', 1)
(b'class_2/img_3.png', 2)
(b'class_1/img_4.png', 1)
(b'class_2/img_4.png', 2)
(b'class_0/img_5.png', 0)
(b'class_1/img_5.png', 1)
(b'class_2/img_5.png', 2)
(b'class_0/img_6.png', 0)
(b'class_1/img_6.png', 1)
(b'class_2/img_6.png', 2)
(b'class_0/img_7.png', 0)
(b'class_1/img_7.png', 1)
(b'class_2/img_7.png', 2)
(b'class_0/img_8.png', 0)
(b'class_1/img_8.png', 1)
(b'class_2/img_8.png', 2)
(b'class_0/img_9.png', 0)
(b'class_1/img_9.png', 1)
(b'class_2/img_9.png', 2)
- End of Split (27 / 30
Testing Split:
(b'class_2/img_1.png', 2)
(b'class_0/img_2.png', 0)
(b'class_0/img_4.png', 0)
- End of Split (3 / 30