I'm trying to benchmark some TF2 keras code - specifically, comparing JIT-compiled performance against non-JITed. tf.test.Benchmark gives reasonable-looking results without JIT - memory usage roughly consistent with nvidia-smi output, and times very close to those of model.fit - but the JITed version reports tiny memory usage (<1Mb, compared to 2.2Gb without JIT) and times that are consistently less than about 30% of the time spent during model.fit.
Code is provided below. I have 3 main questions:
- How can I get an accurate idea of the memory usage of the JITed model?
- What is the source of the discrepancy in speed between the benchmark call and model.fit for the JITed model?
- What is the TF2 way of doing this? I'm using sessions and tf.compat.v1.data.make_one_shot_iterator, but surely there's a way using @tf.function or something? Are there non-TF tools that do this better? (I've put a rough sketch of what I have in mind after the code below.)
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
ALLOW_GROWTH = False # switch to this to use nvidia-smi
JIT = True
TFDS_NAME = 'mnist'
SHAPE = (28, 28, 1)
BATCH_SIZE = 64
NUM_CLASSES = 10
NUM_LAYERS = 20
UNITS = 4096
TRY_GCS = False # switch this if running on colab
TRAIN_STEPS = 200
BURN_ITERS = 200
MIN_ITERS = 200
def model_fn(inp):
    layers = tf.keras.layers
    x = layers.Flatten()(inp)
    for _ in range(NUM_LAYERS):
        x = layers.Dense(UNITS)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
    logits = layers.Dense(NUM_CLASSES)(x)
    model = tf.keras.Model(inp, logits)
    model.compile(
        optimizer=tf.keras.optimizers.SGD(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    return model
def get_dataset():
    return tfds.load(
        TFDS_NAME,
        split='train',
        as_supervised=True,
        in_memory=True,
        try_gcs=TRY_GCS).repeat().shuffle(1024).map(
            lambda image, label: (tf.cast(image, tf.float32) / 255, label),
            tf.data.experimental.AUTOTUNE).batch(BATCH_SIZE).prefetch(
                tf.data.experimental.AUTOTUNE)
def fit(epochs=2, steps_per_epoch=TRAIN_STEPS):
    dataset = get_dataset()
    model = model_fn(tf.keras.Input(shape=SHAPE, dtype=tf.float32))
    model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=epochs)
def benchmark(burn_iters=BURN_ITERS, min_iters=MIN_ITERS):
    with tf.Graph().as_default():
        dataset = get_dataset()
        image, labels = tf.compat.v1.data.make_one_shot_iterator(
            dataset).get_next()
        model = model_fn(tf.keras.Input(tensor=image))
        logits, = model.outputs
        optimizer = model.optimizer
        weights = model.weights
        loss = model.loss(labels, logits)
        grads = optimizer.get_gradients(loss, weights)
        grads_and_vars = tuple(
            (g, v) for g, v in zip(grads, weights) if g is not None)
        op = optimizer.apply_gradients(grads_and_vars)
        op = tf.group((op,) + tuple(model.updates))  # <---

        bm = tf.test.Benchmark()
        with tf.compat.v1.Session() as sess:
            logging.info('Initializing variables...')
            variables = model.weights + optimizer.weights
            for name in ('learning_rate', 'momentum'):
                a = getattr(optimizer, name, None)
                if isinstance(a, tf.Variable):
                    variables.append(a)
            sess.run([v.initializer for v in variables])

            logging.info('Starting benchmarking...')
            result = bm.run_op_benchmark(sess,
                                         op,
                                         burn_iters=burn_iters,
                                         min_iters=min_iters)
            logging.info('Wall time (ms): {}'.format(result['wall_time'] *
                                                     1000))
            gpu_mem = result['extras'].get(
                'allocator_maximum_num_bytes_GPU_0_bfc', 0)
            logging.info('Memory (Mb): {}'.format(gpu_mem / 1024**2))
logging.set_verbosity(logging.INFO)
tf.config.optimizer.set_jit(JIT)
for device in tf.config.experimental.get_visible_devices('GPU'):
    tf.config.experimental.set_memory_growth(device, ALLOW_GROWTH)
benchmark()
fit()
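
For the last question, this is roughly the TF2-native measurement I have in mind, reusing the constants and helpers above. It's an untested sketch, and the tf.config.experimental.get_memory_info call (and whether it exists in a given TF version) is an assumption on my part:

import time

def benchmark_tf2(burn_iters=BURN_ITERS, min_iters=MIN_ITERS):
    dataset = get_dataset()
    model = model_fn(tf.keras.Input(shape=SHAPE, dtype=tf.float32))
    optimizer = model.optimizer
    loss_fn = model.loss
    iterator = iter(dataset)

    @tf.function
    def train_step(image, labels):
        with tf.GradientTape() as tape:
            logits = model(image, training=True)
            loss = loss_fn(labels, logits)
        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        return loss

    # burn-in to absorb tracing / XLA compilation before timing
    for _ in range(burn_iters):
        train_step(*next(iterator))

    start = time.perf_counter()
    for _ in range(min_iters):
        loss = train_step(*next(iterator))
    loss.numpy()  # block until the queued GPU work has actually finished
    wall_time = (time.perf_counter() - start) / min_iters
    logging.info('Wall time (ms): {}'.format(wall_time * 1000))

    # peak GPU allocator usage since startup -- assuming this API is available
    peak = tf.config.experimental.get_memory_info('GPU:0')['peak']
    logging.info('Memory (Mb): {}'.format(peak / 1024**2))

The .numpy() call is only there so the timer stops after the queued GPU work has finished (the graph version gets that synchronization from sess.run), but I don't know whether this ends up measuring the same thing tf.test.Benchmark does internally.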