1

我使用 Titan Xp GPU。代码在下面,但我不知道问题出在哪里。为什么每个 epoch 的训练时间不断增加?最初我每分钟可以处理大约 180 个批次,但在三个 epoch 之后,我每分钟只能处理 5 个批次。

def _load_train_example(path, label):
    """Read and decode one PNG file and one-hot encode its integer label."""
    image = tf.io.decode_png(tf.io.read_file(path))
    return image, tf.one_hot(label, depth=num_classes)


# Training input pipeline: (image, one-hot label) batches.
#
# Paths and integer labels are sliced together so every pair stays aligned,
# and the shuffle happens *before* decoding. The shuffle buffer therefore
# holds short path strings rather than decoded images, which keeps memory low
# and lets the buffer span the whole training set instead of three batches.
# Labels are one-hot encoded per element inside the map, so the full one-hot
# matrix is never materialised up front.
# NOTE(review): assumes X_train supports len() and is index-aligned with
# Y_train; reduce the buffer size if the per-epoch shuffle warm-up is too slow.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
train_dataset = train_dataset.shuffle(len(X_train))
train_dataset = train_dataset.map(_load_train_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.batch(batch_size)
# Prefetch so input preparation overlaps with the training step.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Validation input pipeline: (image, one-hot label) batches.
# File paths are read and PNG-decoded in parallel.
val_image = tf.data.Dataset.from_tensor_slices(X_val)
val_image_dataset = val_image.map(lambda x: tf.io.decode_png(tf.io.read_file(x)), num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_label_dataset = tf.data.Dataset.from_tensor_slices(tf.one_hot(Y_val, depth=num_classes))
# No shuffle here: evaluation gains nothing from a random order, and a fixed
# order keeps the batch composition identical from epoch to epoch, so the
# monitored val_loss is comparable between epochs. It also avoids holding a
# buffer of decoded images for no benefit.
val_dataset = tf.data.Dataset.zip((val_image_dataset, val_label_dataset)).batch(batch_size)
# Prefetch so input preparation overlaps with the evaluation step.
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)


##### CALLBACKS
# Save weights only, once per epoch, and only when val_loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_weights_only=True,
    save_freq='epoch',
    save_best_only=True,
)

# Write TensorBoard summaries to log_folder at the end of every epoch.
tensorboard_callbacks = tf.keras.callbacks.TensorBoard(
    log_dir=log_folder,
    update_freq='epoch',
)

# Learning rate is driven by step_decay (defined outside this section).
reduce_lr = LearningRateScheduler(step_decay, verbose=1)

# Stop training once val_loss has failed to improve for 10 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
)

# Order matches the list handed to model.fit().
callback = [
    reduce_lr,
    early_stopping,
    checkpoint,
    tensorboard_callbacks,
]

# Adam optimizer constructed with its default arguments.
opt = tf.keras.optimizers.Adam()

# Metrics tracked during training and validation.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
    tf.keras.metrics.AUC(name='auc'),
    f1_score,  # custom metric defined outside this section
]

# MobileNetV2 classifier for single-channel 224x224 inputs. weights=None means
# no pretrained weights are requested; the weights come from the checkpoint
# loaded on the next line.
model = tf.keras.applications.MobileNetV2(
    include_top=True,
    weights=None,
    input_shape=(224, 224, 1),
    classes=num_classes,
)
model.load_weights("/weights-05-val_loss-0.215.hdf5")
model.summary()

# METRICS is already a list, so it is passed directly instead of being
# wrapped in a second list.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=METRICS)

# Resume after the 5 epochs already trained (initial_epoch=5) and run up to
# epoch 100. The `shuffle` argument is intentionally omitted: Keras ignores it
# when the input is a tf.data.Dataset, so ordering is controlled by the
# dataset pipeline itself.
model.fit(
    train_dataset,
    epochs=100,
    validation_data=val_dataset,
    callbacks=callback,
    verbose=1,
    initial_epoch=5,
)

Epoch 00006: 77428/77428 [==============================] - 59041s 763ms/step

Epoch 00007: 77427/77428 [============================>.] - 68783s 888ms/step

4

0 回答 0