我正在尝试将 Optuna 与DeepSpeech集成以优化它的一些超参数。我现在坚持学习率,只是为了了解 Optuna 的工作原理,但我遇到了障碍,需要一些帮助。
我有一个功能hps_train
,它是训练步骤。它将 Optuna 试验对象作为参数并返回开发损失,这是我想使用 Optuna 来最小化的。这与 中的函数完全相同train()
,training/deepspeech_training/train.py
但有一些修改:
def hps_train(trial):
#.
#.Same as train() in https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
if FLAGS.horovod:
# Effective batch size in synchronous distributed training is scaled by the number of workers. An increase in learning rate compensates for the increased batch size.
optimizer = hps_create_optimizer(learning_rate_var * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)
else:
optimizer, learning_rate_var = hps_create_optimizer(trial)
reduce_learning_rate_op = learning_rate_var.assign(
tf.multiply(learning_rate_var, FLAGS.plateau_reduction)
)
#.
#.Same as train() https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
with tfv1.Session(config=Config.session_config) as session:
#.
#.Same as train() https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
final_dev_loss = dev_losses[-1]
log_debug("Session closed.")
return final_dev_loss
我还有一些辅助功能:
def hps_create_optimizer(trial):
learning_rate = trial.suggest_float("adam_lr", 1e-5, 1e-1, log=True)
with tf.variable_scope("learning_rate", reuse=tf.AUTO_REUSE):
learning_rate_var = tfv1.get_variable(
"learning_rate", initializer=learning_rate, trainable=False
)
optimizer = tfv1.train.AdamOptimizer(
learning_rate=learning_rate_var, beta1=0.9, beta2=0.999, epsilon=1e-08
)
return optimizer, learning_rate_var
def new_trial_callback(study, trial):
chkpt_path = setup_dirs(study.study_name, trial.number + 1)
FLAGS.checkpoint_dir = chkpt_path
FLAGS.save_checkpoint_dir = chkpt_path
FLAGS.load_checkpoint_dir = chkpt_path
def objective(trial, session):
if FLAGS.train_files:
val_loss = hps_train(trial, session)
return float(val_loss)
def objective_tf(trial):
tfv1.reset_default_graph()
with tfv1.Graph().as_default():
return objective(trial, session)
把它们放在一起:
def main(_):
initialize_globals()
early_training_checks()
lr_study = optuna.create_study(study_name="lr_study", direction='minimize')
chkpt_dir = setup_dirs(lr_study.study_name, 0)
FLAGS.checkpoint_dir = chkpt_dir
FLAGS.save_checkpoint_dir = chkpt_dir
FLAGS.load_checkpoint_dir = chkpt_dir
lr_study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
当我运行此代码时,第一次运行正常完成。但是,当它尝试启动第二个时,我收到一个错误:
$ python training/hparam_search.py --train_files ~/datasets/cv-corpus-1/en/clips/train.csv --dev_files ~/datasets/cv-corpus-1/en/clips/dev.csv --test_files ~/datasets/cv-corpus-1/en/clips/test.csv --train_batch_size 64 --test_batch_size 64 --dev_batch_size 64 --n_hidden 512 --epochs 1 --train_cudnn --use_allow_growth --checkpoint_dir checkpoints
[I 2021-08-30 15:06:16,637] A new study created in memory with name: lr_study
I Could not find best validating checkpoint.
I Could not find most recent checkpoint.
I Initializing all variables.
I STARTING Optimization
Epoch 0 | Training | Elapsed Time: 0:00:17 | Steps: 187 | Loss: 252.374135
Epoch 0 | Validation | Elapsed Time: 0:00:12 | Steps: 109 | Loss: 255.176724 | Dataset: /home/user/datasets/cv-corpus-1/en/clips/dev.csv
I Saved new best validating model with loss 255.176724 to: checkpoints/optuna_trials/lr_study/0/best_dev-187
--------------------------------------------------------------------------------
I FINISHED optimization in 0:00:30.553797
[I 2021-08-30 15:06:50,101] Trial 0 finished with value: 255.1767243551552 and parameters: {'adam_lr': 0.006636434104761772}. Best is trial 0 with value: 255.1767243551552.
[W 2021-08-30 15:06:50,229] Trial 1 failed because of the following error: ValueError('in converted code:\n relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:\n\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call\n training)\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward\n seed=self._seed)\n contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn\n outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)\n python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3\n time_major=time_major, name=name)\n python/framework/op_def_library.py:367 _apply_op_helper\n g = ops._get_graph_from_inputs(_Flatten(keywords.values()))\n python/framework/ops.py:5979 _get_graph_from_inputs\n _assert_same_graph(original_graph_element, graph_element)\n python/framework/ops.py:5914 _assert_same_graph\n (item, original_item))\n\n ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).\n',)
Traceback (most recent call last):
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
Traceback (most recent call last):
File "training/hparam_search.py", line 691, in <module>
absl.app.run(main)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "training/hparam_search.py", line 684, in main
lr_study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/study.py", line 409, in optimize
show_progress_bar=show_progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 76, in _optimize
progress_bar=progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
trial = _run_trial(study, func, catch)
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 264, in _run_trial
raise func_err
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
看起来 ValueError 抱怨某些张量与另一个张量不同。但我不明白这是怎么回事,因为我每次运行都是在一个新的 Graph 上下文中开始的,所以每个张量都应该与这个新图相关联。
Optuna 版本是 2.9.1,Tensorflow 版本是 1.15.4
如果您能深入了解我在哪里出错,或者即使这是使用 Optuna 的推荐方式,我将不胜感激。非常感谢!