我正在尝试使用谷歌云的 TPU 训练我的模型。该模型在 CPU 和 GPU 上运行良好,我也可以毫无问题地运行 TPU 教程(因此连接到 TPU 不是问题)。然而,当我在云 TPU 上运行自己的程序时,出现了一个错误。最关键的一行可能是:
NotImplementedError: Non-resource Variables are not supported inside TPU computations (operator name: training_op/update_2nd_caps/primary_to_first_fc/W/ApplyAdam/RefEnter)
以下是完整的错误信息,以防其中还有其他重要内容:
Traceback (most recent call last):
File "TPU_playground.py", line 85, in <module>
capser.train(input_fn=train_input_fn_tpu, steps=n_steps)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 366, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1119, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1132, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1992, in _call_model_fn
features, labels, mode, config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1107, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2223, in _model_fn
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2537, in _train_on_tpu_system
device_assignment=ctx.device_assignment)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 733, in shard
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 394, in replicate
device_assignment, name)[1]
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 546, in split_compile_and_replicate
outputs = computation(*computation_inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2530, in multi_tpu_train_steps_on_single_shard
single_tpu_train_step, [_INITIAL_LOSS])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 207, in repeat
cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 169, in while_loop
name="")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 3209, in while_loop
result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2941, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2878, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 120, in body_wrapper
outputs = body(*(inputs + dequeue_ops))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 203, in body_wrapper
return [i + 1] + _convert_to_list(body(*args))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1166, in train_step
self._call_model_fn(features, labels))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1337, in _call_model_fn
estimator_spec = self._model_fn(features=features, **kwargs)
File "/home/adrien_doerig/capser/capser_7_model_fn.py", line 100, in model_fn_tpu
**output_decoder_deconv_params)
File "/home/adrien_doerig/capser/capser_model.py", line 341, in capser_model
loss_training_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step(), name="training_op")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 409, in minimize
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py", line 114, in apply_gradients
return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 602, in apply_gradients
update_ops.append(processor.update_op(self, grad))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 113, in update_op
update_op = optimizer._apply_dense(g, self._v) # pylint: disable=protected-access
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/adam.py", line 148, in _apply_dense
grad, use_locking=self._use_locking).op
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/gen_training_ops.py", line 293, in apply_adam
use_nesterov=use_nesterov, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1782, in __init__
self._control_flow_post_processing()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1793, in _control_flow_post_processing
self._control_flow_context.AddOp(self)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2430, in AddOp
self._AddOpInternal(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2451, in _AddOpInternal
real_x = self.AddValue(x)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2398, in AddValue
self._outer_context.AddInnerOp(enter.op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 310, in AddInnerOp
self._AddOpInternal(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 287, in _AddOpInternal
"(operator name: %s)" % op.name)
NotImplementedError: Non-resource Variables are not supported inside TPU computations (operator name: training_op/update_2nd_caps/primary_to_first_fc/W/ApplyAdam/RefEnter)
图的前向传播似乎构建得没有问题,但在这种情况下,TPU 不支持使用 AdamOptimizer 的反向传播。我尝试改用更标准的优化器(GradientDescentOptimizer 和 MomentumOptimizer),但没有帮助。前向传播中的所有张量都采用与 TPU 兼容的格式(即 tf.float32)。
有人对我应该尝试什么有建议吗?
谢谢!