2

运行笔记本时出现以下错误:

InvalidArgumentError                      Traceback (most recent call last)
    <ipython-input-77-e16e0fd6d275> in <module>()
    ----> 1 tpu_ops = tf.contrib.tpu.batch_parallel(run_find_closest_latent_vector, [], num_shards=8)
      2 
      3 def run_once():
      4   session_tpu.run(tpu_ops)
      5 

    /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.pyc in batch_parallel(computation, inputs, num_shards, infeed_queue, device_assignment, name)
    981       infeed_queue=infeed_queue,
    982       device_assignment=device_assignment,
    --> 983       name=name)
    984 
    985 

    /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.pyc in shard(computation, inputs, num_shards, input_shard_axes, outputs_from_all_shards, output_shard_axes, infeed_queue, device_assignment, name)
    879       infeed_queue=infeed_queue,
    880       device_assignment=device_assignment,
    --> 881       name=name)
    882 
    883   # There must be at least one shard since num_shards > 0.

    /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.pyc in replicate(computation, inputs, infeed_queue, device_assignment, name)
    505   """
    506   return split_compile_and_replicate(computation, inputs, infeed_queue,
    --> 507                                      device_assignment, name)[1]
    508 
    509 

    /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.pyc in split_compile_and_replicate(***failed resolving arguments***)
    682       vscope.set_custom_getter(custom_getter)
    683 
    --> 684       outputs = computation(*computation_inputs)
    685 
    686       vscope.set_use_resource(saved_use_resource)

    <ipython-input-76-66eb3bb2ffa2> in run_find_closest_latent_vector()
     34 
     35 def run_find_closest_latent_vector():
    ---> 36   result = find_closest_latent_vector(num_optimization_steps=40)
     37   display_images(result[0], [("Loss: %.2f" % loss) for loss in result[1]])
     38 

    <ipython-input-76-66eb3bb2ffa2> in find_closest_latent_vector(num_optimization_steps)
     22 
     23     with tf.Session(tpu_address) as session_tpu:
    ---> 24       session_tpu.run(tf.global_variables_initializer())
     25       optimizer = tf.train.AdamOptimizer(learning_rate=0.3)
     26       train = optimizer.minimize(loss)

    /usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    927     try:
    928       result = self._run(None, fetches, feed_dict, options_ptr,
    --> 929                          run_metadata_ptr)
    930       if run_metadata:
    931         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

    /usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    1150     if final_fetches or final_targets or (handle and feed_dict_tensor):
    1151       results = self._do_run(handle, final_targets, final_fetches,
    -> 1152                              feed_dict_tensor, options, run_metadata)
    1153     else:
    1154       results = []

    /usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
    1326     if handle is None:
    1327       return self._do_call(_run_fn, feeds, fetches, targets, options,
    -> 1328                            run_metadata)
    1329     else:
    1330       return self._do_call(_prun_fn, handle, feeds, fetches)

    /usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
    1346           pass
    1347       message = error_interpolation.interpolate(message, self._graph)
    -> 1348       raise type(e)(node_def, op, message)
    1349 
    1350   def _extend_graph(self):

    InvalidArgumentError: Unsuccessful TensorSliceReader constructor: Failed to get matching files on /tmp/tfhub_modules/2f9e2f0be115550c7ae9b90bb71b29e76fa404d8/variables/variables: Unimplemented: File system scheme '[local]' not implemented (file: '/tmp/tfhub_modules/2f9e2f0be115550c7ae9b90bb71b29e76fa404d8/variables/variables')
    [[node checkpoint_initializer_9 (defined at /usr/local/lib/python2.7/dist-packages/tensorflow_hub/native_module.py:395)  = RestoreV2[dtypes=[DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](checkpoint_initializer/prefix, checkpoint_initializer_9/tensor_names, checkpoint_initializer/shape_and_slices)]]

错误信息中的 Unimplemented: File system scheme '[local]' not implemented 可能与这个问题有关。

我无法直接访问这些文件的路径,因此无法在需要的地方给路径末尾加上 \。

这是笔记本中的代码块:

    def _get_beta_accumulators(self):
        return self._beta1_power, self._beta2_power

    def find_closest_latent_vector(num_optimization_steps):
      """Optimize a latent vector so the generated image matches `target_image`.

      Builds a fresh graph around the ProGAN hub module and runs
      `num_optimization_steps` Adam steps in a session connected to
      `tpu_address`.

      Args:
        num_optimization_steps: Number of optimizer steps to run.

      Returns:
        A tuple `(images, losses)` holding the generated image and the loss
        value fetched at each step, in step order.
      """
      images = []
      losses = []
      with tf.Graph().as_default():
        # NOTE(review): the reported "File system scheme '[local]' not
        # implemented" error comes from the module checkpoint being cached
        # under the local /tmp/tfhub_modules directory, which the TPU worker
        # cannot read. The cache presumably has to live on storage reachable
        # from the TPU (e.g. a gs:// path via TFHUB_CACHE_DIR) -- confirm.
        module = hub.Module("https://tfhub.dev/google/progan-128/1")

        initial_vector = tf.random_normal([1, latent_dim], seed=5)

        vector = tf.get_variable("vector", initializer=initial_vector)
        image = module(vector)

        target_image_difference = tf.reduce_sum(
            tf.losses.absolute_difference(image[0], target_image[:,:,:3]))

        # The latent vectors were sampled from a normal distribution. We can get
        # more realistic images if we regularize the length of the latent vector to
        # the average length of vector from this distribution.
        regularizer = tf.abs(tf.norm(vector) - np.sqrt(latent_dim))

        loss = target_image_difference + regularizer

        # Create the training op BEFORE the initializer op: minimize() adds the
        # optimizer's own variables to the graph, and an initializer built
        # earlier would not cover them.
        optimizer = tf.train.AdamOptimizer(learning_rate=0.3)
        train = optimizer.minimize(loss)
        init = tf.global_variables_initializer()

        with tf.Session(tpu_address) as session_tpu:
          session_tpu.run(init)

          for _ in range(num_optimization_steps):
            _, loss_out, im_out = session_tpu.run([train, loss, image])
            # Collect the generated image so the caller actually receives it.
            images.append(im_out[0])
            losses.append(loss_out)
            print(loss_out)
        return images, losses

    def run_find_closest_latent_vector():
      """Run a 40-step latent-vector search and display the resulting images.

      Each image is captioned with its loss formatted as "Loss: %.2f".
      """
      outcome = find_closest_latent_vector(num_optimization_steps=40)
      images = outcome[0]
      captions = []
      for loss in outcome[1]:
        captions.append("Loss: %.2f" % loss)
      display_images(images, captions)

希望这将有助于调试问题。

有什么想法吗?

4

2 回答 2

8

我认为根本问题在于您使用了本地机器上的文件来存放输入、模型或检查点。然而,根据这份文档(this doc):

所有输入文件和模型目录必须使用云存储桶路径(gs://bucket-name/...),并且该桶必须可从 TPU 服务器访问。请注意,所有数据处理和模型检查点都是在 TPU 服务器上执行的,而不是本地机器上。

于 2018-12-17T20:41:23.590 回答
1

您可以使用笔记本单元格中的相关代码编辑问题吗?我知道您的笔记本是私人的,但也许您可以只共享一个单元格来帮助我们调试您的问题。

于 2018-12-17T20:17:22.390 回答