0

我正在尝试用一个小型自定义数据集(总共约 4k 张图像,其中 2.5k 张用于训练)来训练这个网络(该 repo 基于官方 StyleGAN2 repo 实现,并且与其具有相同的依赖关系)。

运行 `python run_training.py --data-dir=<>d --result-dir=<> --dataset="train" --num-gpus=1 --total-kimg=10000 --mirror-augment=True` 时,出现以下错误:

Local submit - run_dir: /content/drive/MyDrive/co-mod-gan/results/00006-co-mod-gan-train_all-1gpu
dnnlib: Running training.training_loop.training_loop() on localhost...
Streaming data using training.dataset.TFRecordDataset...
tcmalloc: large alloc 4294967296 bytes == 0x562d81b88000 @  0x7f9c7abf2001 0x7f9c776d654f 0x7f9c77726b58 0x7f9c7772ab17 0x7f9c777c9203 0x562d79b9c424 0x562d79b9c120 0x562d79c10b80 0x562d79c0b66e 0x562d79b9e36c 0x562d79bdf7b9 0x562d79bdc6d4 0x562d79b9e571 0x562d79c0d633 0x562d79c0b02f 0x562d79adce2b 0x562d79c0d633 0x562d79c0b66e 0x562d79adce2b 0x562d79c0d633 0x562d79b9d9da 0x562d79c0beae 0x562d79b9d9da 0x562d79c0c108 0x562d79c0b02f 0x562d79adce2b 0x562d79c0d633 0x562d79c0b02f 0x562d79adce2b 0x562d79c0d633 0x562d79b9d9da
tcmalloc: large alloc 4294967296 bytes == 0x562e81b88000 @  0x7f9c7abf01e7 0x7f9c776d646e 0x7f9c77726c7b 0x7f9c7772735f 0x7f9c777c9103 0x562d79b9c424 0x562d79b9c120 0x562d79c10b80 0x562d79c0b02f 0x562d79b9daba 0x562d79c0ccd4 0x562d79c0b02f 0x562d79b9daba 0x562d79c0ccd4 0x562d79c0b02f 0x562d79b9daba 0x562d79c0ccd4 0x562d79b9d9da 0x562d79c0beae 0x562d79c0b02f 0x562d79b9daba 0x562d79c102c0 0x562d79c0b02f 0x562d79b9daba 0x562d79c0ccd4 0x562d79c0b66e 0x562d79b9e36c 0x562d79bdf7b9 0x562d79bdc6d4 0x562d79b9e571 0x562d79c0d633
tcmalloc: large alloc 4294967296 bytes == 0x562f834ea000 @  0x7f9c7abf01e7 0x7f9c776d646e 0x7f9c77726c7b 0x7f9c7772735f 0x7f9c22441235 0x7f9c21dc4792 0x7f9c21dc4d42 0x7f9c21d7daee 0x562d79b9c317 0x562d79b9c120 0x562d79c10679 0x562d79b9d9da 0x562d79c0c108 0x562d79c0b1c0 0x562d79adceb0 0x562d79c0d633 0x562d79c0b02f 0x562d79b9daba 0x562d79c0c108 0x562d79c0b66e 0x562d79b9daba 0x562d79c0c108 0x562d79b9d9da 0x562d79c0c108 0x562d79c0b02f 0x562d79b9e151 0x562d79b9e571 0x562d79c0d633 0x562d79c0b02f 0x562d79b9daba 0x562d79c0beae
Dataset shape = [3, 512, 512]
Dynamic range = [0, 255]
Label size    = 0
Traceback (most recent call last):
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)


 tensorflow.python.framework.errors_impl.OutOfRangeError: End of sequence
     [[{{node Dataset_1/IteratorGetNext}}]]



During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "run_training.py", line 133, in <module>
    main()
  File "run_training.py", line 128, in main
    run(**vars(args))
  File "run_training.py", line 71, in run
    dnnlib.submit_run(**kwargs)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/submit.py", line 343, in submit_run
    return farm.submit(submit_config, host_run_dir)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/internal/local.py", line 22, in submit
    return run_wrapper(submit_config)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/submit.py", line 280, in run_wrapper
    run_func_obj(**submit_config.run_func_kwargs)
  File "/content/drive/MyDrive/co-mod-gan/training/training_loop.py", line 142, in training_loop
    grid_size, grid_reals, grid_labels, grid_masks = misc.setup_snapshot_image_grid(training_set, **grid_args)
  File "/content/drive/MyDrive/co-mod-gan/training/misc.py", line 123, in setup_snapshot_image_grid
    reals[:], labels[:] = training_set.get_minibatch_val_np(gw * gh)
  File "/content/drive/MyDrive/co-mod-gan/training/dataset.py", line 189, in get_minibatch_val_np
    return tflib.run(self._tf_minibatch_val_np)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/tflib/tfutil.py", line 31, in run
    return tf.get_default_session().run(*args, **kwargs)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)


tensorflow.python.framework.errors_impl.OutOfRangeError: End of sequence
     [[node Dataset_1/IteratorGetNext (defined at /tensorflow-1.15.2/python3.7/tensorflow_core/python/framework/ops.py:1748) ]]


Original stack trace for 'Dataset_1/IteratorGetNext':
  File "run_training.py", line 133, in <module>
    main()
  File "run_training.py", line 128, in main
    run(**vars(args))
  File "run_training.py", line 71, in run
    dnnlib.submit_run(**kwargs)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/submit.py", line 343, in submit_run
    return farm.submit(submit_config, host_run_dir)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/internal/local.py", line 22, in submit
    return run_wrapper(submit_config)
  File "/content/drive/MyDrive/co-mod-gan/dnnlib/submission/submit.py", line 280, in run_wrapper
    run_func_obj(**submit_config.run_func_kwargs)
  File "/content/drive/MyDrive/co-mod-gan/training/training_loop.py", line 142, in training_loop
    grid_size, grid_reals, grid_labels, grid_masks = misc.setup_snapshot_image_grid(training_set, **grid_args)
  File "/content/drive/MyDrive/co-mod-gan/training/misc.py", line 123, in setup_snapshot_image_grid
    reals[:], labels[:] = training_set.get_minibatch_val_np(gw * gh)
  File "/content/drive/MyDrive/co-mod-gan/training/dataset.py", line 188, in get_minibatch_val_np
    self._tf_minibatch_val_np = self.get_minibatch_val_tf()
  File "/content/drive/MyDrive/co-mod-gan/training/dataset.py", line 174, in get_minibatch_val_tf
    return self._tf_val_iterator.get_next()
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/data/ops/iterator_ops.py", line 426, in get_next
    name=name)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/ops/gen_dataset_ops.py", line 2518, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

该网络是基于 TensorFlow 1.15 设计的,但 Google Colab 现在只提供 1.15.2 :(

你能帮我找出我做错了什么吗?

感谢您的任何帮助,祝新年快乐 :)

4

0 回答 0