您知道如何在 GPU 上运行“eval_image_classifier.py”吗?我应该更改任何功能或进行任何修改吗?或者是否存在任何其他特定功能用于在 GPU 上进行评估?
我已经可以在 GPU 上运行“train_image_classifier.py”,因为具有用于在 CPU 和 GPU 之间切换的相关标志:
tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
'Use CPUs to deploy clones.')
我确实尝试将同一行添加到eval_image_classifier.py
,但没有效果。我正在使用 Python 2.7.13 和 Tensorflow 1.3.0 。
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from deployment import model_deploy
from datasets import dataset_factory
from nets import nets_factory
from preprocessing import preprocessing_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_integer(
'batch_size', 32, 'The number of samples in each batch.')
tf.app.flags.DEFINE_integer(
'max_num_batches', None,
'Max number of batches to evaluate by default use all.')
tf.app.flags.DEFINE_string(
'master', '', 'The address of the TensorFlow master to use.')
tf.app.flags.DEFINE_string(
'checkpoint_path', '...',
'The directory where the model was written to or an absolute path to a '
'checkpoint file.')
tf.app.flags.DEFINE_string(
'eval_dir', '...',
'Directory where the results are saved to.')
tf.app.flags.DEFINE_integer('num_clones', 1,
'Number of model clones to deploy.')
tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
'Use CPUs to deploy clones.')
tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')
tf.app.flags.DEFINE_integer(
'num_readers', 4,
'The number of parallel readers that read data from the dataset.')
tf.app.flags.DEFINE_integer(
'num_ps_tasks', 0,
'The number of parameter servers. If the value is 0, then the parameters '
'are handled locally by the worker.')
tf.app.flags.DEFINE_integer(
'num_preprocessing_threads', 4,
'The number of threads used to create the batches.')
tf.app.flags.DEFINE_string(
'dataset_name', '...', 'The name of the dataset to load.')
tf.app.flags.DEFINE_string(
'dataset_split_name', 'validation', 'The name of the train/test split.')
tf.app.flags.DEFINE_string(
'dataset_dir', '...',
'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_integer(
'labels_offset', 0,
'An offset for the labels in the dataset. This flag is primarily used to '
'evaluate the VGG and ResNet architectures which do not use a background '
'class for the ImageNet dataset.')
tf.app.flags.DEFINE_string(
'model_name', 'densenet161', 'The name of the architecture to evaluate.')
tf.app.flags.DEFINE_string(
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
tf.app.flags.DEFINE_float(
'moving_average_decay', None,
'The decay to use for the moving average.'
'If left as None, then moving averages are not used.')
tf.app.flags.DEFINE_integer(
'eval_image_size', None, 'Eval image size')
FLAGS = tf.app.flags.FLAGS
def main(_):
if not FLAGS.dataset_dir:
raise ValueError('You must supply the dataset directory with --dataset_dir')
#######################
# Config model_deploy #
#######################
tf.logging.set_verbosity(tf.logging.INFO)
with tf.Graph().as_default():
deploy_config = model_deploy.DeploymentConfig(
num_clones=FLAGS.num_clones,
clone_on_cpu=FLAGS.clone_on_cpu,
#replica_id=FLAGS.task,
num_replicas=FLAGS.worker_replicas,
num_ps_tasks=FLAGS.num_ps_tasks)
# Create global_step
with tf.device(deploy_config.variables_device()):
tf_global_step = slim.create_global_step()
######################
# Select the dataset #
######################
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
####################
# Select the model #
####################
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=False)
##############################################################
# Create a dataset provider that loads data from the dataset #
##############################################################
with tf.device(deploy_config.inputs_device()):
provider = slim.dataset_data_provider.DatasetDataProvider(
dataset,
num_readers=FLAGS.num_readers,
shuffle=False,
common_queue_capacity=2 * FLAGS.batch_size,
common_queue_min=FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
label -= FLAGS.labels_offset
#####################################
# Select the preprocessing function #
#####################################
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
preprocessing_name,
is_training=False)
eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, eval_image_size, eval_image_size)
images, labels = tf.train.batch(
[image, label],
batch_size=FLAGS.batch_size,
num_threads=FLAGS.num_preprocessing_threads,
capacity=5 * FLAGS.batch_size)
batch_queue = slim.prefetch_queue.prefetch_queue(
[images, labels], capacity=2 * deploy_config.num_clones)
####################
# Define the model #
####################
def clone_fn(batch_queue):
"""Allows data parallelism by creating multiple clones of network_fn."""
with tf.device(deploy_config.inputs_device()):
images, labels = batch_queue.dequeue()
logits, end_points = network_fn(images)
logits = tf.squeeze(logits)
#############################
# Specify the loss function #
#############################
if 'AuxLogits' in end_points:
tf.losses.mean_squared_error(
predictions=end_points['AuxLogits'], labels=labels, weights=0.4, scope='aux_loss')
tf.losses.mean_squared_error(
predictions=logits, labels=labels, weights=1.0)
return end_points
#clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
#first_clone_scope = deploy_config.clone_scope(0)
####################
# Define the model #
####################
logits, _ = network_fn(images)
if FLAGS.moving_average_decay:
variable_averages = tf.train.ExponentialMovingAverage(
FLAGS.moving_average_decay, tf_global_step)
variables_to_restore = variable_averages.variables_to_restore(
slim.get_model_variables())
variables_to_restore[tf_global_step.op.name] = tf_global_step
else:
variables_to_restore = slim.get_variables_to_restore()
logits = tf.squeeze(logits)
# Define the metrics:
predictions = logits
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
'Accuracy': tf.metrics.root_mean_squared_error(predictions, labels),
'Recall_5': slim.metrics.streaming_recall(
logits, labels),
})
# Print the summaries to screen.
print_ops = []
summary_ops = []
for name, value in names_to_values.items():
summary_name = 'eval/%s' % name
op = tf.summary.scalar(summary_name, value, collections=[])
op = tf.Print(op, [value], summary_name)
summary_ops.append(op)
print_ops.append(tf.Print(value, [value], summary_name))
tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
# TODO(sguada) use num_epochs=1
if FLAGS.max_num_batches:
num_batches = FLAGS.max_num_batches
else:
# This ensures that we make a single pass over all of the data.
num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))
if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
if tf.train.latest_checkpoint(FLAGS.checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
else:
checkpoint_path = FLAGS.checkpoint_path
eval_interval_secs = 6
tf.logging.info('Evaluating %s' % checkpoint_path)
slim.evaluation.evaluation_loop(
master=FLAGS.master,
checkpoint_dir=checkpoint_path,
logdir=FLAGS.eval_dir,
num_evals=num_batches,
eval_op=list(names_to_updates.values()) + print_ops,
variables_to_restore=variables_to_restore,
eval_interval_secs = eval_interval_secs )
if __name__ == '__main__':
tf.app.run()
我也尝试使用一些代码,例如 Tensorflow 教程:
# Creates a graph.
with tf.device('/gpu:2'):
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with allow_soft_placement and log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=True))
# Runs the op.
print(sess.run(c))
我以这种方式修改了代码:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from datasets import dataset_factory
from nets import nets_factory
from preprocessing import preprocessing_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_integer(
'batch_size', 32, 'The number of samples in each batch.')
tf.app.flags.DEFINE_integer(
'max_num_batches', None,
'Max number of batches to evaluate by default use all.')
tf.app.flags.DEFINE_string(
'master', '', 'The address of the TensorFlow master to use.')
tf.app.flags.DEFINE_string(
'checkpoint_path', '...',
'The directory where the model was written to or an absolute path to a '
'checkpoint file.')
tf.app.flags.DEFINE_string(
'eval_dir', '...',
'Directory where the results are saved to.')
tf.app.flags.DEFINE_integer(
'num_preprocessing_threads', 4,
'The number of threads used to create the batches.')
tf.app.flags.DEFINE_string(
'dataset_name', '...', 'The name of the dataset to load.')
tf.app.flags.DEFINE_string(
'dataset_split_name', 'validation', 'The name of the train/test split.')
tf.app.flags.DEFINE_string(
'dataset_dir', '...',
'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_integer(
'labels_offset', 0,
'An offset for the labels in the dataset. This flag is primarily used to '
'evaluate the VGG and ResNet architectures which do not use a background '
'class for the ImageNet dataset.')
tf.app.flags.DEFINE_string(
'model_name', 'densenet161', 'The name of the architecture to evaluate.')
tf.app.flags.DEFINE_string(
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
tf.app.flags.DEFINE_float(
'moving_average_decay', None,
'The decay to use for the moving average.'
'If left as None, then moving averages are not used.')
tf.app.flags.DEFINE_integer(
'eval_image_size', None, 'Eval image size')
FLAGS = tf.app.flags.FLAGS
# Initialize all global and local variables
init = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
def main(_):
if not FLAGS.dataset_dir:
raise ValueError('You must supply the dataset directory with --dataset_dir')
tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
log_device_placement=True))
with tf.Graph().as_default(), tf.device('/gpu:0'):
sess.run(init)
tf_global_step = slim.get_or_create_global_step()
######################
# Select the dataset #
######################
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
####################
# Select the model #
####################
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=False)
##############################################################
# Create a dataset provider that loads data from the dataset #
##############################################################
provider = slim.dataset_data_provider.DatasetDataProvider(
dataset,
shuffle=False,
common_queue_capacity=2 * FLAGS.batch_size,
common_queue_min=FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
label -= FLAGS.labels_offset
#####################################
# Select the preprocessing function #
#####################################
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
preprocessing_name,
is_training=False)
eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, eval_image_size, eval_image_size)
images, labels = tf.train.batch(
[image, label],
batch_size=FLAGS.batch_size,
num_threads=FLAGS.num_preprocessing_threads,
capacity=5 * FLAGS.batch_size)
####################
# Define the model #
####################
logits, _ = network_fn(images)
if FLAGS.moving_average_decay:
variable_averages = tf.train.ExponentialMovingAverage(
FLAGS.moving_average_decay, tf_global_step)
variables_to_restore = variable_averages.variables_to_restore(
slim.get_model_variables())
variables_to_restore[tf_global_step.op.name] = tf_global_step
else:
variables_to_restore = slim.get_variables_to_restore()
logits = tf.squeeze(logits)
# Define the metrics:
predictions = logits
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
'Accuracy': tf.metrics.root_mean_squared_error(predictions, labels),
'Recall_5': slim.metrics.streaming_recall(
logits, labels),
})
# Print the summaries to screen.
print_ops = []
summary_ops = []
for name, value in names_to_values.items():
summary_name = 'eval/%s' % name
op = tf.summary.scalar(summary_name, value, collections=[])
op = tf.Print(op, [value], summary_name)
summary_ops.append(op)
print_ops.append(tf.Print(value, [value], summary_name))
tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
# TODO(sguada) use num_epochs=1
if FLAGS.max_num_batches:
num_batches = FLAGS.max_num_batches
else:
# This ensures that we make a single pass over all of the data.
num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))
if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
if tf.train.latest_checkpoint(FLAGS.checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
else:
checkpoint_path = FLAGS.checkpoint_path
#print(checkpoint_path)
eval_interval_secs = 6
tf.logging.info('Evaluating %s' % checkpoint_path)
slim.evaluation.evaluation_loop(
master=FLAGS.master,
checkpoint_dir=checkpoint_path,
logdir=FLAGS.eval_dir,
num_evals=num_batches,
eval_op=list(names_to_updates.values()) + print_ops,
variables_to_restore=variables_to_restore,
eval_interval_secs = eval_interval_secs )
if __name__ == '__main__':
tf.app.run()
当我运行这段代码时,我遇到了这个错误:
Traceback (most recent call last):
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 210, in <module>
tf.app.run()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 206, in main
eval_interval_secs = 60 )
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/slim/python/slim/evaluation.py", line 296, in evaluation_loo
p
timeout=timeout)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 447, in evalua
te_repeatedly
session_creator=session_creator, hooks=hooks) as session:
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 668, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 490, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 842, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 847, in _create_session
return self._sess_creator.create_session()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 551, in create_session
self.tf_sess = self._session_creator.create_session()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 425, in create_session
init_fn=self._scaffold.init_fn)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 273, in prepare_session
config=config)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 189, in _restore_checkpoin
t
saver.restore(sess, checkpoint_filename_with_path)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1560, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
options, run_metadata)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation 'eval_step': Could not satisfy explicit device s
pecification '/device:GPU:0' because no supported kernel for GPU devices is available.
Colocation Debug Info:
Colocation group had the following types and devices:
Const: GPU CPU
AssignAdd: CPU
VariableV2: CPU
Identity: GPU CPU
Assign: CPU
IsVariableInitialized: CPU
[[Node: eval_step = VariableV2[_class=["loc:@eval_step"], container="", dtype=DT_INT64, shape=[], shared_name="", _device="/device:GPU:0"]
()]]
Caused by op u'eval_step', defined at:
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 210, in <module>
tf.app.run()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 206, in main
eval_interval_secs = 60 )
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/slim/python/slim/evaluation.py", line 296, in evaluation_loo
p
timeout=timeout)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 410, in evalua
te_repeatedly
eval_step = get_or_create_eval_step()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/evaluation.py", line 57, in _get_or_create_eval_step
collections=[ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.EVAL_STEP])
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
validate_shape=validate_shape, use_resource=use_resource)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
use_resource=use_resource)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 725, in _get_single_variable
validate_shape=validate_shape)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 199, in __init__
expected_shape=expected_shape)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 283, in _init_from_args
name=name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/state_ops.py", line 131, in variable_op_v2
shared_name=shared_name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/gen_state_ops.py", line 682, in _variable_v2
name=name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'eval_step': Could not satisfy explicit device specification '
/device:GPU:0' because no supported kernel for GPU devices is available.
Colocation Debug Info:
Colocation group had the following types and devices:
Const: GPU CPU
AssignAdd: CPU
VariableV2: CPU
Identity: GPU CPU
Assign: CPU
IsVariableInitialized: CPU
[[Node: eval_step = VariableV2[_class=["loc:@eval_step"], container="", dtype=DT_INT64, shape=[], shared_name="", _device="/device:GPU:0"]
()]]
ERROR:tensorflow:==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'report_uninitialized_variables_1/boolean_mask/Gather:0' shape=(?,) dtype=string>
If you want to mark it as used call its "mark_used()" method.