
I have modified the existing cifar10 example to use it as a siamese network, but I am having some difficulty training it.

Changes made:

  • Placeholders instead of queues
  • A custom loss function (see the formula below)

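For reference, the custom loss is the pairwise contrastive loss. Writing E for the squared Euclidean distance between the two embedding vectors and y for the pair label (y = 1 marking a dissimilar pair, which is the convention my code assumes), the loss over a batch of N pairs is

    L = (1 / 2N) * sum_i [ (1 - y_i) * E_i + y_i * max(1 - E_i, 0) ]
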
Here is my modified cifar10_train.py:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os.path
import time
import input_data
import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import cifar10

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('train_dir', 'tmp/cifar10_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")


def train():
  """Train CIFAR-10 for a number of steps."""
  dataset = input_data.read()
  image, image_p, label = dataset.train_dataset
  image_size  = dataset.image_size
  batch_size = 28
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
    images2 = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
    labels = tf.placeholder(tf.float32, shape=(batch_size))
    tf.image_summary('images', images)
    tf.image_summary('images2', images2)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    with tf.variable_scope('inference') as scope:
      logits = cifar10.inference(images)
      scope.reuse_variables()
      logits2 = cifar10.inference(images2)

    # Calculate loss.
    loss = cifar10.loss(logits, logits2, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
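    # Note: with placeholders replacing the input queues, there are no queue
    # runners left in the graph, so the call above is effectively a no-op.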

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      offset = (step * batch_size) % (dataset.train_samples - batch_size)
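      # Step sequentially through the preloaded numpy arrays, wrapping
      # around once the end of the training set is reached.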
      _, loss_value = sess.run([train_op, loss], feed_dict={images: image[offset:(offset + batch_size)], images2: image_p[offset:(offset + batch_size)], labels: 1.0*label[offset:(offset + batch_size)]})
      duration = time.time() - start_time

      print(loss_value)
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)


def main(argv=None):  # pylint: disable=unused-argument
  train()


if __name__ == '__main__':
  tf.app.run()

The modified cifar10.py:

"""Builds the CIFAR-10 network.

Summary of available functions:

 # Compute input images and labels for training. If you would like to run
 # evaluations, use inputs() instead.
 inputs, labels = distorted_inputs()

 # Compute inference on the model inputs to make a prediction.
 predictions = inference(inputs)

 # Compute the total loss of the prediction with respect to the labels.
 loss = loss(predictions, labels)

 # Create a graph to run one step of training with respect to the loss.
 train_op = train(loss, global_step)
"""
# pylint: disable=missing-docstring
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import re
import sys
import tarfile

from six.moves import urllib
import tensorflow as tf

import input_data

FLAGS = tf.app.flags.FLAGS

# Basic model parameters.
tf.app.flags.DEFINE_integer('batch_size', 28,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir_p', '/tmp/cifar10_data',
                           """Path to the CIFAR-10 data directory.""")

# Global constants describing the CIFAR-10 data set.
# IMAGE_SIZE = cifar10_input.IMAGE_SIZE
# NUM_CLASSES = cifar10_input.NUM_CLASSES
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = input_data.train_samples
# NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL


# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.001       # Initial learning rate.
Q = 360.6244
# If a model is trained with multiple GPU's prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'

DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'


def _activation_summary(x):
  # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
  # session. This helps the clarity of presentation on tensorboard.
  tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
  tf.histogram_summary(tensor_name + '/activations', x)
  tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))


def _variable_on_cpu(name, shape, initializer):

  with tf.device('/cpu:0'):
    var = tf.get_variable(name, shape, initializer=initializer)
  return var


def _variable_with_weight_decay(name, shape, stddev, wd):

  var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
  if wd:
    weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', weight_decay)
  return var

def inference(data):

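  # Rough shape walk-through, assuming the 112x92x1 inputs reported in the
  # error log below: conv1 (5x5, VALID) -> 108x88x20, pool1 -> 54x44x20,
  # conv2 (5x5, VALID) -> 50x40x50, pool2 -> 25x20x50, so dim = 25000.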
  # We instantiate all variables using tf.get_variable() instead of
  # tf.Variable() in order to share variables across multiple GPU training runs.
  # If we only ran this model on a single GPU, we could simplify this function
  # by replacing all instances of tf.get_variable() with tf.Variable().
  #
  # conv1
  with tf.variable_scope('conv1') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 1, 20],
                                         stddev=0.1, wd=0.0)

    conv = tf.nn.conv2d(data, kernel, [1, 1, 1, 1], padding='VALID')
    biases = _variable_on_cpu('biases', [20], tf.constant_initializer(0.0))
    conv1 = tf.nn.bias_add(conv, biases)
    _activation_summary(conv1)

  # pool1
  pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                         padding='VALID', name='pool1')

  # conv2
  with tf.variable_scope('conv2') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 20, 50],
                                         stddev=0.1, wd=0.0)
    conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='VALID')
    biases = _variable_on_cpu('biases', [50], tf.constant_initializer(0.0))
    conv2 = tf.nn.bias_add(conv, biases)
    _activation_summary(conv2)

  # pool2
  pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='VALID', name='pool2')

  # local3
  with tf.variable_scope('local3') as scope:
    # Move everything into depth so we can perform a single matrix multiply.
    dim = 1
    for d in pool2.get_shape()[1:].as_list():
      dim *= d
    reshape = tf.reshape(pool2, [pool2.get_shape().as_list()[0], dim])

    weights = _variable_with_weight_decay('weights', shape=[dim, 500],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [500], tf.constant_initializer(0.10))
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    _activation_summary(local3)

  # local4
  with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[500, 10],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.0))
    local4 = tf.add(tf.matmul(local3, weights), biases, name=scope.name)
    _activation_summary(local4)

  #local5
  with tf.variable_scope('local5') as scope:
    weights = _variable_with_weight_decay('weights', [10, 10],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [10],
                              tf.constant_initializer(0.0))
    local5 = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
    _activation_summary(local5)

  return local5


def loss(features1, features2, labels):

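  # energy_square is the squared Euclidean distance between the two
  # embeddings. With labels y in {0, 1}, (y - 1)^2 reduces to (1 - y), so
  # similar pairs (y = 0) are pulled together and dissimilar pairs (y = 1)
  # are pushed apart up to a margin of 1.0.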
  energy_square = tf.reduce_sum(tf.pow(tf.sub(features1, features2), 2), 1)
  loss = tf.add(tf.mul(tf.pow(tf.sub(labels, 1), 2), energy_square),
                tf.mul(labels, tf.maximum(tf.sub(1.0, energy_square), 0)))
  loss = tf.reduce_sum(loss) / features1.get_shape().as_list()[0] / 2
  # Calculate the average cross entropy loss across the batch.
  # labels = tf.cast(labels, tf.int64)
  # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      # logits, labels, name='cross_entropy_per_example')
  # cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
  tf.add_to_collection('losses', loss)

  # The total loss is defined as the contrastive loss plus all of the weight
  # decay terms (L2 loss).
  return tf.add_n(tf.get_collection('losses'), name='total_loss')


def _add_loss_summaries(total_loss):

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  losses = tf.get_collection('losses')
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(l.op.name +' (raw)', l)
    tf.scalar_summary(l.op.name, loss_averages.average(l))

  return loss_averages_op


def train(total_loss, global_step):

  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                  global_step,
                                  decay_steps,
                                  LEARNING_RATE_DECAY_FACTOR,
                                  staircase=True)
  tf.scalar_summary('learning_rate', lr)

  # Generate moving averages of all losses and associated summaries.
  loss_averages_op = _add_loss_summaries(total_loss)

  # Compute gradients.
  with tf.control_dependencies([loss_averages_op]):
    opt = tf.train.GradientDescentOptimizer(lr)
    grads = opt.compute_gradients(total_loss)


  # Apply gradients.
  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

  # Add histograms for trainable variables.
  for var in tf.trainable_variables():
    tf.histogram_summary(var.op.name, var)

  # Add histograms for gradients.
  for grad, var in grads:
    if grad is not None:
      tf.histogram_summary(var.op.name + '/gradients', grad)

  # Track the moving averages of all trainable variables.
  variable_averages = tf.train.ExponentialMovingAverage(
      MOVING_AVERAGE_DECAY, global_step)
  variables_averages_op = variable_averages.apply(tf.trainable_variables())

  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    train_op = tf.no_op(name='train')

  return train_op

The error I get:

2016-03-01 15:56:59.483682: step 0, loss = 0.22 (9.7 examples/sec; 2.896 sec/batch)
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
     [[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary
     [[Node: HistogramSummary = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary/tag, inference/conv1/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_1
     [[Node: HistogramSummary_1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_1/tag, inference/conv1/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder_2' with dtype float and shape [28]
     [[Node: Placeholder_2 = Placeholder[dtype=DT_FLOAT, shape=[28], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_3
     [[Node: HistogramSummary_3 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_3/tag, inference/conv2/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_2
     [[Node: HistogramSummary_2 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_2/tag, inference/conv2/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_4
     [[Node: HistogramSummary_4 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_4/tag, inference/local3/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_5
     [[Node: HistogramSummary_5 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_5/tag, inference/local3/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_6
     [[Node: HistogramSummary_6 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_6/tag, inference/local4/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_7
     [[Node: HistogramSummary_7 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_7/tag, inference/local4/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_8
     [[Node: HistogramSummary_8 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_8/tag, inference/local5/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_9
     [[Node: HistogramSummary_9 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_9/tag, inference/local5/biases/read)]]
Traceback (most recent call last):
  File "cifar10_train.py", line 110, in <module>
    tf.app.run()
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "cifar10_train.py", line 106, in main
    train()
  File "cifar10_train.py", line 95, in train
    summary_str = sess.run(summary_op)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 315, in run
    return self._run(None, fetches, feed_dict)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 511, in _run
    feed_dict_string)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 564, in _do_run
    target_list)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 586, in _do_call
    e.code)
tensorflow.python.framework.errors.InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
     [[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op u'Placeholder', defined at:
  File "cifar10_train.py", line 110, in <module>
    tf.app.run()
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "cifar10_train.py", line 106, in main
    train()
  File "cifar10_train.py", line 36, in train
    images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 742, in placeholder
    name=name)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 583, in _placeholder
    name=name)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
    op_def=op_def)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2040, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1087, in __init__
    self._traceback = _extract_stack()

Also, when I comment out merge_all_summaries(), the model diverges with loss = NaN.


1 Answer


The problem here is that some of the summaries in your graph (collected by tf.merge_all_summaries()) depend on your placeholders. For example, the code in cifar10.py creates summaries for the various activations at each step, and these depend on the training example used.

The solution is to feed the same training batch when you evaluate summary_op:

if step % 100 == 0:
  summary_str = sess.run(summary_op, feed_dict={
      images: image[offset:(offset + batch_size)],
      images2: image_p[offset:(offset + batch_size)],
      labels: 1.0 * label[offset:(offset + batch_size)]})

While this is the smallest modification to your original code, it is slightly inefficient, because it re-runs the forward pass every 100 steps. The best way to address this (although it requires some restructuring of your training loop) is to fetch the summaries in the same call to sess.run() that performs a training step:

if step % 100 == 0:
  _, loss_value, summary_str = sess.run([train_op, loss, summary_op], feed_dict={
      images: image[offset:(offset + batch_size)],
      images2: image_p[offset:(offset + batch_size)],
      labels: 1.0 * label[offset:(offset + batch_size)]})
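
For completeness, a minimal sketch of how the restructured loop might look, reusing the feed_dict construction from the question (illustrative only; the variable names match the code above):

for step in xrange(FLAGS.max_steps):
  offset = (step * batch_size) % (dataset.train_samples - batch_size)
  feed = {images: image[offset:(offset + batch_size)],
          images2: image_p[offset:(offset + batch_size)],
          labels: 1.0 * label[offset:(offset + batch_size)]}
  if step % 100 == 0:
    # Fetch the summaries in the same run as the training step, so the
    # forward pass is computed only once.
    _, loss_value, summary_str = sess.run([train_op, loss, summary_op],
                                          feed_dict=feed)
    summary_writer.add_summary(summary_str, step)
  else:
    _, loss_value = sess.run([train_op, loss], feed_dict=feed)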