2

我正在 TensorFlow 中实现 SegNet,我用它来将航拍图像分割成两个类:“Building”和“Not building”。我有一个小版本的网络,它的准确率高达 82% mIoU。

但是,我想通过添加多个卷积层来扩展网络,就像原来的 SegNet 一样,但我无法让它工作。

这就是我实现工作的小模型的方式:

def inference_basic(images, phase_train, batch_size, keep_prob):

  conv1 = conv_layer_with_bn(norm1, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1")
  pool1, pool1_indices = tf.nn.max_pool_with_argmax(conv1, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='SAME', name='pool1')

  conv2 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2")
  pool2, pool2_indices = tf.nn.max_pool_with_argmax(conv2, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')

  conv3 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3")
  pool3, pool3_indices = tf.nn.max_pool_with_argmax(conv3, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='SAME', name='pool3')

  conv4 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4")
  pool4, pool4_indices = tf.nn.max_pool_with_argmax(conv4, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='SAME', name='pool4')

  """ End of encoder """

  """ start decoder """

  upsample4 = deconv_layer(pool4, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4")
  conv_decode4 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, False, name="conv_decode4")

  upsample3= deconv_layer(conv_decode4, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3")
  conv_decode3 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, False, name="conv_decode3")

  upsample2= deconv_layer(conv_decode3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2")
  conv_decode2 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, False, name="conv_decode2")

  upsample1= deconv_layer(conv_decode2, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1")
  conv_decode1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, False, name="conv_decode1")
  """ end of decoder """

  """ Start Classify """
  with tf.variable_scope('conv_classifier') as scope:
    kernel = _variable_with_weight_decay('weights',
                                         shape=[1, 1, 64, FLAGS.num_class],
                                         initializer=msra_initializer(1, 64),
                                         wd=0.0005)
    conv = tf.nn.conv2d(conv_decode1, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
    conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name)
  return conv_classifier

这是扩展模型,结果非常糟糕:

def inference(images, phase_train, batch_size):
  conv1_1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1_1")
  conv1_2 = conv_layer_with_bn(conv1_1, [7, 7, 64, 64], phase_train, name="conv1_2")
  pool1, pool1_indices = tf.nn.max_pool_with_argmax(conv1_2, ksize=[1, 2, 2, 1],strides=[1, 2, 2, 1], padding='SAME', name='pool1')

  conv2_1 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2_1")
  conv2_2 = conv_layer_with_bn(conv2_1, [7, 7, 64, 64], phase_train, name="conv2_2")
  pool2, pool2_indices = tf.nn.max_pool_with_argmax(conv2_2, ksize=[1, 2, 2, 1],
                                                strides=[1, 2, 2, 1], padding='SAME', name='pool2')

  conv3_1 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3_1")
  conv3_2 = conv_layer_with_bn(conv3_1, [7, 7, 64, 64], phase_train, name="conv3_2")
  conv3_3 = conv_layer_with_bn(conv3_2, [7, 7, 64, 64], phase_train, name="conv3_3")
  pool3, pool3_indices = tf.nn.max_pool_with_argmax(conv3_3, ksize=[1, 2, 2, 1],
                                                strides=[1, 2, 2, 1], padding='SAME', name='pool3')

  conv4_1 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4_1")
  conv4_2 = conv_layer_with_bn(conv4_1, [7, 7, 64, 64], phase_train, name="conv4_2")
  conv4_3 = conv_layer_with_bn(conv4_2, [7, 7, 64, 64], phase_train, name="conv4_3")
  pool4, pool4_indices = tf.nn.max_pool_with_argmax(conv4_3, ksize=[1, 2, 2, 1],
                                                strides=[1, 2, 2, 1], padding='SAME', name='pool4')

  conv5_1 = conv_layer_with_bn(pool4, [7, 7, 64, 64], phase_train, name="conv5_1")
  conv5_2 = conv_layer_with_bn(conv5_1, [7, 7, 64, 64], phase_train, name="conv5_2")
  conv5_3 = conv_layer_with_bn(conv5_2, [7, 7, 64, 64], phase_train, name="conv5_3")
  pool5, pool5_indices = tf.nn.max_pool_with_argmax(conv5_3, ksize=[1, 2, 2, 1],
                                                strides=[1, 2, 2, 1], padding='SAME', name='pool5')
  """ End of encoder """

  """ Start decoder """
  upsample5 = deconv_layer(pool5, [2, 2, 64, 64], [batch_size, FLAGS.image_h//16, FLAGS.image_w//16, 64], 2, "up5")
  conv_decode5_1 = conv_layer_with_bn(upsample5, [7, 7, 64, 64], phase_train, True, name="conv_decode5_1")
  conv_decode5_2 = conv_layer_with_bn(conv_decode5_1, [7, 7, 64, 64], phase_train, True, name="conv_decode5_2")
  conv_decode5_3 = conv_layer_with_bn(conv_decode5_2, [7, 7, 64, 64], phase_train, True, name="conv_decode5_3")

  upsample4 = deconv_layer(conv_decode5_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4")
  conv_decode4_1 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, True, name="conv_decode4_1")
  conv_decode4_2 = conv_layer_with_bn(conv_decode4_1, [7, 7, 64, 64], phase_train, True, name="conv_decode4_2")
  conv_decode4_3 = conv_layer_with_bn(conv_decode4_2, [7, 7, 64, 64], phase_train, True, name="conv_decode4_3")

  upsample3 = deconv_layer(conv_decode4_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3")
  conv_decode3_1 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, True, name="conv_decode3_1")
  conv_decode3_2 = conv_layer_with_bn(conv_decode3_1, [7, 7, 64, 64], phase_train, True, name="conv_decode3_2")
  conv_decode3_3 = conv_layer_with_bn(conv_decode3_2, [7, 7, 64, 64], phase_train, True, name="conv_decode3_3")

  upsample2= deconv_layer(conv_decode3_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2")
  conv_decode2_1 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, True, name="conv_decode2_1")
  conv_decode2_2 = conv_layer_with_bn(conv_decode2_1, [7, 7, 64, 64], phase_train, True, name="conv_decode2_2")

  upsample1 = deconv_layer(conv_decode2_2, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1")
  conv_decode1_1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_1")
  conv_decode1_2 = conv_layer_with_bn(conv_decode1_1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_2")
  """ End of decoder """

  """ Start Classify """
  # output predicted class number
  with tf.variable_scope('conv_classifier') as scope: #all variables prefixed with "conv_classifier/"
    kernel = _variable_with_weight_decay('weights',
                                     shape=[1, 1, 64, FLAGS.num_class],
                                     initializer=msra_initializer(1, 64),
                                     wd=0.0005)
    conv = tf.nn.conv2d(conv_decode1_2, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
    conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name) 
    #logit = conv_classifier = prediction
  return conv_classifier

卷积层:

def conv_layer_with_bn(inputT, shape, train_phase, activation=True, name=None):

  in_channel = shape[2]
  out_channel = shape[3]
  k_size = shape[0]

  with tf.variable_scope(name) as scope:
      kernel = _variable_with_weight_decay('weights',
                                     shape=shape,

        initializer=msra_initializer(k_size, in_channel), 
                                     wd=None)
      conv = tf.nn.conv2d(inputT, kernel, [1, 1, 1, 1], padding='SAME')
      biases = _variable_on_cpu('biases', [out_channel], tf.constant_initializer(0.0))
      bias = tf.nn.bias_add(conv, biases)

      if activation is True:
         conv_out = tf.nn.relu(batch_norm_layer(bias, train_phase, scope.name))
      else:
         conv_out = batch_norm_layer(bias, train_phase, scope.name)

 return conv_out

def batch_norm_layer(inputT, is_training, scope):
      """Used in conv_layer_with_bn()"""
  return tf.cond(is_training,
          lambda: tf.contrib.layers.batch_norm(inputT, is_training=True,
                           center=False, updates_collections=None, scope=scope+"_bn"),
          lambda: tf.contrib.layers.batch_norm(inputT, is_training=False,
                           updates_collections=None, center=False, scope=scope+"_bn", reuse = True))  

扩展模型获得大约 10% 的 mIoU,因为图像中的所有像素都被归类为“未构建”类。谁能帮我理解为什么会这样?看过SegNet的caffe实现,看不出两种实现的区别。

4

1 回答 1

0

经过一番测试,我想我可能知道了更深层模型性能不佳的原因。这似乎是权重初始化的一个问题,我想这在更深层次的模型中更为关键。我已经更新了我的模型以使用Delving Deep into Rectifiers论文中提出的权重初始化器,以及随机梯度下降和 0.1 的学习率。这似乎可以解决问题!

我的想法正确吗?当使用更深的模型时,权重初始化是否变得如此重要?

于 2017-05-15T14:17:27.287 回答