deep-learning - 从头开始开发基于 caffe 的定制卷积神经网络

Question

我喜欢从头开始开发一个新的卷积神经网络，在这里找到了一个关于如何将预训练模型用于新网络的讨论。有四个类别，如果训练图像尺寸小，数据相似度低，我们需要冻结最初的 K 层并训练后面的 NK 层。该讨论以 VGG16 为例。假设新网络基于 VGG16，我们可以使用 VGG16 预训练模型。但我喜欢有如下新的网络结构。网络是一个只有 6 层的小型网络。我只有 1200 个训练图像。

我如何开始使用这个新网络？

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 6
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "conv1"
  top: "conv2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 6
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 3
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 16
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 32
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "pool4"
  type: "Pooling"
  bottom: "conv4"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv6"
  type: "Convolution"
  bottom: "pool5"
  top: "conv6"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "conv6"
  top: "conv6"
}
layer {
  name: "pool6"
  type: "Pooling"
  bottom: "conv6"
  top: "pool6"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}

score 0 · Accepted Answer

我只是从那里复制那些面临类似问题的回复。

You do not need a pre-trained model. Actually, when you are developing your own, custom architecture, you cannot even get a pre-trained model.

1200 images is not much, but without any other information, one cannot tell if it is enough or not. Maybe your problem is simple, your network small and the dataset will be enough. Maybe your problem is tough and even 109 images would not be enough. There is no simple answer to that.

Also, there is no simple answer to which of the two options will be better. I would start training from scratch, using augmented dataset. Ideally, generate the augmented versions on the fly rather than creating a fixed amount (e.g. 18) from each image. If you observe poor performance, pre-train on VOC and then fine-tune on your own data (but still use augmentation). This is the only way to compare which is better.

As final remarks, I would recommend this article summarizing some good tips for neural network development and debugging; and this thread focused on learning problems.

deep-learning - 从头开始开发基于 caffe 的定制卷积神经网络

1 回答 1

Related

Reference