python - 为什么使用 ResNet50 架构的孪生网络（Siamese network）比从头训练的网络效果差？

Question

我正在尝试构建基于 ResNet50 架构的产品识别工具，如下所示

def get_siamese_model(input_shape):
    """Build a siamese similarity network on top of a frozen ResNet50 encoder.

    Both images are passed through the same encoder (shared weights): an
    ImageNet pre-trained ResNet50 with global average pooling, followed by a
    trainable 4096-unit sigmoid dense layer. The element-wise absolute
    difference of the two encodings is fed to a single sigmoid unit that
    produces the similarity score.

    Args:
        input_shape: Shape of one input image, without the batch dimension.

    Returns:
        An uncompiled Keras ``Model`` that takes ``[left_image, right_image]``
        and outputs one sigmoid score per pair.

    NOTE(review): the ImageNet weights presumably expect images run through
    the ResNet50 ``preprocess_input`` transform; feeding raw pixel values is a
    likely cause of poor results -- confirm against the data pipeline.
    """
    # Define the tensors for the two input images
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    # Pre-trained feature extractor. With include_top=False and pooling='avg'
    # the backbone already ends in a flat feature vector, so there is no
    # classification head to strip: the former `model.layers.pop()` call was
    # at best a no-op and at worst detached the backbone from the layer list
    # before the freeze loop ran.
    backbone = ResNet50(include_top=False, pooling='avg', weights='imagenet')

    # Freeze the pre-trained weights; only the dense layers below are trained.
    backbone.trainable = False

    # Shared encoder: frozen backbone + trainable embedding layer
    model = Sequential()
    model.add(backbone)
    model.add(Dense(4096,
                    activation='sigmoid',
                    kernel_regularizer=l2(1e-3),
                    kernel_initializer=initialize_weights,
                    bias_initializer=initialize_bias))

    # Generate the encodings (feature vectors) for the two images
    encoded_l = model(left_input)
    encoded_r = model(right_input)

    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid', bias_initializer=initialize_bias)(L1_distance)

    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    return siamese_net

生成训练批次的函数如下：

def gen_random_batch(in_groups, batch_halfsize=8):
    """Sample a balanced batch of matching and non-matching image pairs.

    The first ``batch_halfsize`` pairs take both images from the same group
    (label 1); the next ``batch_halfsize`` pairs take the second image from a
    different, randomly chosen group (label 0).

    Args:
        in_groups: Sequence of arrays, one per group; images are indexed
            along the first axis of each array.
        batch_halfsize: Number of pairs to draw for each of the two labels.

    Returns:
        Tuple ``(images_a, images_b, labels)`` of stacked arrays, each with
        ``2 * batch_halfsize`` entries along the first axis.
    """
    group_ids = list(range(len(in_groups)))

    def pick_images(indices):
        # One uniformly chosen image from each requested group, in order.
        return [
            in_groups[g][np.random.choice(range(in_groups[g].shape[0]))]
            for g in indices
        ]

    left_images, right_images, labels = [], [], []
    for is_match, label in ((True, 1), (False, 0)):
        anchors = np.random.choice(group_ids, size=batch_halfsize)
        left_images.extend(pick_images(anchors))

        if is_match:
            partners = anchors
        else:
            # For every anchor pick any group except the anchor's own.
            partners = [
                np.random.choice([g for g in group_ids if g != anchor])
                for anchor in anchors
            ]

        labels.extend([label] * batch_halfsize)
        right_images.extend(pick_images(partners))

    return (
        np.stack(left_images, 0),
        np.stack(right_images, 0),
        np.stack(labels, 0),
    )

def siam_gen(in_groups, batch_size = 32):
    """Yield an endless stream of training batches for the siamese model.

    Args:
        in_groups: Image groups to sample pairs from; passed straight to
            ``gen_random_batch``.
        batch_size: Total number of pairs per batch; half are matching and
            half are non-matching.

    Yields:
        ``([images_a, images_b], labels)`` tuples in the form expected by
        ``fit_generator``.
    """
    while True:
        # Sample from the groups given by the caller; the previous version
        # ignored `in_groups` and always read the global `train_groups`.
        pv_a, pv_b, pv_sim = gen_random_batch(in_groups, batch_size//2)
        yield [pv_a, pv_b], pv_sim

权重和偏置的初始化：

def initialize_weights(shape, dtype=None):
    """Kernel initializer: normal distribution with mean 0.0 and stddev 0.01."""
    weights = K.random_normal(shape, mean=0.0, stddev=0.01, dtype=dtype)
    return weights

def initialize_bias(shape, dtype=None):
    """Bias initializer: normal distribution with mean 0.5 and stddev 0.01."""
    biases = K.random_normal(shape, mean=0.5, stddev=0.01, dtype=dtype)
    return biases

模型编译和训练

# Build the siamese network for the per-image shape of x_train (every
# dimension after the first) and compile it as a binary classifier.
model = get_siamese_model(x_train.shape[1:])
optimizer = Adam(lr = 0.0001)
model.compile(loss="binary_crossentropy",optimizer=optimizer, metrics=['accuracy'])

# Fixed validation set drawn once from test_groups: 1024 matching pairs
# followed by 1024 non-matching pairs.
valid_a, valid_b, valid_sim = gen_random_batch(test_groups, 1024)

# Train on randomly generated batches: 100 batches per epoch for 5 epochs,
# evaluating on the fixed validation pairs above.
loss_history = model.fit_generator(siam_gen(train_groups), 
                               steps_per_epoch = 100,
                               validation_data=([valid_a, valid_b], valid_sim),
                               epochs = 5,
                               verbose = True)

我想知道为什么这个网络无法识别几乎相同的图片。我还按照这篇论文（http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf）中描述的架构构建了另一个网络，该网络在识别产品方面确实表现得很好：

def get_siamese_model(input_shape):
    """Build a siamese similarity network with a convolutional encoder trained from scratch.

    Both images go through the same encoder (shared weights): four
    convolutional layers with max pooling after the first three, then a
    4096-unit sigmoid dense layer. The element-wise absolute difference of
    the two encodings feeds a single sigmoid unit that produces the
    similarity score.

    Args:
        input_shape: Shape of one input image, without the batch dimension.

    Returns:
        An uncompiled Keras ``Model`` that takes ``[left_image, right_image]``
        and outputs one sigmoid score per pair.
    """
    # Placeholders for the two images of a pair
    image_a = Input(input_shape)
    image_b = Input(input_shape)

    # Shared convolutional encoder, declared as one ordered layer list
    encoder = Sequential([
        # The first conv layer fixes the input shape and keeps the default
        # bias initializer.
        Conv2D(64, (10,10),
               activation='relu',
               input_shape=input_shape,
               kernel_initializer=initialize_weights,
               kernel_regularizer=l2(2e-4)),
        MaxPooling2D(),
        Conv2D(128, (7,7),
               activation='relu',
               kernel_initializer=initialize_weights,
               bias_initializer=initialize_bias,
               kernel_regularizer=l2(2e-4)),
        MaxPooling2D(),
        Conv2D(128, (4,4),
               activation='relu',
               kernel_initializer=initialize_weights,
               bias_initializer=initialize_bias,
               kernel_regularizer=l2(2e-4)),
        MaxPooling2D(),
        Conv2D(256, (4,4),
               activation='relu',
               kernel_initializer=initialize_weights,
               bias_initializer=initialize_bias,
               kernel_regularizer=l2(2e-4)),
        Flatten(),
        Dense(4096,
              activation='sigmoid',
              kernel_regularizer=l2(1e-3),
              kernel_initializer=initialize_weights,
              bias_initializer=initialize_bias),
    ])

    # Feature vectors for both images, computed with the same weights
    features_a = encoder(image_a)
    features_b = encoder(image_b)

    # Element-wise absolute difference between the two feature vectors
    abs_difference = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))(
        [features_a, features_b]
    )

    # Single sigmoid unit turns the difference vector into a similarity score
    score = Dense(1,activation='sigmoid',bias_initializer=initialize_bias)(abs_difference)

    # Tie the two inputs to the score output
    return Model(inputs=[image_a,image_b],outputs=score)

我想知道为什么使用预训练层的网络比从一开始就训练的结果更差。

score 0 · Accepted Answer

尝试对输入图像进行归一化，例如 img/255。ResNet 的预训练权重很可能是以归一化后的图像作为输入训练得到的。

python - 为什么使用 ResNet50 架构的孪生网络（Siamese network）比从头训练的网络效果差？

1 回答 1

Related

Reference