I've just started learning about pretrained CNN architectures for image captioning, and I'm trying to run some code. The code works fine when I use resnet_101, but not when I switch to mobilenet_v2 with a 224x224 input, whose feature extractor outputs a 7x7x1280 map. On top of that, I'm trying to extend it with Bahdanau attention, and I'm running into some errors.
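
As a quick sanity check (just a sketch using the features submodule of torchvision's mobilenet_v2, which downsamples by an overall factor of 32), a 224x224 input should indeed give a 7x7x1280 feature map:

import torch
import torchvision

# Sketch: inspect the raw feature map produced by the MobileNetV2 backbone.
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print(feats.shape)  # expected: torch.Size([1, 1280, 7, 7])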

Here is the code for the MobileNetV2 encoder:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision


class MobileNetV2Encoder(nn.Module):

    def __init__(self, hidden_size=1280):
        super(MobileNetV2Encoder, self).__init__()
        mobilenet = torchvision.models.mobilenet_v2(pretrained=True)
        # Drop the classifier head; keep only the convolutional feature extractor.
        modules = list(mobilenet.children())[:-1]
        self.hidden_size = hidden_size
        self.mobilenet = nn.Sequential(*modules)

    def fine_tuning_mobilenet(self, fine_tune):
        # Freeze everything, then optionally unfreeze the later blocks.
        for p in self.mobilenet.parameters():
            p.requires_grad = False
        # The feature blocks sit one level down (inside mobilenet.features),
        # so index into the inner Sequential before slicing.
        for c in list(self.mobilenet[0].children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

    def forward(self, images):
        """
        :param
            images: Tensor[batch_size, 3, img_size, img_size]
        :return
            out: Tensor[batch_size, 7, 7, hidden_size] for a 224x224 input
                 (the spatial size is img_size / 32)
        """
        out = self.mobilenet(images)
        out = out.permute(0, 2, 3, 1)  # channels-last: [B, H, W, C]
        return out

def test_encoder():
    encoder = MobileNetV2Encoder()
    latent = encoder(torch.rand((10, 3, 224, 224)))
    assert latent.size() == torch.Size([10, 7, 7, 1280]), latent.size()

test_encoder()
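
Note that the spatial size of the feature map depends on the input resolution, since the MobileNetV2 feature extractor has an overall stride of 32. A small sketch of what I expect from the encoder above (the exact shapes are my assumption):

# Sketch: the encoder's spatial grid scales with the input resolution.
encoder = MobileNetV2Encoder()
with torch.no_grad():
    print(encoder(torch.rand(2, 3, 224, 224)).shape)  # [2, 7, 7, 1280]
    print(encoder(torch.rand(2, 3, 160, 160)).shape)  # [2, 5, 5, 1280]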

And here is the Bahdanau attention:

class BahdanauAttention(nn.Module):

    def __init__(self, enc_hidden_size, dec_hidden_size, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(enc_hidden_size, hidden_size)
        self.W2 = nn.Linear(dec_hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, features, h_state):
        h_state = h_state.unsqueeze(1)  # [batch_size, 1, dec_hidden_size]
        score = F.elu(self.W1(features) + self.W2(h_state))  # [batch_size, num_pixels, hidden_size]
        attention_weights = F.softmax(self.V(score), dim=1)  # [batch_size, num_pixels, 1]
        context_vector = attention_weights * features  # [batch_size, num_pixels, enc_hidden_size]
        context_vector = torch.sum(context_vector, dim=1)  # [batch_size, enc_hidden_size]
        return context_vector, attention_weights.squeeze(2)


def test_attention():
    attention = BahdanauAttention(enc_hidden_size=1280, dec_hidden_size=512, hidden_size=512)
    context_vector, attention_weights = attention(torch.rand((10, 7*7, 1280)), torch.rand((10, 512)))
    assert context_vector.size() == torch.Size([10, 1280])
    assert attention_weights.size() == torch.Size([10, 7*7])

test_attention()
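
For context, DecoderWithBahdanauAttention is not shown here, so the following is only a rough sketch of how a single decoding step could use the attention module; all of the names and sizes below are illustrative assumptions, not the actual decoder:

# Hypothetical single decoding step: attend over the encoder features,
# then feed [previous-word embedding; context vector] to an LSTMCell.
enc_hidden_size, dec_hidden_size, embedding_size, vocab_size = 1280, 512, 320, 1000
attention = BahdanauAttention(enc_hidden_size, dec_hidden_size, 512)
lstm_cell = nn.LSTMCell(embedding_size + enc_hidden_size, dec_hidden_size)
fc = nn.Linear(dec_hidden_size, vocab_size)

features = torch.rand(10, 7 * 7, enc_hidden_size)  # flattened encoder output
embedded = torch.rand(10, embedding_size)           # embedding of the previous word
h = torch.zeros(10, dec_hidden_size)
c = torch.zeros(10, dec_hidden_size)

context, alphas = attention(features, h)            # [10, 1280], [10, 49]
h, c = lstm_cell(torch.cat([embedded, context], dim=1), (h, c))
logits = fc(h)                                      # [10, vocab_size]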

I run into the error when I put the full autoencoder (encoder + decoder) together:

def test_autoencoder():
    encoder = MobileNetV2Encoder()
    decoder = DecoderWithBahdanauAttention(enc_hidden_size=1280,
                                           attn_hidden_size=320,
                                           hidden_size=320,
                                           embedding_size=320,
                                           vocab_size=1000,
                                           dropout=0.5)
    autoencoder = AutoEncoder(encoder=encoder, decoder=decoder, device='cpu')
    logits, alphas, sorted_target_sequences, sorted_decode_lengths, sorted_indices = \
        autoencoder(torch.rand((10, 3, 160, 160)),
                    torch.randint(low=0, high=1000, size=(10, 25)),
                    torch.randint(low=5, high=26, size=(10,)), 0.5)
    assert logits.size() == torch.Size([max(sorted_decode_lengths), 10, 1000])
    assert alphas.size() == torch.Size([10, max(sorted_decode_lengths), 7*7])
    assert len(sorted_decode_lengths) == 10
    assert sorted_target_sequences.size() == torch.Size([10, 25])
    assert len(sorted_indices) == 10

test_autoencoder()

The test fails on these two assertions:

assert logits.size() == torch.Size([max(sorted_decode_lengths), 10, 1000])
assert alphas.size() == torch.Size([10, max(sorted_decode_lengths), 7*7])

Can anyone help me work out what is going wrong here?
