I've just started learning about pretrained CNN architectures for image captioning, and I'm trying to run some code. The code works fine when I use resnet_101, but not when I switch to mobilenet_v2 with a 224x224 input, whose feature-extraction output is 7x7x1280. I'm also trying to extend it with Bahdanau attention, and I'm running into some errors.
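For context, a quick sanity check on the feature-map size (assuming torchvision's pretrained mobilenet_v2, whose feature extractor downsamples by an overall factor of 32, so the spatial size scales with the input size):

import torch
import torchvision

features = torchvision.models.mobilenet_v2(pretrained=True).features
with torch.no_grad():
    for size in (224, 160):
        out = features(torch.rand(1, 3, size, size))
        print(size, tuple(out.shape))  # 224 -> (1, 1280, 7, 7); 160 -> (1, 1280, 5, 5)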
Here is the code for the MobileNet encoder:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision


class MobileNetV2Encoder(nn.Module):
    def __init__(self, hidden_size=1280):
        super(MobileNetV2Encoder, self).__init__()
        mobilenet = torchvision.models.mobilenet_v2(pretrained=True)
        # Drop the classifier head; keep only the convolutional feature extractor.
        modules = list(mobilenet.children())[:-1]
        self.hidden_size = hidden_size
        self.mobilenet = nn.Sequential(*modules)

    def fine_tuning_mobilenet(self, fine_tune):
        # Freeze everything first, then optionally unfreeze the later children.
        for p in self.mobilenet.parameters():
            p.requires_grad = False
        for c in list(self.mobilenet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

    def forward(self, images):
        """
        :param images: Tensor[batch_size, 3, img_size, img_size]
        :return out: Tensor[batch_size, 7, 7, hidden_size] for 224x224 inputs
        """
        out = self.mobilenet(images)
        out = out.permute(0, 2, 3, 1)  # channels-last: [batch_size, H, W, C]
        return out
def test_encoder():
    encoder = MobileNetV2Encoder()
    latent = encoder(torch.rand((10, 3, 224, 224)))
    assert latent.size() == torch.Size([10, 7, 7, 1280]), latent.size()

test_encoder()
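Between the encoder and the attention, the 7x7 grid has to be flattened into num_pixels = 49. This is my understanding of how the two connect (in my code the flattening happens inside the decoder):

encoder = MobileNetV2Encoder()
feats = encoder(torch.rand(2, 3, 224, 224))            # [2, 7, 7, 1280]
feats = feats.view(feats.size(0), -1, feats.size(-1))  # [2, 49, 1280] = [batch, num_pixels, enc_hidden_size]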
And here is the Bahdanau attention:
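(For reference, each pixel i is scored additively as e_i = V · elu(W1·f_i + W2·h), which is Bahdanau's formulation with ELU in place of the original tanh; the attention weights are then the softmax of e over the num_pixels axis.)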
class BahdanauAttention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(enc_hidden_size, hidden_size)
        self.W2 = nn.Linear(dec_hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, features, h_state):
        h_state = h_state.unsqueeze(1)                       # [batch_size, 1, dec_hidden_size]
        score = F.elu(self.W1(features) + self.W2(h_state))  # [batch_size, num_pixels, hidden_size]
        attention_weights = F.softmax(self.V(score), dim=1)  # [batch_size, num_pixels, 1]
        context_vector = attention_weights * features        # [batch_size, num_pixels, enc_hidden_size]
        context_vector = torch.sum(context_vector, dim=1)    # [batch_size, enc_hidden_size]
        return context_vector, attention_weights.squeeze(2)
def test_attention():
    attention = BahdanauAttention(enc_hidden_size=1280, dec_hidden_size=512, hidden_size=512)
    # h_state must match dec_hidden_size=512, otherwise W2 raises a shape error
    context_vector, attention_weights = attention(torch.rand((10, 7*7, 1280)),
                                                  torch.rand((10, 512)))
    assert context_vector.size() == torch.Size([10, 1280])
    assert attention_weights.size() == torch.Size([10, 7*7])

test_attention()
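As a side check on the softmax dimension, the weights should sum to 1 over the pixel axis for each batch element (a quick sketch reusing the module above):

attention = BahdanauAttention(enc_hidden_size=1280, dec_hidden_size=512, hidden_size=512)
_, w = attention(torch.rand(4, 7*7, 1280), torch.rand(4, 512))
print(w.sum(dim=1))  # each entry should be ~1.0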
I get the error when I put the autoencoder together:
def test_autoencoder():
    encoder = MobileNetV2Encoder()
    decoder = DecoderWithBahdanauAttention(enc_hidden_size=1280,
                                           attn_hidden_size=320,
                                           hidden_size=320,
                                           embedding_size=320,
                                           vocab_size=1000,
                                           dropout=0.5)
    autoencoder = AutoEncoder(encoder=encoder, decoder=decoder, device='cpu')
    logits, alphas, sorted_target_sequences, sorted_decode_lengths, sorted_indices = \
        autoencoder(torch.rand((10, 3, 160, 160)),
                    torch.randint(low=0, high=1000, size=(10, 25)),
                    torch.randint(low=5, high=26, size=(10,)), 0.5)
    assert logits.size() == torch.Size([max(sorted_decode_lengths), 10, 1000])
    assert alphas.size() == torch.Size([10, max(sorted_decode_lengths), 7*7])
    assert len(sorted_decode_lengths) == 10
    assert sorted_target_sequences.size() == torch.Size([10, 25])
    assert len(sorted_indices) == 10

test_autoencoder()
The error is raised on these lines:
assert logits.size() == torch.Size([max(sorted_decode_lengths), 10, 1000])
assert alphas.size() == torch.Size([10, max(sorted_decode_lengths), 7*7])
Can someone help me figure out this error?
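In case it helps, this is how I'd print the actual sizes instead of asserting (same autoencoder and inputs as in test_autoencoder above, with the asserts swapped for prints):

logits, alphas, *rest = autoencoder(torch.rand((10, 3, 160, 160)),
                                    torch.randint(low=0, high=1000, size=(10, 25)),
                                    torch.randint(low=5, high=26, size=(10,)), 0.5)
print('logits:', tuple(logits.size()))  # expected [max(sorted_decode_lengths), 10, 1000]
print('alphas:', tuple(alphas.size()))  # expected [10, max(sorted_decode_lengths), 7*7]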