我正在关注 D2L 书籍,GoogLeNet 章节的练习之一是从 Rethinking the Inception Architecture for Computer Vision 论文中获取模型,所以我这样做了,并且基本上基于 Inception V1 架构重新创建了 Inception V2 架构。 D2L 书,我唯一没有包括的是论文中描述的网格缩减。我希望这会改善我的模型的结果,但如果有的话,它会使它们变得更糟。我正在使用 GoogLeNet 章节中关于时尚_mnist 数据集的 D2L 代码测试模型,并使用 Inception V1 获得了 0.908 的测试准确度,但是在我的 V2 实现中,我只得到了 0.885 的测试 acc,这是我能做到的最高得到。我预计 V2 模型会更好,所以我真的不知道我哪里出错了,
这是前 3 个 inception 块的代码(来自论文中的图 5)
class InceptionB1(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB1, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 3 x 3
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
# Path 3 is a 1 x 1 convolutional layer followed by 2 3x3 convs (factorization)
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
# Replacing path3_2 with modified path using factorization through smaller convolutions
self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=3, padding = 1)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
p3 = F.relu(self.p3_3(self.p3_2(F.relu(self.p3_1(x)))))
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
这是我中间 5 个块的代码(论文中的图 6)
class InceptionB2(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB2, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 1 x 3 then 3x1 conv
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=1)
self.p2_3 = nn.Conv2d(c2[1], c2[1], kernel_size=(3,1), padding=0)
# Path 3 is a 1 x 1 conv layer followed by a 1x3 conv then 3x1 then 1x3 then 3x1
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=(1,3), padding=1)
self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
self.p3_4 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
self.p3_5 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=1)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2 = F.relu(self.p2_3(F.relu(self.p2_2(F.relu(self.p2_1(x))))))
p3 = F.relu(self.p3_5(F.relu(self.p3_4(F.relu(self.p3_3(F.relu(self.p3_2(F.relu(self.p3_1(x))))))))))
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
这是我最后 2 个块的代码(论文中的图 7)
class InceptionB3(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB3, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 1x3 and 3x1
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2_1 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=0)
self.p2_2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(3,1), padding=0)
# Path 3 is a 1 x 1 convolutional layer followed by a 3x3 then 1x3 and 3x1
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
self.p3_3_1 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
self.p3_3_2 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2_1 = F.relu(self.p2_2_1(F.relu(self.p2_1(x))))
p2_2 = F.relu(elf.p2_2_2(F.relu(self.p2_1(x))))
p2 = torch.matmul(p2_1, p2_2)
p3_1 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
p3_2_1 = F.relu(self.p3_3_1(p3_1))
p3_2_2 = F.relu(sself.p3_3_2(p3_1))
p3 = torch.matmul(p3_2_1, p3_2_2)
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
这是 D2L 章节的链接 https://d2l.ai/chapter_convolutional-modern/googlenet.html