我想在 PyTorch 中为 FashionMNIST 数据集构建一个 LSTM 模型。稍后我需要将其扩展到包含视频的不同数据集。
它应该得到一个图像序列(FashionMNIST)作为输入(比如说 20 张图像),输出应该告诉我序列中有多少运动鞋(第 6 类)以及它们在序列中的位置。
我想知道这是否可以使用简单的 LSTM 或简单的 CNN,或者我是否需要 CNN_LSTM?我试图在 PyTorch 中实现一个 CNN_LSTM。您可以在下面找到我当前的模型(现在会引发错误)。最后一行抛出以下错误:“输入必须有 3 个维度,得到 4 个”(我还添加了错误消息的第一部分作为图片)。有人可以提供一些帮助吗?我这样做的方式正确吗?我无法修复错误,我不确定我的其余代码是否正确。我对 LSTM 很陌生。另外,如何转换 FashionMNIST 数据集,使其始终以 20 张图像的序列出现?
提前谢谢了!
class CNN(nn.Module):
def __init__(self, K):
super(CNN, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2))
# three fully connected layer
self.fc1 = nn.Linear(in_features=64*6*6, out_features=600)
self.drop = nn.Dropout2d(0.25)
self.fc2 = nn.Linear(in_features=600, out_features=120)
self.fc3 = nn.Linear(in_features=120, out_features=10)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.view(out.size(0), -1)
out = self.fc1(out)
out = self.drop(out)
out = self.fc2(out)
out = self.fc3(out)
return out
class Combine(nn.Module):
def __init__(self, K):
super(Combine, self).__init__()
self.cnn = CNN(K)
self.D = 10 # n_inputs
self.M = 128 # n_hidden
self.K = 2 # n_outputs
self.L = 10 # n_rnnlayers
self.rnn = nn.LSTM(
input_size=self.D,
hidden_size=self.M,
num_layers=self.L,
batch_first=True)
self.fc =nn.Linear(self.M, self.K)
def forward(self, X):
# initial hidden states
h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
c0 = torch.zeros(self.L, X.size(0), self.M).to(device)
# get RNN unit output
out, _ = self.rnn(X, (h0, c0))
out = self.fc(out)
return out
model = Combine(K)
# use GPU in colab if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
# Loss and optimizer
learning_rate = 0.001
criterion = nn.CrossEntropyLoss() # because mutli-class classification (includes softmax activation function for multi-class already)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Training and testing the model
def batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs):
train_losses = np.zeros(epochs)
test_losses = np.zeros(epochs)
# iterate over epochs
for it in range(epochs):
model.train()
t0 = datetime.now()
train_loss = []
for inputs, targets in train_loader:
# move data to GPU
#inputs = inputs.reshape(-1,28,28)
inputs, targets = inputs.to(device), targets.to(device)
# zero the parameter gradients (empty gradients) for backward pass
# Initializing a gradient as 0 so there is no mixing of gradient among the batches
optimizer.zero_grad()
# Forward pass
outputs = model(inputs)
loss = criterion(outputs, targets)
# Backward and optimize
loss.backward() # propagating the error backward
optimizer.step() # optimizing the parameters
train_loss.append(loss.item())
# Get train loss and test loss
train_loss = np.mean(train_loss) # a little misleading
# evaluate model
model.eval()
test_loss = []
for inputs, targets in test_loader: # test samples and targets
# move data to GPU
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs)
loss = criterion(outputs, targets)
test_loss.append(loss.item())
test_loss = np.mean(test_loss)
# Save losses
train_losses[it] = train_loss
test_losses[it] = test_loss
dt = datetime.now() - t0
print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
Test Loss: {test_loss:.4f}, Duration: {dt}')
return train_losses, test_losses
train_losses, test_losses = batch_gd(
model, criterion, optimizer, train_loader, test_loader, epochs=15)