I am trying to implement a program that compares the word-prediction performance of an LSTM against that of a GRU. I use the same parameters for both, but while I get good perplexity values for the LSTM, the values I get for the GRU are absolutely terrible. I recently tried to debug the training function, since it was originally written only for the LSTM model and not for the GRU. As I said, the two models should end up with similar values, but right now the LSTM starts at a perplexity of about 150 and converges to a reasonable value, while the GRU starts at some value in the thousands and does not converge at all.
I am fairly new to all things RNN, LSTM and GRU, so please forgive me if I am missing something obvious. Any help would be greatly appreciated!
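For reference, the perplexities I report are just the exponential of the cross-entropy loss (see math.exp(train_loss.item()) in the training function below), so the starting values I describe correspond roughly to losses of 5 and 7. A tiny illustration, where to_perplexity is just a name made up for this post:

import math

def to_perplexity(cross_entropy_loss):
    # Perplexity is just exp of the (mean) cross-entropy loss in nats
    return math.exp(cross_entropy_loss)

print(to_perplexity(5.0))  # ~148  -> roughly where my LSTM starts
print(to_perplexity(7.0))  # ~1097 -> roughly where my GRU starts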
I am using the following two models:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(LSTM_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, (hidden_state, cell_state) = self.lstm(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, (hidden_state, cell_state)
class GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(GRU_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, hidden_state = self.gru(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, hidden_state
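In case it helps, this is roughly how I instantiate and sanity-check the two models with identical hyperparameters; the concrete numbers below are placeholders for this post, not my real configuration:

import torch
import torch.nn as nn

# Placeholder values for this sanity check only
vocab_size, embed_size, hidden_size, num_layers = 10000, 128, 256, 2
batch_size, seq_length = 20, 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_model = LSTM_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)
gru_model = GRU_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)

dummy_input = torch.randint(0, vocab_size, (batch_size, seq_length)).to(device)
lstm_state = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
gru_state = torch.zeros(num_layers, batch_size, hidden_size).to(device)

lstm_out, _ = lstm_model(dummy_input, lstm_state)
gru_out, _ = gru_model(dummy_input, gru_state)
print(lstm_out.shape, gru_out.shape)  # both come out as (batch_size*seq_length, vocab_size)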
The training function:
def run_model(model, epochs=epochs, learning_rate=learning_rate, clip=clip, momentum=momentum, LSTM=True, GRU=False, Dropout=False):
    # Define loss criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=decay_rate)

    train_perplexity, test_perplexity, valid_perplexity = [], [], []

    # Train the model
    for e in range(epochs):
        # Set all initial hidden and cell states to zeroes
        train_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        test_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        valid_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)

        # RUN TRAINING SET #
        model.train()
        for i in range(0, ids.size(1) - seq_length, seq_length):
            # Set train_inputs and train_targets
            train_inputs = ids[:, i:i+seq_length].to(device)
            train_targets = ids[:, (i+1):(i+1)+seq_length].to(device)

            # Forward pass
            model.zero_grad()
            if LSTM == True:
                train_states = [state.detach() for state in train_states]  # Detach the hidden state from how it was previously produced
            if GRU == True:
                train_states = train_states.data  # detach? (see the sketch right after this function)
            train_outputs, train_states = model(train_inputs, train_states)
            train_loss = criterion(train_outputs, train_targets.reshape(-1))

            # Backward and optimize
            train_loss.backward()
            clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            lr_scheduler.step()

        model.eval()
        with torch.no_grad():
            ...  # test and validation, removed to reduce length
        model.train()  # reset to train mode after iterating through validation data

        train_perplexity.append(math.exp(train_loss.item()))
        test_perplexity.append(np.exp(np.mean(test_losses)))
        valid_perplexity.append(np.exp(np.mean(valid_losses)))

        print('Epoch ' + str(e+1) + '/' + str(epochs) + ': ')
        print('Train Perplexity - ' + str(train_perplexity[e]))
        print('Test Perplexity - ' + str(test_perplexity[e]))
        print('Validation Perplexity - ' + str(valid_perplexity[e]))
        print("----------------------------------------------------")

    return train_perplexity, test_perplexity, valid_perplexity
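About the "detach?" comment above: this is the one place where I treat the two models differently, and I am not sure whether .data does the same thing as the .detach() used in the LSTM branch. A symmetric variant would be a small helper like the following (detach_states is just a name made up for this post), which would cut the graph the same way for both models:

def detach_states(states):
    # Stop backprop at the start of the current chunk, whether states is
    # an LSTM (hidden, cell) tuple or a single GRU hidden-state tensor.
    if isinstance(states, (tuple, list)):
        return tuple(s.detach() for s in states)
    return states.detach()

Whether the .data vs .detach() difference could matter for the non-convergence is exactly the kind of thing I am unsure about.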
Hidden state initialization:
def init_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    if LSTM == True:
        return (torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device),
                torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device))
    if GRU == True:
        return torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device)
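Note that the comment in run_model says the states are set to zeroes, while init_states actually draws them uniformly from (r1, r2). For comparison, a zero-initialized version would look like this (init_zero_states is just an illustrative name, not something in my code):

def init_zero_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    # Zero initialization, matching the "set to zeroes" comment in run_model
    if LSTM == True:
        return (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                torch.zeros(num_layers, batch_size, hidden_size).to(device))
    if GRU == True:
        return torch.zeros(num_layers, batch_size, hidden_size).to(device)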