I am trying to implement a program that compares the word-prediction performance of an LSTM against that of a GRU. I use the same parameters for both, but while I get good perplexity values for the LSTM, the values I get for the GRU are absolutely terrible. I recently tried to debug the training function, since it was originally written only for the LSTM model and not for the GRU. As I said, the two models should end up with similar values, but right now the LSTM starts at a perplexity of about 150 and converges to a reasonable value, while the GRU starts at some value in the thousands and does not converge at all.
I am fairly new to all things RNN, LSTM and GRU, so please forgive me if I am missing something obvious. Any help would be greatly appreciated!
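For reference, the perplexities I report are just the exponential of the cross-entropy loss (see math.exp(train_loss.item()) in the training function below), so the starting values I describe correspond roughly to losses of 5 and 7. A tiny illustration, where to_perplexity is just a name made up for this post:

import math

def to_perplexity(cross_entropy_loss):
    # Perplexity is just exp of the (mean) cross-entropy loss in nats
    return math.exp(cross_entropy_loss)

print(to_perplexity(5.0))  # ~148  -> roughly where my LSTM starts
print(to_perplexity(7.0))  # ~1097 -> roughly where my GRU starts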
I am using the following two models:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(LSTM_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, (hidden_state, cell_state) = self.lstm(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, (hidden_state, cell_state)
class GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(GRU_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, hidden_state = self.gru(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, hidden_state
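In case it helps, this is roughly how I instantiate and sanity-check the two models with identical hyperparameters; the concrete numbers below are placeholders for this post, not my real configuration:

import torch
import torch.nn as nn

# Placeholder values for this sanity check only
vocab_size, embed_size, hidden_size, num_layers = 10000, 128, 256, 2
batch_size, seq_length = 20, 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_model = LSTM_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)
gru_model = GRU_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)

dummy_input = torch.randint(0, vocab_size, (batch_size, seq_length)).to(device)
lstm_state = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
gru_state = torch.zeros(num_layers, batch_size, hidden_size).to(device)

lstm_out, _ = lstm_model(dummy_input, lstm_state)
gru_out, _ = gru_model(dummy_input, gru_state)
print(lstm_out.shape, gru_out.shape)  # both come out as (batch_size*seq_length, vocab_size)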
The training function:
def run_model(model, epochs=epochs, learning_rate=learning_rate, clip=clip, momentum=momentum, LSTM=True, GRU=False, Dropout=False):
    # Define loss criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=decay_rate)

    train_perplexity, test_perplexity, valid_perplexity = [], [], []

    # Train the model
    for e in range(epochs):
        # Set all initial hidden and cell states to zeroes
        train_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        test_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        valid_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)

        # RUN TRAINING SET #
        model.train()
        for i in range(0, ids.size(1) - seq_length, seq_length):
            # Set train_inputs and train_targets
            train_inputs = ids[:, i:i+seq_length].to(device)
            train_targets = ids[:, (i+1):(i+1)+seq_length].to(device)

            # Forward pass
            model.zero_grad()
            if LSTM == True:
                train_states = [state.detach() for state in train_states]  # Detach the hidden state from how it was previously produced
            if GRU == True:
                train_states = train_states.data  # detach? (see the sketch right after this function)
            train_outputs, train_states = model(train_inputs, train_states)
            train_loss = criterion(train_outputs, train_targets.reshape(-1))

            # Backward and optimize
            train_loss.backward()
            clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            lr_scheduler.step()

        model.eval()
        with torch.no_grad():
            ...  # test and validation, removed to reduce length
        model.train()  # reset to train mode after iterating through validation data

        train_perplexity.append(math.exp(train_loss.item()))
        test_perplexity.append(np.exp(np.mean(test_losses)))
        valid_perplexity.append(np.exp(np.mean(valid_losses)))

        print('Epoch ' + str(e+1) + '/' + str(epochs) + ': ')
        print('Train Perplexity - ' + str(train_perplexity[e]))
        print('Test Perplexity - ' + str(test_perplexity[e]))
        print('Validation Perplexity - ' + str(valid_perplexity[e]))
        print("----------------------------------------------------")

    return train_perplexity, test_perplexity, valid_perplexity
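About the "detach?" comment above: this is the one place where I treat the two models differently, and I am not sure whether .data does the same thing as the .detach() used in the LSTM branch. A symmetric variant would be a small helper like the following (detach_states is just a name made up for this post), which would cut the graph the same way for both models:

def detach_states(states):
    # Stop backprop at the start of the current chunk, whether states is
    # an LSTM (hidden, cell) tuple or a single GRU hidden-state tensor.
    if isinstance(states, (tuple, list)):
        return tuple(s.detach() for s in states)
    return states.detach()

Whether the .data vs .detach() difference could matter for the non-convergence is exactly the kind of thing I am unsure about.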
Hidden state initialization:
def init_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    if LSTM == True:
        return (torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device),
                torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device))
    if GRU == True:
        return torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device)
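Note that the comment in run_model says the states are set to zeroes, while init_states actually draws them uniformly from (r1, r2). For comparison, a zero-initialized version would look like this (init_zero_states is just an illustrative name, not something in my code):

def init_zero_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    # Zero initialization, matching the "set to zeroes" comment in run_model
    if LSTM == True:
        return (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                torch.zeros(num_layers, batch_size, hidden_size).to(device))
    if GRU == True:
        return torch.zeros(num_layers, batch_size, hidden_size).to(device)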