python - GRU 语言模型未正确训练

Question

我尝试使用 GRU 和线性层重新实现一个简单的 GRU 语言模型（完整代码也在https://www.kaggle.com/alvations/gru-language-model-not-training-properly）：

class Generator(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
        super(Generator, self).__init__()

        # Initialize the embedding layer with the 
        # - size of input (i.e. no. of words in input vocab)
        # - no. of hidden nodes in the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)

        # Initialize the GRU with the 
        # - size of the input (i.e. embedding layer)
        # - size of the hidden layer 
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers)

        # Initialize the "classifier" layer to map the RNN outputs
        # to the vocabulary. Remember we need to -1 because the 
        # vectorized sentence we left out one token for both x and y:
        # - size of hidden_size of the GRU output.
        # - size of vocabulary
        self.classifier = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs, use_softmax=False, hidden=None):
        # Look up for the embeddings for the input word indices.
        embedded = self.embedding(inputs)
        # Put the embedded inputs into the GRU.
        output, hidden = self.gru(embedded, hidden)

        # Matrix manipulation magic.
        batch_size, sequence_len, hidden_size = output.shape
        # Technically, linear layer takes a 2-D matrix as input, so more manipulation...
        output = output.contiguous().view(batch_size * sequence_len, hidden_size)
        # Put it through the classifier
        # And reshape it to [batch_size x sequence_len x vocab_size]
        output = self.classifier(output).view(batch_size, sequence_len, -1)

        return (F.softmax(output,dim=2), hidden) if use_softmax else (output, hidden)


    def generate(self, max_len, temperature=1.0):
        pass

和训练程序：

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set the hidden_size of the GRU 
embed_size = 100
hidden_size = 100
num_layers = 1

# Setup the data.
batch_size=50
kilgariff_data = KilgariffDataset(tokenized_text)
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss(ignore_index=kilgariff_data.vocab.token2id['<pad>'], size_average=True)
model = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers).to(device)

learning_rate = 0.003
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#model = nn.DataParallel(model)

losses = []

def train(num_epochs, dataloader, model, criterion, optimizer):
    plt.ion()
    for _e in range(num_epochs):
        for batch in tqdm(dataloader):
            x = batch['x'].to(device)
            x_len = batch['x_len'].to(device)
            y = batch['y'].to(device)
            # Zero gradient.
            optimizer.zero_grad()
            # Feed forward. 
            output, hidden = model(x, use_softmax=True)
            # Compute loss:
            # Shape of the `output` is [batch_size x sequence_len x vocab_size]
            # Shape of `y` is [batch_size x sequence_len]
            # CrossEntropyLoss expects `output` to be [batch_size x vocab_size x sequence_len]

            _, prediction = torch.max(output, dim=2)
            loss = criterion(output.permute(0, 2, 1), y)
            loss.backward()
            optimizer.step()
            losses.append(loss.float().data)

            clear_output(wait=True)
            plt.plot(losses)
            plt.pause(0.05)


train(50, dataloader, model, criterion, optimizer)

#learning_rate = 0.05
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#train(4, dataloader, model, criterion, optimizer)

但是当模型进行预测时，我们看到它只是预测“the”和逗号“,”。

有人发现我的代码有问题吗？还是超参数？

完整代码：

# coding: utf-8

# In[1]:


# IPython candies...
from IPython.display import Image
from IPython.core.display import HTML

from IPython.display import clear_output


# In[2]:


import numpy as np
from tqdm import tqdm

import pandas as pd

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

device = 'cuda' if torch.cuda.is_available() else 'cpu'


# In[3]:


import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(12, 8)})


torch.manual_seed(42)


# In[4]:


try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except: # Use a naive sentence tokenizer and toktok.
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize


# In[5]:


import os
import requests
import io #codecs


# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    text = requests.get(url).content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)


# In[6]:


# Tokenize the text.
tokenized_text = [list(map(str.lower, word_tokenize(sent))) 
                  for sent in sent_tokenize(text)]


# In[7]:


class KilgariffDataset(nn.Module):
    def __init__(self, texts):
        self.texts = texts

        # Initialize the vocab 
        special_tokens = {'<pad>': 0, '<unk>':1, '<s>':2, '</s>':3}
        self.vocab = Dictionary(texts)
        self.vocab.patch_with_special_tokens(special_tokens)

        # Keep track of the vocab size.
        self.vocab_size = len(self.vocab)

        # Keep track of how many data points.
        self._len = len(texts)

        # Find the longest text in the data.
        self.max_len = max(len(txt) for txt in texts) 

    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts[index])
        x_len = len(vectorized_sent)
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, self.max_len - len(vectorized_sent))
        vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
        return {'x':vectorized_sent[:-1], 
                'y':vectorized_sent[1:], 
                'x_len':x_len}

    def __len__(self):
        return self._len

    def vectorize(self, tokens, start_idx=2, end_idx=3):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)

        vectorized_sent = [start_idx] + self.vocab.doc2idx(tokens) + [end_idx]
        return torch.tensor(vectorized_sent)

    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]


# In[8]:


kilgariff_data = KilgariffDataset(tokenized_text)
len(kilgariff_data.vocab)


# In[9]:


batch_size = 10
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break


# In[97]:


class Generator(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
        super(Generator, self).__init__()

        # Initialize the embedding layer with the 
        # - size of input (i.e. no. of words in input vocab)
        # - no. of hidden nodes in the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)

        # Initialize the GRU with the 
        # - size of the input (i.e. embedding layer)
        # - size of the hidden layer 
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers)

        # Initialize the "classifier" layer to map the RNN outputs
        # to the vocabulary. Remember we need to -1 because the 
        # vectorized sentence we left out one token for both x and y:
        # - size of hidden_size of the GRU output.
        # - size of vocabulary
        self.classifier = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs, use_softmax=False, hidden=None):
        # Look up for the embeddings for the input word indices.
        embedded = self.embedding(inputs)
        # Put the embedded inputs into the GRU.
        output, hidden = self.gru(embedded, hidden)

        # Matrix manipulation magic.
        batch_size, sequence_len, hidden_size = output.shape
        # Technically, linear layer takes a 2-D matrix as input, so more manipulation...
        output = output.contiguous().view(batch_size * sequence_len, hidden_size)
        # Put it through the classifier
        # And reshape it to [batch_size x sequence_len x vocab_size]
        output = self.classifier(output).view(batch_size, sequence_len, -1)

        return (F.softmax(output,dim=2), hidden) if use_softmax else (output, hidden)


    def generate(self, max_len, temperature=1.0):
        pass


# In[98]:


# Set the hidden_size of the GRU 
embed_size = 12
hidden_size = 10
num_layers = 4

_encoder = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers)


# In[99]:


# Take a batch.
_batch = next(iter(dataloader))
_inputs, _lengths = _batch['x'], _batch['x_len']
_targets = _batch['y']
max(_lengths)


# In[100]:


_output, _hidden = _encoder(_inputs)
print('Output sizes:\t', _output.shape)
print('Input sizes:\t', batch_size, kilgariff_data.max_len -1, len(kilgariff_data.vocab))
print('Target sizes:\t', _targets.shape)


# In[101]:


_, predicted_indices = torch.max(_output, dim=2)
print(predicted_indices.shape)
predicted_indices


# In[103]:


device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set the hidden_size of the GRU 
embed_size = 100
hidden_size = 100
num_layers = 1

# Setup the data.
batch_size=50
kilgariff_data = KilgariffDataset(tokenized_text)
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss(ignore_index=kilgariff_data.vocab.token2id['<pad>'], size_average=True)
model = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers).to(device)

learning_rate = 0.003
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#model = nn.DataParallel(model)

losses = []

def train(num_epochs, dataloader, model, criterion, optimizer):
    plt.ion()
    for _e in range(num_epochs):
        for batch in tqdm(dataloader):
            x = batch['x'].to(device)
            x_len = batch['x_len'].to(device)
            y = batch['y'].to(device)
            # Zero gradient.
            optimizer.zero_grad()
            # Feed forward. 
            output, hidden = model(x, use_softmax=True)
            # Compute loss:
            # Shape of the `output` is [batch_size x sequence_len x vocab_size]
            # Shape of `y` is [batch_size x sequence_len]
            # CrossEntropyLoss expects `output` to be [batch_size x vocab_size x sequence_len]

            _, prediction = torch.max(output, dim=2)
            loss = criterion(output.permute(0, 2, 1), y)
            loss.backward()
            optimizer.step()
            losses.append(loss.float().data)

            clear_output(wait=True)
            plt.plot(losses)
            plt.pause(0.05)


train(50, dataloader, model, criterion, optimizer)

#learning_rate = 0.05
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#train(4, dataloader, model, criterion, optimizer)


# In[ ]:


list(kilgariff_data.vocab.items())


# In[105]:


start_token = '<s>'
hidden_state = None
max_len = 20
temperature=0.8

i = 0

while start_token not in ['</s>', '<pad>'] and i < max_len:
    i += 1
    start_state = torch.tensor(kilgariff_data.vocab.token2id[start_token]).unsqueeze(0).unsqueeze(0).to(device)
    model.embedding(start_state)
    output, hidden_state = model.gru(model.embedding(start_state), hidden_state)

    batch_size, sequence_len, hidden_size = output.shape
    output = output.contiguous().view(batch_size * sequence_len, hidden_size)

    output = model.classifier(output).view(batch_size, sequence_len, -1)
    _, prediction = torch.max(F.softmax(output, dim=2), dim=2)

    start_token = kilgariff_data.vocab[int(prediction.squeeze(0).squeeze(0))]

    print(start_token, end=' ')

score 2 · Accepted Answer

我绝不是 PyTorch 专家，但该片段对我来说看起来很可疑：

    # Put the embedded inputs into the GRU.
    output, hidden = self.gru(embedded, hidden)
    # Matrix manipulation magic.
    batch_size, sequence_len, hidden_size = output.shape
    # Technically, linear layer takes a 2-D matrix as input, so more manipulation...
    output = output.contiguous().view(batch_size * sequence_len, hidden_size)

当GRU没有用实例化时batch_first=True，输出形状是(seq_len, batch, num_directions * hidden_size)-- 不是那个seq_len并且batch_size被翻转。对于 view 命令，它在技术上实际上并不重要，但这是我的主要问题。
view(batch_size * sequence_len, hidden_size)看起来根本不对。假设您从一批大小为 32 的批次开始，但之后您的大小为32*seq_len. 通常，仅使用最后一步的输出（或所有步骤的平均值或最大值）

像这样的东西应该工作：

    # Put the embedded inputs into the GRU.
    output, hidden = self.gru(embedded, hidden)
    # Not needed, just to show the true output shape order
    seq_len, batch_size, hidden_size = output.shape
    # Given the shape of output, this is the last step
    output = output[-1]
    # output.shape = (batch_size, hidden_size) <-- What you want

两个个人的警告词：

view()是危险的命令！PyTorch 或任何其他框架仅在张量的维度不匹配时才会引发错误。但仅仅因为尺寸适合之后view()并不意味着重塑正确完成，即这些值位于输出张量的正确位置。例如，如果你必须将一个形状展平(seq_len, batch_size, hidden_size)为(batch_size, seq_len*hidden_size)，你不能简单地做view(batch_size, -1)，而首先必须做transpose(1,0)来得到一个的形状(batch_size, seq_len, hidden_size)。没有transpose(),view()将起作用并且尺寸将是正确的。但只有在transpose()之后，这些值才处于正确的位置view()
由于这是一个很容易犯的错误，我在 GitHub 上看到了很多示例，我认为这些示例没有正确完成。问题是网络通常仍然可以学到一些东西。简而言之，我在查看和采用代码片段时并没有更加小心，而view()我认为命令是最大的陷阱。

如果有帮助，这里forward是 GRU 分类器网络的方法：

def forward(self, batch, method='last_step'):
    embeds = self.word_embeddings(batch)
    x = torch.transpose(embeds, 0, 1)
    x, self.hidden = self.gru(x, self.hidden)

    if method == 'last_step':
        x = x[-1]
    elif method == 'average_pooling':
        x = torch.sum(x, dim=0) / len(batch[0])
    elif method == 'max_pooling':
        x, _ = torch.max(x, dim=0)
    else:
        raise Exception('Unknown method.')
    # A series of Linear layers with ReLU and Dropout
    for l in self.linears:
        x = l(x)
    log_probs = F.log_softmax(x, dim=1)
    return log_probs

score 0 · Accepted Answer

TL;博士

这条线train()应该是

output, hidden = model(x, use_softmax=False)

训练时禁用use_softmax，那么模型应该正确训练并且训练 CE 损失将减少到接近 0。

见https://www.kaggle.com/alvations/gru-language-model

python - GRU 语言模型未正确训练

2 回答 2

TL;博士

Related

Reference