python - 使用 theano 的 RNNLM

Question

我在 theano 用户列表上问了同样的问题，但没有得到答复，只是想知道是否有人可以在这里帮助我。

我正在尝试参照这篇不错的帖子，重新实现论文 http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf 中的 RNNLM。我用一个玩具测试用例做了实验：训练数据是 PTB 训练集的前 100 句（从 http://rnnlm.org/ 下载），评估时也使用同一份数据。

基线：

我使用来自 http://rnnlm.org/ 的 rnnlm-0.4b 将语言模型训练了 25 次迭代，得到的测试结果是：test log probability: -4107.323481，PPL net: 85.496622

生成基线的命令行是：

$ rnnlm -train ../ptb/ptb.train.txt.toy -valid ../ptb/ptb.train.txt.toy -rnnlm rnn.model.toy -hidden 10 -class 1 -max-iter 25 -independent -beta 0 -direct-order 0
$ rnnlm -rnnlm rnn.model.toy -test ../ptb/ptb.train.txt.toy -independent

使用我的实现，经过 25 次迭代，PPL 有很大的不同：

epoch=24: log probability=-5374.255371 ppl=337.187731

我还在学习 Theano，我在实现过程中是否遗漏了什么？

谢谢

我的实现可以在这里找到：

#! /usr/bin/env python

import itertools
import codecs
import numpy as np
import nltk
import sys
import time
from datetime import datetime
import theano as theano
import theano.tensor as T

class RNNLM:
    """Single-layer recurrent language model whose graph is built with Theano.

    The parameters are three weight matrices stored as Theano shared
    variables: ``U`` (hidden_dim x word_dim), ``V`` (word_dim x hidden_dim)
    and ``W`` (hidden_dim x hidden_dim). The model has no bias terms.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Draw the initial weights and compile the Theano functions.

        word_dim: number of columns of U and rows of V (width of the softmax).
        hidden_dim: length of the recurrent state vector.
        bptt_truncate: forwarded to ``theano.scan`` as ``truncate_gradient``.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        # Uniform init ranges: U is bounded by the input width, V and W by the
        # hidden width.
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        # The draw order (U, V, W) is kept so a seeded RNG yields the same weights.
        init_u = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        init_v = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        init_w = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

        float_x = theano.config.floatX
        self.U = theano.shared(name='U', value=init_u.astype(float_x))
        self.V = theano.shared(name='V', value=init_v.astype(float_x))
        self.W = theano.shared(name='W', value=init_w.astype(float_x))

        # Placeholder dict; nothing in this class reads it.
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        """Build the symbolic graph and compile the callable Theano functions.

        Sets ``forward_propagation``, ``ce_error``, ``logp`` and ``sgd_step``
        on the instance.
        """
        U, V, W = self.U, self.V, self.W
        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(word_idx, prev_state, U, V, W):
            # Selecting column ``word_idx`` of U is the product of U with a
            # one-hot input vector.
            state = T.tanh(U[:, word_idx] + W.dot(prev_state))
            return [V.dot(state), state]

        # The recurrent state starts from zeros for every sequence.
        start_state = T.zeros(self.hidden_dim)
        (o, s), _scan_updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, {'initial': start_state}],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)
        p_x_given_h = T.nnet.softmax(o)

        # Training objective: cross-entropy summed over the sequence.
        o_error = T.sum(T.nnet.categorical_crossentropy(p_x_given_h, y))
        # Reporting metric: base-10 log-probability of the target words, summed.
        target_rows = T.arange(y.shape[0])
        logp = T.sum(T.log10(p_x_given_h)[target_rows, y])

        # Gradients of the objective with respect to each weight matrix.
        params = (self.U, self.V, self.W)
        grads = [T.grad(o_error, param) for param in (U, V, W)]

        # Compiled entry points.
        self.forward_propagation = theano.function([x], p_x_given_h)
        self.ce_error = theano.function([x, y], o_error)
        self.logp = theano.function([x, y], logp)

        # Plain SGD: one step per call, learning rate supplied by the caller.
        learning_rate = T.scalar('learning_rate')
        sgd_updates = [(param, param - learning_rate * grad)
                       for param, grad in zip(params, grads)]
        self.sgd_step = theano.function([x, y, learning_rate], [],
                                        updates=sgd_updates)

    def calculate_total_loss(self, X, Y):
        """Return ``ce_error`` summed over every (x, y) pair of X and Y."""
        per_sequence = [self.ce_error(x, y) for x, y in zip(X, Y)]
        return np.sum(per_sequence)

    def calculate_loss(self, X, Y):
        """Return the total loss divided by the number of target words in Y."""
        total = self.calculate_total_loss(X, Y)
        word_count = np.sum([len(y) for y in Y])
        return total / float(word_count)

    def calculate_ppl(self, X, Y):
        """Return ``(perplexity, log10_probability)`` over all pairs of X and Y.

        Perplexity is ``10 ** (-logp / word_count)``, where ``logp`` is the
        sum of ``self.logp`` over the sequences.
        """
        word_count = np.sum([len(y) for y in Y])
        total_logp = np.sum([self.logp(x, y) for x, y in zip(X, Y)])
        perplexity = 10 ** (-total_logp / word_count)
        return perplexity, total_logp


def train_with_sgd(model, X_train, y_train, X_valid, y_valid, learning_rate=0.005, nepoch=1, evaluate_loss_after=5):
    """Train ``model`` with per-sequence SGD and print validation perplexity each epoch.

    model: object exposing ``sgd_step(x, y, learning_rate)``,
        ``calculate_loss(X, Y)`` and ``calculate_ppl(X, Y)``.
    X_train, y_train: parallel collections of input / target sequences.
    X_valid, y_valid: data used only for the per-epoch perplexity report.
    learning_rate: initial step size; halved whenever the training loss of an
        epoch is higher than that of the previous epoch.
    nepoch: number of passes over the training data.
    evaluate_loss_after: accepted for interface compatibility; not used here
        (the loss is evaluated after every epoch).

    Returns the list of ``(num_examples_seen, training_loss)`` tuples, one per
    epoch, so the caller can plot or inspect the training curve.
    """
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # One SGD update per training sequence.
        for x, y in zip(X_train, y_train):
            model.sgd_step(x, y, learning_rate)
            num_examples_seen += 1

        train_loss = model.calculate_loss(X_train, y_train)
        losses.append((num_examples_seen, train_loss))

        ppl, logp = model.calculate_ppl(X_valid, y_valid)
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("epoch=%d: log probability=%f ppl=%f" % (epoch, logp, ppl))

        # Halve the learning rate when the training loss went up.
        if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
            learning_rate = learning_rate * 0.5
            print("Setting learning rate to %f" % learning_rate)

    return losses

def _ragged_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one list per slot."""
    packed = np.empty(len(sequences), dtype=object)
    # Element-wise assignment keeps the result 1-D even when every sequence
    # happens to have the same length.
    for i, seq in enumerate(sequences):
        packed[i] = seq
    return packed


def load_data(path="../ptb/ptb.train.txt.toy", eos_token=None):
    """Read a one-sentence-per-line corpus and turn it into index sequences.

    path: text file to read (UTF-8); defaults to the toy PTB file used by the
        script.
    eos_token: token added before and after every sentence; when ``None`` the
        module-level ``sentence_end_token`` is used.

    Returns ``(X_train, y_train, size)``. ``X_train[i]`` holds the word
    indices of sentence ``i`` without its last token, ``y_train[i]`` the same
    sentence shifted by one (without its first token), both as 1-D object
    arrays of lists. ``size`` is the number of distinct tokens plus one.
    """
    # Function-scope import: Counter offers the same most_common() used below.
    from collections import Counter

    if eos_token is None:
        eos_token = sentence_end_token

    print("load data...")

    # The context manager closes the file once all lines have been read.
    with codecs.open(path, "r", "UTF-8") as corpus:
        train = [("%s %s %s" % (eos_token, line.strip(), eos_token)).split()
                 for line in corpus]

    print("Parsed %d sentences." % (len(train)))

    # Count the word frequencies
    word_freq = Counter(itertools.chain.from_iterable(train))
    print("Found %d unique words tokens." % len(word_freq))

    # Indices are assigned in order of decreasing frequency.
    index_to_word = [word for word, _count in word_freq.most_common()]
    word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

    X_train = _ragged_object_array(
        [[word_to_index[w] for w in sent[:-1]] for sent in train])
    y_train = _ragged_object_array(
        [[word_to_index[w] for w in sent[1:]] for sent in train])

    vocabulary_size = len(word_to_index)

    # NOTE(review): the +1 reserves one index that no word in word_to_index
    # maps to -- presumably a slot for unknown words; confirm it is intended.
    return X_train, y_train, vocabulary_size + 1



# Experiment configuration for the toy PTB run.
hidden_dim = 10
sentence_end_token = "eos"  # read by load_data() as the sentence boundary marker
learning_rate = 0.1
nepoc=25
bptt_truncate=100
model_file=None  # not used by the statements below

# Read the data
X_train, y_train, vocabulary_size = load_data()
print("vocabulary_size: " + str(vocabulary_size))
# Pass bptt_truncate explicitly: without it the model falls back to its
# default of 4 and the value configured above is silently ignored.
model = RNNLM(vocabulary_size, hidden_dim=hidden_dim, bptt_truncate=bptt_truncate)
# The training data doubles as the validation data in this toy setup.
train_with_sgd(model, X_train, y_train, X_train, y_train, nepoch=nepoc, learning_rate=learning_rate)

score 0 · Accepted Answer

我也在解决同样的问题。我不确定是否已经找出了所有问题，但如果你把 V 的权重打印出来，会发现它们比其他权重大得多。我在初始化 V 时改为除以词汇表大小，而不是隐藏单元的数量。这有一定帮助，但网络仍然没有完全收敛。

您需要打印所有权重的值，此代码将帮助您入门。

我尝试添加 L1 和 L2 正则化，但没有帮助。

# Bound V's initial range by the vocabulary size instead of the hidden-layer size.
v_limit = np.sqrt(1./n_unique_words)
V = np.random.uniform(-v_limit, v_limit, (n_unique_words, n_hidden))

def sum_weights(self):
    """Evaluate and return ``self.V.sum()``."""
    return self.V.sum().eval()

def abs_sum_weights(self):
    """Evaluate and return the sum of ``np.abs(self.V)``."""
    magnitudes = np.abs(self.V)
    return magnitudes.sum().eval()

这项工作仍在进行中。如果你需要的不只是我给出的这些片段，其余代码在这里：https://github.com/timestocome/AliceInWonderland 。往回翻几次提交记录，就能找到 L1/L2 正则化的代码。

python - 使用 theano 的 RNNLM

1 回答 1

Related

Reference