I asked the same question on the Theano users list but got no reply, so I'm wondering whether anyone here can help.
I am trying to re-implement the RNNLM of http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf based on this nice post. I tried a toy test case in which the training data is the first 100 sentences of the PTB training data (downloaded from http://rnnlm.org/), and the same data is used for evaluation.
Baseline:
I trained an LM with rnnlm-0.4b from http://rnnlm.org/ for 25 iterations and got the test log probability: -4107.323481, PPL net: 85.496622
The command lines used to generate the baseline are:
$ rnnlm -train ../ptb/ptb.train.txt.toy -valid ../ptb/ptb.train.txt.toy -rnnlm rnn.model.toy -hidden 10 -class 1 -max-iter 25 -independent -beta 0 -direct-order 0
$ rnnlm -rnnlm rnn.model.toy -test ../ptb/ptb.train.txt.toy -independent
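As far as I understand the rnnlm options, -class 1 gives a single output class (i.e. an ordinary full softmax over the vocabulary), -beta 0 turns off L2 regularization, -direct-order 0 disables the direct maximum-entropy connections, and -independent resets the hidden state at sentence boundaries, so the baseline should be as close as possible to the plain RNN below.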
With my implementation, after 25 iterations the PPL is very different:
epoch=24: log probability=-5374.255371 ppl=337.187731
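If I invert the formula used in calculate_ppl below (ppl = 10 ** (-logp / num_words)), both outputs imply the same word count of about 2126 predicted tokens, so the two perplexities seem to be computed the same way and the gap should come from the model rather than from the evaluation. A minimal check:

import math

def implied_word_count(logp, ppl):
    # Inverts ppl = 10 ** (-logp / num_words), the formula used in calculate_ppl below,
    # so num_words = -logp / log10(ppl).
    return -logp / math.log10(ppl)

print implied_word_count(-4107.323481, 85.496622)   # rnnlm baseline    -> ~2126
print implied_word_count(-5374.255371, 337.187731)  # my implementation -> ~2126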
I am still learning Theano, so am I missing something in my implementation?
Thanks.
My implementation is below:
#! /usr/bin/env python
import itertools
import codecs
import numpy as np
import nltk
import sys
import time
from datetime import datetime
import theano as theano
import theano.tensor as T
class RNNLM:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        # Theano: Created shared variables
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))  # @UndefinedVariable
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))  # @UndefinedVariable
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))  # @UndefinedVariable
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        U, V, W = self.U, self.V, self.W
        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t_prev, U, V, W):
            s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
            o_t = V.dot(s_t)
            return [o_t, s_t]

        [o, s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)

        p_x_given_h = T.nnet.softmax(o)
        o_error = T.sum(T.nnet.categorical_crossentropy(p_x_given_h, y))
        logp = T.sum(T.log10(p_x_given_h)[T.arange(y.shape[0]), y])

        # Gradients
        dU = T.grad(o_error, U)
        dV = T.grad(o_error, V)
        dW = T.grad(o_error, W)

        # Assign functions
        self.forward_propagation = theano.function([x], p_x_given_h)
        self.ce_error = theano.function([x, y], o_error)
        self.logp = theano.function([x, y], logp)

        # SGD
        learning_rate = T.scalar('learning_rate')
        self.sgd_step = theano.function(
            [x, y, learning_rate], [],
            updates=[(self.U, self.U - learning_rate * dU),
                     (self.V, self.V - learning_rate * dV),
                     (self.W, self.W - learning_rate * dW)])
    def calculate_total_loss(self, X, Y):
        return np.sum([self.ce_error(x, y) for x, y in zip(X, Y)])

    def calculate_loss(self, X, Y):
        # Divide calculate_total_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X, Y) / float(num_words)

    def calculate_ppl(self, X, Y):
        num_words = np.sum([len(y) for y in Y])
        # print "word count: " + str(num_words)
        logp = np.sum([self.logp(x, y) for x, y in zip(X, Y)])
        ppl = 10 ** (-logp / num_words)
        return ppl, logp
def train_with_sgd(model, X_train, y_train, X_valid, y_valid, learning_rate=0.005, nepoch=1, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # For each training example...
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
        loss = model.calculate_loss(X_train, y_train)
        losses.append((num_examples_seen, loss))
        time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        loss = model.calculate_loss(X_valid, y_valid)
        ppl, logp = model.calculate_ppl(X_valid, y_valid)
        print "epoch=%d: log probability=%f ppl=%f" % (epoch, logp, ppl)
        # Adjust the learning rate if loss increases
        if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
            learning_rate = learning_rate * 0.5
            print "Setting learning rate to %f" % learning_rate
def load_data():
    print "load data..."
    train = [("%s %s %s" % (sentence_end_token, x.strip(), sentence_end_token)).split()
             for x in codecs.open("../ptb/ptb.train.txt.toy", "r", "UTF-8")]
    print "Parsed %d sentences." % (len(train))
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*train))
    print "Found %d unique words tokens." % len(word_freq.items())
    vocab = word_freq.most_common()
    index_to_word = [x[0] for x in vocab]
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in train])
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in train])
    vocabulary_size = len(word_to_index)
    return X_train, y_train, vocabulary_size + 1
hidden_dim = 10
sentence_end_token = "eos"
learning_rate = 0.1
nepoch = 25
bptt_truncate = 100
model_file = None

# Read the data
X_train, y_train, vocabulary_size = load_data()
print "vocabulary_size: " + str(vocabulary_size)

model = RNNLM(vocabulary_size, hidden_dim=hidden_dim)
train_with_sgd(model, X_train, y_train, X_train, y_train, nepoch=nepoch, learning_rate=learning_rate)
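For completeness, the numbers reported above come from the per-epoch call inside train_with_sgd; a standalone evaluation on the same toy data (roughly the counterpart of the rnnlm -test command in the baseline) would simply be:

ppl, logp = model.calculate_ppl(X_train, y_train)
print "test log probability=%f ppl=%f" % (logp, ppl)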