Starting from the LSTM code provided in the official Theano tutorial ( http://deeplearning.net/tutorial/code/lstm.py ), I changed the LSTM layer code (i.e. the functions lstm_layer() and param_init_lstm()) so that it implements a GRU instead.
The LSTM code from the tutorial trains well, but my GRU does not: with the LSTM, accuracy on the training set rises to 1 (training cost = 0), whereas with the GRU it plateaus at 0.7 (training cost = 0.3).
Below is the code I use for the GRU. I kept the same function names as in the tutorial so the code can be copy-pasted straight into it. What could explain the GRU's poor performance?
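For reference, this is the standard GRU recurrence the code is meant to implement (Cho et al., 2014), written in the tutorial's row-vector convention so that weight matrices multiply on the right; \odot denotes element-wise multiplication:

    r_t = \sigma(x_t W_r + h_{t-1} U_r + b_r)                       % reset gate
    z_t = \sigma(x_t W_z + h_{t-1} U_z + b_z)                       % update gate
    \tilde{h}_t = \tanh(x_t W_h + (r_t \odot h_{t-1}) U_h + b_h)    % candidate state
    h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t           % new hidden state

In the code below, W packs [W_r | W_z | W_h], U packs [U_r | U_z | U_h], and b packs the three biases, each block being dim_proj columns wide.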
import numpy as np
import theano
from theano import config
import theano.tensor as tensor

# ortho_weight, _p and numpy_floatX are the helper functions
# already defined in the tutorial's lstm.py.
def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU parameter initialisation (the LSTM function name is kept so that
    the code can be pasted directly into the tutorial's lstm.py).
    """
    # Input-to-hidden weights: one (dim_proj, dim_proj) orthogonal block
    # each for the reset gate, the update gate and the candidate state,
    # concatenated column-wise.
    W = np.concatenate([ortho_weight(options['dim_proj']),   # reset gate
                        ortho_weight(options['dim_proj']),   # update gate
                        ortho_weight(options['dim_proj'])],  # candidate state
                       axis=1)
    params[_p(prefix, 'W')] = W
    # Hidden-to-hidden weights, with the same block layout.
    U = np.concatenate([ortho_weight(options['dim_proj']),   # reset gate
                        ortho_weight(options['dim_proj']),   # update gate
                        ortho_weight(options['dim_proj'])],  # candidate state
                       axis=1)
    params[_p(prefix, 'U')] = U
    # Biases for the reset gate, the update gate and the candidate state.
    b = np.zeros((3 * options['dim_proj'],))
    params[_p(prefix, 'b')] = b.astype(config.floatX)
    return params
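As a quick sanity check on the parameter shapes (assuming, as in the tutorial, that ortho_weight(d) returns a d x d orthogonal matrix and that _p(prefix, name) builds the key 'prefix_name'):

    options = {'dim_proj': 128}
    params = param_init_lstm(options, {})
    print(params['lstm_W'].shape)   # (128, 384): blocks [reset | update | candidate]
    print(params['lstm_U'].shape)   # (128, 384)
    print(params['lstm_b'].shape)   # (384,)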
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        # Select the n-th dim-wide block from a concatenated matrix.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        # x_ already holds x_t . W + b for this time step (precomputed below).
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))  # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))  # update gate

        # Candidate state: the reset gate scales the previous hidden state
        # before it is multiplied by the third block of U.
        U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice(x_, 2, options['dim_proj'])
        h_t_temp = tensor.tanh(tensor.dot(r * h_, U_h_t) + x_h_t)

        # Leaky integration between the previous state and the candidate.
        h = (1. - u) * h_ + u * h_t_temp
        # Apply the mask: keep the previous state on padded positions.
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h

    # Precompute the input contribution x_t . W + b for all time steps.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
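To check what the step function is supposed to compute, here is a minimal NumPy sketch of a single GRU step under the same concatenated [reset | update | candidate] weight layout (the names sigmoid and gru_step_numpy are mine, not from the tutorial; the mask logic is omitted):

    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    def gru_step_numpy(x_t, h_prev, W, U, b):
        """One GRU step. W and U are (dim, 3*dim); columns ordered [r | u | h~]."""
        dim = h_prev.shape[-1]
        pre = np.dot(x_t, W) + b          # input contribution to all three blocks
        rec = np.dot(h_prev, U)           # recurrent contribution for r and u
        r = sigmoid(pre[:, 0:dim] + rec[:, 0:dim])                # reset gate
        u = sigmoid(pre[:, dim:2 * dim] + rec[:, dim:2 * dim])    # update gate
        h_tilde = np.tanh(pre[:, 2 * dim:] +
                          np.dot(r * h_prev, U[:, 2 * dim:]))     # candidate state
        return (1. - u) * h_prev + u * h_tilde                    # leaky integration

    # Example with random data:
    rng = np.random.RandomState(0)
    dim, batch = 4, 2
    W = rng.randn(dim, 3 * dim)
    U = rng.randn(dim, 3 * dim)
    b = np.zeros(3 * dim)
    x_t = rng.randn(batch, dim)
    h_prev = np.zeros((batch, dim))
    print(gru_step_numpy(x_t, h_prev, W, U, b).shape)   # -> (2, 4)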