I'm implementing a variant of a pointer network, and I'd like to apply dropout to the hidden state during the encode/decode steps.
Here is my dropout function:
def dropout_layer(proj, use_noise):
    trng = RandomStreams()
    # use_noise == 1: multiply by a fresh 0/1 Bernoulli mask (training);
    # use_noise == 0: scale by the keep probability 0.5 (test time)
    proj = tensor.switch(use_noise,
                         proj * trng.binomial(proj.shape, p=0.5, n=1,
                                              dtype=proj.dtype),
                         proj * 0.5)
    return proj
I chose to initialize a RandomStreams instance on every function call because that seemed to be what Keras does...
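For comparison, the LSTM tutorial this code appears to be modeled on creates a single generator up front and passes it into the layer, which is what the commented-out trng argument further down was for. A minimal sketch of that pattern, assuming the MRG streams that the GPU_mrg_uniform node in the traceback below points to:

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(seed=1234)  # one generator shared by the whole graph

def dropout_layer(proj, use_noise, trng):
    # same switch as above, but reusing the caller's generator
    return tensor.switch(use_noise,
                         proj * trng.binomial(proj.shape, p=0.5, n=1,
                                              dtype=proj.dtype),
                         proj * 0.5)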
Here is the step function that scan calls:
def _lstm_e(m_, x_, h_, c_):
    preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
    preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_
    h = dropout_layer(h, use_noise)  # , trng)
    u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
    prob = tensor.nnet.softmax(u)
    # prob = softmax(at_mask, u)
    return h, c, prob
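(For completeness, _p is essentially the key-joining helper from the tutorial code:)

def _p(pp, name):
    # ('lstm_en', 'W') -> 'lstm_en_W', the key used in the tparams dict
    return '%s_%s' % (pp, name)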
And here is the call to scan:
ones = tensor.ones((n_samples,), dtype=p.dtype)
h0 = tensor.outer(ones, tparams['lstm_hterm'])  # n_samples * dim_proj; T.tile doesn't work on non-constant reps
c0 = tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj'])
at_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
rval, updates = theano.scan(_lstm_e,
                            sequences=[p_mask, p],
                            outputs_info=[h0, c0, None],
                            # non_sequences=trng,
                            name='encoding',
                            n_steps=n_sizes)
hiddens, cells, argt_probs = rval  # hiddens: n_sizes * n_samples * dim_proj
# hiddens = tensor.concatenate([tensor.shape_padleft(h0), hiddens], axis=0)
f_encode = theano.function([p_mask, p], hiddens, updates=updates)
f_argt_probs = theano.function([p_mask, p], argt_probs, updates=updates)
Note how I pass scan's updates on to the compiled theano functions (those updates contain the new state of the random generator), which, from what others have said, is supposed to fix problems like this one.
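A quick way to confirm that the rng state really shows up in that dict (just a debugging snippet of mine, not part of the model):

for shared_var, new_value in updates.items():
    print('%s -> %s' % (shared_var, new_value))  # should list the mrg state update(s)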
And finally, the error:
Traceback (most recent call last):
  File "ptrnets_j1_d.py", line 973, in <module>
    reload_model=args.reload,
  File "ptrnets_j1_d.py", line 760, in train_lstm
    (use_noise, p, p_mask, x, x_mask, y, y_mask, at, at_mask, preds, cost, f_encode, f_decode, f_probi, f_argt_preds, anneal) = build_model(tparams, model_options)
  File "ptrnets_j1_d.py", line 690, in build_model
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
  File "ptrnets_j1_d.py", line 368, in ptr_network
    f_decode = theano.function([p_mask, p, x_mask, x], preds, updates=updates)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function.py", line 320, in function
    output_keys=output_keys)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/pfunc.py", line 479, in pfunc
    output_keys=output_keys)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 1776, in orig_function
    output_keys=output_keys).create(
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 1428, in __init__
    accept_inplace)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 177, in std_fgraph
    update_mapping=update_mapping)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 171, in __init__
    self.__import_r__(output, reason="init")
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 360, in __import_r__
    self.__import__(variable.owner, reason=reason)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 465, in __import__
    detailed_err_msg)
theano.gof.fg.MissingInputError: A variable that is an input to the graph was neither provided as an input to the function nor given a value. A chain of variables leading from this input to an output is [p_mask[t], DimShuffle{0,x}.0, Subtensor{::, ::}.0, Elemwise{mul,no_inplace}.0, Elemwise{add,no_inplace}.0, Shape.0, GPU_mrg_uniform{CudaNdarrayType(float32, matrix),no_inplace}.0]. This chain may not be unique
Backtrace when the variable is created:
  File "ptrnets_j1_d.py", line 973, in <module>
    reload_model=args.reload,
  File "ptrnets_j1_d.py", line 760, in train_lstm
    (use_noise, p, p_mask, x, x_mask, y, y_mask, at, at_mask, preds, cost, f_encode, f_decode, f_probi, f_argt_preds, anneal) = build_model(tparams, model_options)
  File "ptrnets_j1_d.py", line 690, in build_model
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
  File "ptrnets_j1_d.py", line 350, in ptr_network
    n_steps=n_sizes)
One thought is that the binomial generator raises this error when it is handed a symbolic shape inside a function that scan iterates. That was already mentioned in this post.
Do I just need some small change, or do I have to completely rework the way I apply dropout?
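If it's the former, one small change I'm considering (my own untested adaptation of that suggestion; the drop_masks name and shape are my assumptions) is to sample every step's mask once, outside scan, and feed them to the step function through sequences, so nothing inside the inner graph draws random numbers:

trng = RandomStreams(seed=1234)  # created once, outside the scanned function

# one 0/1 mask per encoding step, sampled outside scan
drop_masks = trng.binomial((n_sizes, n_samples, options['dim_proj']),
                           p=0.5, n=1, dtype=config.floatX)

def _lstm_e(m_, x_, dm_, h_, c_):
    preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
    preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_
    h = tensor.switch(use_noise, h * dm_, h * 0.5)  # dm_ replaces dropout_layer
    u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
    prob = tensor.nnet.softmax(u)
    return h, c, prob

rval, updates = theano.scan(_lstm_e,
                            sequences=[p_mask, p, drop_masks],
                            outputs_info=[h0, c0, None],
                            name='encoding',
                            n_steps=n_sizes)

This would keep the per-step semantics (the dropped h still feeds the next step), at the cost of materializing an n_sizes * n_samples * dim_proj mask tensor.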
EDIT A more complete version of the code:
scan is called inside this function:
def ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options):
    n_sizes = p.shape[0]
    n_samples = p.shape[1] if p.ndim == 3 else 1
    n_steps = x.shape[0]
    beam_width = xi.shape[0]
    assert p_mask is not None
    assert x_mask is not None
    assert xi_mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        if _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    def softmax(m_, x_):
        maxes = tensor.max(x_, axis=0, keepdims=True)
        e = tensor.exp(x_ - maxes)
        dist = e / tensor.sum(e * m_, axis=0)
        return dist

    def _lstm_e(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
        preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        h = dropout_layer(h, use_noise)  # , trng)
        u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
        prob = tensor.nnet.softmax(u)
        # prob = softmax(at_mask, u)
        return h, c, prob

    def _lstm_d(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p('lstm_de', 'W')]) + tparams[_p('lstm_de', 'b')]
        preact += tensor.dot(h_, tparams[_p('lstm_de', 'U')])
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        h = dropout_layer(h, use_noise)  # , trng)
        return h, c

    def _ptr_probs(xm_, x_, h_, c_, _, hprevs, hprevs_m):
        xemb = p[x_, tensor.arange(n_samples), :]  # n_samples * dim_proj
        h, c = _lstm_d(xm_, xemb, h_, c_)
        u = tensor.dot(hprevs, tparams['ptr_W1']) + tensor.dot(h, tparams['ptr_W2'])  # n_steps * n_samples * dim
        u = tensor.tanh(u)  # n_sizes * n_samples * dim_proj
        u = tensor.dot(u, tparams['ptr_v'])  # n_sizes * n_samples
        # prob = tensor.nnet.softmax(u.T).T  # n_sizes * n_samples
        prob = softmax(hprevs_m, u)
        return h, c, prob

    ones = tensor.ones((n_samples,), dtype=p.dtype)
    h0 = tensor.outer(ones, tparams['lstm_hterm'])  # n_samples * dim_proj; T.tile doesn't work on non-constant reps
    c0 = tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj'])
    at_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
    rval, updates = theano.scan(_lstm_e,
                                sequences=[p_mask, p],
                                outputs_info=[h0, c0, None],
                                # non_sequences=trng,
                                name='encoding',
                                n_steps=n_sizes)
    hiddens, cells, argt_probs = rval  # hiddens: n_sizes * n_samples * dim_proj
    # hiddens = tensor.concatenate([tensor.shape_padleft(h0), hiddens], axis=0)
    f_encode = theano.function([p_mask, p], hiddens, updates=updates)
    f_argt_probs = theano.function([p_mask, p], argt_probs, updates=updates)

    # decoding
    hiddens_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
    # hiddens_mask = tensor.concatenate([tensor.ones((1, n_samples), dtype=config.floatX), p_mask], axis=0)
    rval, updates2 = theano.scan(_ptr_probs,
                                 sequences=[x_mask, x],
                                 outputs_info=[hiddens[-1],  # n_samples * dim_proj
                                               tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj']),  # cells[-1],
                                               tensor.alloc(numpy_floatX(0.), n_sizes, n_samples)],
                                 non_sequences=[hiddens, hiddens_mask],
                                 name='decoding',
                                 n_steps=n_steps)
    preds = rval[2]
    f_decode = theano.function([p_mask, p, x_mask, x], preds, updates=updates2)

    u0 = tensor.alloc(numpy_floatX(0.), hidi.shape[0], beam_width)
    hiddeni, celli, probi = _ptr_probs(xi_mask, xi, hidi, celi, u0, hids, hiddens_mask)
    f_probi = theano.function(inputs=[xi_mask, xi, hidi, celi, hids, p_mask, p], outputs=[hiddeni, celli, probi])
    return preds, f_encode, f_decode, f_probi, argt_probs, f_argt_probs
and ptr_network is called here:
def build_model(tparams, options):
    # trng = RandomStreams()
    # for training
    p = tensor.tensor3('p', dtype=config.floatX)  # Problems, n_sizes * n_samples * data_dim
    p_mask = tensor.matrix('p_mask', dtype=config.floatX)
    x = tensor.matrix('x', dtype='int64')  # n_steps * n_samples
    x_mask = tensor.matrix('x_mask', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int64')  # n_steps * n_samples
    y_mask = tensor.matrix('y_mask', dtype=config.floatX)
    at = tensor.matrix('at', dtype='int64')  # n_steps * n_samples
    at_mask = tensor.matrix('at_mask', dtype=config.floatX)
    use_noise = theano.shared(numpy_floatX(0.))
    anneal = tensor.scalar('anneal', dtype=config.floatX)
    # for generation
    hidi = tensor.matrix('hidi', dtype=config.floatX)
    celi = tensor.matrix('celi', dtype=config.floatX)
    hids = tensor.tensor3('hids', dtype=config.floatX)
    xi = tensor.vector('xi', dtype='int64')
    xi_mask = tensor.vector('xi_mask', dtype=config.floatX)
    n_steps = x.shape[0]
    n_samples = x.shape[1]
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
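For completeness, use_noise is the usual train/test switch: setting it selects between the two branches of tensor.switch in dropout_layer, roughly:

use_noise.set_value(1.)  # training: multiply by freshly sampled 0/1 masks
# ... run the training updates ...
use_noise.set_value(0.)  # validation/test: scale by the keep probability 0.5 instead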