I'm implementing a variant of a pointer network, and I'd like to apply dropout to the hidden state during the encode/decode steps.
Here is my dropout function:
def dropout_layer(proj, use_noise):
    trng = RandomStreams()
    # use_noise == 1: multiply by a fresh 0/1 Bernoulli mask (training);
    # use_noise == 0: scale by the keep probability 0.5 (test time)
    proj = tensor.switch(use_noise,
                         proj * trng.binomial(proj.shape, p=0.5, n=1,
                                              dtype=proj.dtype),
                         proj * 0.5)
    return proj
I chose to initialize a RandomStreams instance on every function call because that seemed to be what Keras does...
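For comparison, the LSTM tutorial this code appears to be modeled on creates a single generator up front and passes it into the layer, which is what the commented-out trng argument further down was for. A minimal sketch of that pattern, assuming the MRG streams that the GPU_mrg_uniform node in the traceback below points to:

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(seed=1234)  # one generator shared by the whole graph

def dropout_layer(proj, use_noise, trng):
    # same switch as above, but reusing the caller's generator
    return tensor.switch(use_noise,
                         proj * trng.binomial(proj.shape, p=0.5, n=1,
                                              dtype=proj.dtype),
                         proj * 0.5)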
Here is the step function that scan calls:
def _lstm_e(m_, x_, h_, c_):
    preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
    preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_
    h = dropout_layer(h, use_noise)  # , trng)
    u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
    prob = tensor.nnet.softmax(u)
    # prob = softmax(at_mask, u)
    return h, c, prob
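(For completeness, _p is essentially the key-joining helper from the tutorial code:)

def _p(pp, name):
    # ('lstm_en', 'W') -> 'lstm_en_W', the key used in the tparams dict
    return '%s_%s' % (pp, name)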
And here is the call to scan:
ones = tensor.ones((n_samples,), dtype=p.dtype)
h0 = tensor.outer(ones, tparams['lstm_hterm'])  # n_samples * dim_proj; T.tile doesn't work on non-constant reps
c0 = tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj'])
at_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
rval, updates = theano.scan(_lstm_e,
                            sequences=[p_mask, p],
                            outputs_info=[h0, c0, None],
                            # non_sequences=trng,
                            name='encoding',
                            n_steps=n_sizes)
hiddens, cells, argt_probs = rval  # hiddens: n_sizes * n_samples * dim_proj
# hiddens = tensor.concatenate([tensor.shape_padleft(h0), hiddens], axis=0)
f_encode = theano.function([p_mask, p], hiddens, updates=updates)
f_argt_probs = theano.function([p_mask, p], argt_probs, updates=updates)
Note how I pass scan's updates on to the compiled theano functions (those updates contain the new state of the random generator), which, from what others have said, is supposed to fix problems like this one.
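A quick way to confirm that the rng state really shows up in that dict (just a debugging snippet of mine, not part of the model):

for shared_var, new_value in updates.items():
    print('%s -> %s' % (shared_var, new_value))  # should list the mrg state update(s)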
And finally, the error:
Traceback (most recent call last):
  File "ptrnets_j1_d.py", line 973, in <module>
    reload_model=args.reload,
  File "ptrnets_j1_d.py", line 760, in train_lstm
    (use_noise, p, p_mask, x, x_mask, y, y_mask, at, at_mask, preds, cost, f_encode, f_decode, f_probi, f_argt_preds, anneal) = build_model(tparams, model_options)
  File "ptrnets_j1_d.py", line 690, in build_model
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
  File "ptrnets_j1_d.py", line 368, in ptr_network
    f_decode = theano.function([p_mask, p, x_mask, x], preds, updates=updates)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function.py", line 320, in function
    output_keys=output_keys)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/pfunc.py", line 479, in pfunc
    output_keys=output_keys)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 1776, in orig_function
    output_keys=output_keys).create(
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 1428, in __init__
    accept_inplace)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/compile/function_module.py", line 177, in std_fgraph
    update_mapping=update_mapping)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 171, in __init__
    self.__import_r__(output, reason="init")
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 360, in __import_r__
    self.__import__(variable.owner, reason=reason)
  File "/home/ppotash/virtual-env/local/lib/python2.7/site-packages/theano/gof/fg.py", line 465, in __import__
    detailed_err_msg)
theano.gof.fg.MissingInputError: A variable that is an input to the graph was neither provided as an input to the function nor given a value. A chain of variables leading from this input to an output is [p_mask[t], DimShuffle{0,x}.0, Subtensor{::, ::}.0, Elemwise{mul,no_inplace}.0, Elemwise{add,no_inplace}.0, Shape.0, GPU_mrg_uniform{CudaNdarrayType(float32, matrix),no_inplace}.0]. This chain may not be unique
Backtrace when the variable is created:
  File "ptrnets_j1_d.py", line 973, in <module>
    reload_model=args.reload,
  File "ptrnets_j1_d.py", line 760, in train_lstm
    (use_noise, p, p_mask, x, x_mask, y, y_mask, at, at_mask, preds, cost, f_encode, f_decode, f_probi, f_argt_preds, anneal) = build_model(tparams, model_options)
  File "ptrnets_j1_d.py", line 690, in build_model
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
  File "ptrnets_j1_d.py", line 350, in ptr_network
    n_steps=n_sizes)
One thought is that the binomial generator raises this error when it is handed a symbolic shape inside a function that scan iterates. That was already mentioned in this post.
Do I just need some small change, or do I have to completely rework the way I apply dropout?
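If it's the former, one small change I'm considering (my own untested adaptation of that suggestion; the drop_masks name and shape are my assumptions) is to sample every step's mask once, outside scan, and feed them to the step function through sequences, so nothing inside the inner graph draws random numbers:

trng = RandomStreams(seed=1234)  # created once, outside the scanned function

# one 0/1 mask per encoding step, sampled outside scan
drop_masks = trng.binomial((n_sizes, n_samples, options['dim_proj']),
                           p=0.5, n=1, dtype=config.floatX)

def _lstm_e(m_, x_, dm_, h_, c_):
    preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
    preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_
    h = tensor.switch(use_noise, h * dm_, h * 0.5)  # dm_ replaces dropout_layer
    u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
    prob = tensor.nnet.softmax(u)
    return h, c, prob

rval, updates = theano.scan(_lstm_e,
                            sequences=[p_mask, p, drop_masks],
                            outputs_info=[h0, c0, None],
                            name='encoding',
                            n_steps=n_sizes)

This would keep the per-step semantics (the dropped h still feeds the next step), at the cost of materializing an n_sizes * n_samples * dim_proj mask tensor.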
EDIT A more complete version of the code:
scan is called inside this function:
def ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options):
    n_sizes = p.shape[0]
    n_samples = p.shape[1] if p.ndim == 3 else 1
    n_steps = x.shape[0]
    beam_width = xi.shape[0]
    assert p_mask is not None
    assert x_mask is not None
    assert xi_mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        if _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    def softmax(m_, x_):
        maxes = tensor.max(x_, axis=0, keepdims=True)
        e = tensor.exp(x_ - maxes)
        dist = e / tensor.sum(e * m_, axis=0)
        return dist

    def _lstm_e(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p('lstm_en', 'W')]) + tparams[_p('lstm_en', 'b')]
        preact += tensor.dot(h_, tparams[_p('lstm_en', 'U')])
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        h = dropout_layer(h, use_noise)  # , trng)
        u = tensor.dot(h, tparams['fc_arg_W']) + tparams['fc_arg_b']
        prob = tensor.nnet.softmax(u)
        # prob = softmax(at_mask, u)
        return h, c, prob

    def _lstm_d(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p('lstm_de', 'W')]) + tparams[_p('lstm_de', 'b')]
        preact += tensor.dot(h_, tparams[_p('lstm_de', 'U')])
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        h = dropout_layer(h, use_noise)  # , trng)
        return h, c

    def _ptr_probs(xm_, x_, h_, c_, _, hprevs, hprevs_m):
        xemb = p[x_, tensor.arange(n_samples), :]  # n_samples * dim_proj
        h, c = _lstm_d(xm_, xemb, h_, c_)
        u = tensor.dot(hprevs, tparams['ptr_W1']) + tensor.dot(h, tparams['ptr_W2'])  # n_steps * n_samples * dim
        u = tensor.tanh(u)  # n_sizes * n_samples * dim_proj
        u = tensor.dot(u, tparams['ptr_v'])  # n_sizes * n_samples
        # prob = tensor.nnet.softmax(u.T).T  # n_sizes * n_samples
        prob = softmax(hprevs_m, u)
        return h, c, prob

    ones = tensor.ones((n_samples,), dtype=p.dtype)
    h0 = tensor.outer(ones, tparams['lstm_hterm'])  # n_samples * dim_proj; T.tile doesn't work on non-constant reps
    c0 = tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj'])
    at_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
    rval, updates = theano.scan(_lstm_e,
                                sequences=[p_mask, p],
                                outputs_info=[h0, c0, None],
                                # non_sequences=trng,
                                name='encoding',
                                n_steps=n_sizes)
    hiddens, cells, argt_probs = rval  # hiddens: n_sizes * n_samples * dim_proj
    # hiddens = tensor.concatenate([tensor.shape_padleft(h0), hiddens], axis=0)
    f_encode = theano.function([p_mask, p], hiddens, updates=updates)
    f_argt_probs = theano.function([p_mask, p], argt_probs, updates=updates)

    # decoding
    hiddens_mask = tensor.set_subtensor(p_mask[0, :], tensor.constant(1, dtype=config.floatX))
    # hiddens_mask = tensor.concatenate([tensor.ones((1, n_samples), dtype=config.floatX), p_mask], axis=0)
    rval, updates2 = theano.scan(_ptr_probs,
                                 sequences=[x_mask, x],
                                 outputs_info=[hiddens[-1],  # n_samples * dim_proj
                                               tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj']),  # cells[-1],
                                               tensor.alloc(numpy_floatX(0.), n_sizes, n_samples)],
                                 non_sequences=[hiddens, hiddens_mask],
                                 name='decoding',
                                 n_steps=n_steps)
    preds = rval[2]
    f_decode = theano.function([p_mask, p, x_mask, x], preds, updates=updates2)

    u0 = tensor.alloc(numpy_floatX(0.), hidi.shape[0], beam_width)
    hiddeni, celli, probi = _ptr_probs(xi_mask, xi, hidi, celi, u0, hids, hiddens_mask)
    f_probi = theano.function(inputs=[xi_mask, xi, hidi, celi, hids, p_mask, p], outputs=[hiddeni, celli, probi])
    return preds, f_encode, f_decode, f_probi, argt_probs, f_argt_probs
and ptr_network is called here:
def build_model(tparams, options):
    # trng = RandomStreams()
    # for training
    p = tensor.tensor3('p', dtype=config.floatX)  # Problems, n_sizes * n_samples * data_dim
    p_mask = tensor.matrix('p_mask', dtype=config.floatX)
    x = tensor.matrix('x', dtype='int64')  # n_steps * n_samples
    x_mask = tensor.matrix('x_mask', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int64')  # n_steps * n_samples
    y_mask = tensor.matrix('y_mask', dtype=config.floatX)
    at = tensor.matrix('at', dtype='int64')  # n_steps * n_samples
    at_mask = tensor.matrix('at_mask', dtype=config.floatX)
    use_noise = theano.shared(numpy_floatX(0.))
    anneal = tensor.scalar('anneal', dtype=config.floatX)
    # for generation
    hidi = tensor.matrix('hidi', dtype=config.floatX)
    celi = tensor.matrix('celi', dtype=config.floatX)
    hids = tensor.tensor3('hids', dtype=config.floatX)
    xi = tensor.vector('xi', dtype='int64')
    xi_mask = tensor.vector('xi_mask', dtype=config.floatX)
    n_steps = x.shape[0]
    n_samples = x.shape[1]
    preds, f_encode, f_decode, f_probi, argt_preds, f_argt_preds = ptr_network(tparams, p, p_mask, x, x_mask, xi, xi_mask, hidi, celi, hids, use_noise, options)
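For completeness, use_noise is the usual train/test switch: setting it selects between the two branches of tensor.switch in dropout_layer, roughly:

use_noise.set_value(1.)  # training: multiply by freshly sampled 0/1 masks
# ... run the training updates ...
use_noise.set_value(0.)  # validation/test: scale by the keep probability 0.5 instead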