
I have been working on implementing a simple encoder-decoder model from scratch in TensorFlow. My goal is to understand the model's inner workings, which is why I'm implementing it from scratch. I'm using a GRU (Gated Recurrent Unit) as the RNN cell.
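For reference, these are the GRU update equations I am trying to implement, written out to match the variable names in the code below (Wz/Uz/bz for the update gate, Wr/Ur/br for the reset gate, Wh/Uh/bh for the candidate state):

\begin{aligned}
z_t &= \sigma(W_z x_t + U_z h_{t-1} + b_z) \\
r_t &= \sigma(W_r x_t + U_r h_{t-1} + b_r) \\
\tilde{h}_t &= \tanh(W_h x_t + r_t \odot (U_h h_{t-1}) + b_h) \\
h_t &= (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t
\end{aligned}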

for epoch_idx in range(EPOCHS):
    avg_loss = 0.0
    for batch_idx in range(num_batches):
        hidden = tf.zeros([BATCH_SIZE, HIDDEN_SIZE])
        with tf.GradientTape() as tape:
            curr_enc_batch = get_batch(en_seqs, batch_idx, BATCH_SIZE, en_vocab, en_word_index[EOS_TOKEN])
            for t in range(SEQ_LEN):
                x_t_partial = tf.squeeze(curr_enc_batch[:, t, :])  # BATCH_SIZE, en_vocab_size
                x_t = tf.concat([hidden, x_t_partial], axis=1)  # BATCH_SIZE, (HIDDEN_SIZE + en_vocab_size)
                z_t = tf.sigmoid(tf.matmul(x_t, encoder['Wz']) + tf.matmul(hidden, encoder['Uz']) + encoder['bz'])  # BATCH_SIZE, HIDDEN_SIZE
                r_t = tf.sigmoid(tf.matmul(x_t, encoder['Wr']) + tf.matmul(hidden, encoder['Ur']) + encoder['br'])  # BATCH_SIZE, HIDDEN_SIZE
                h_hat_t = tf.tanh(tf.matmul(x_t, encoder['Wh']) + r_t * tf.matmul(hidden, encoder['Uh']) + encoder['bh'])  # BATCH_SIZE, HIDDEN_SIZE
                hidden = (1 - z_t) * hidden + z_t * h_hat_t  # BATCH_SIZE, HIDDEN_SIZE

            # Now "hidden" is the context vector
            curr_dec_batch = get_batch(fr_seqs, batch_idx, BATCH_SIZE, fr_vocab, fr_word_index[EOS_TOKEN])
            dec_x_t_partial = tf.zeros([BATCH_SIZE, fr_vocab_size])  # First input to the decoder is empty
            loss = 0.0
            for t in range(SEQ_LEN):
                dec_x_t = tf.concat([hidden, dec_x_t_partial], axis=1)
                dec_z_t = tf.sigmoid(tf.matmul(dec_x_t, decoder['Wz']) + tf.matmul(hidden, decoder['Uz']) + decoder['bz'])  # BATCH_SIZE, HIDDEN_SIZE
                dec_r_t = tf.sigmoid(tf.matmul(dec_x_t, decoder['Wr']) + tf.matmul(hidden, decoder['Ur']) + decoder['br'])  # BATCH_SIZE, HIDDEN_SIZE
                dec_h_hat_t = tf.tanh(tf.matmul(dec_x_t, decoder['Wh']) + dec_r_t * tf.matmul(hidden, decoder['Uh']) + decoder['bh'])  # BATCH_SIZE, HIDDEN_SIZE
                hidden = (1 - dec_z_t) * hidden + dec_z_t * dec_h_hat_t  # BATCH_SIZE, HIDDEN_SIZE
                logit = tf.tanh(tf.matmul(hidden, decoder['Wy']) + decoder['by'])  # BATCH_SIZE, fr_vocab_size
                y_hat_t = tf.nn.softmax(logit)  # BATCH_SIZE, fr_vocab_size
                y_t = tf.squeeze(curr_dec_batch[:, t, :])
                loss += tf.keras.losses.categorical_crossentropy(y_t, y_hat_t)
                dec_x_t_partial = y_t

        print(f"Epoch: {epoch_idx}, Batch: {batch_idx}, batch_loss: {tf.reduce_mean(loss):.4f}")
        gradients = tape.gradient(loss, [encoder['Wz'], encoder['Wr'], encoder['Wh'], encoder['Uz'], encoder['Ur'],
                                         encoder['Uh'], encoder['bz'], encoder['br'], encoder['bh'], decoder['Wz'],
                                         decoder['Wr'], decoder['Wh'], decoder['Uz'],
                                         decoder['Ur'], decoder['Uh'], decoder['bz'], decoder['br'], decoder['bh'],
                                         decoder['Wy'], decoder['by']])
        optimizer.apply_gradients(
            zip(gradients, [encoder['Wz'], encoder['Wr'], encoder['Wh'], encoder['Uz'], encoder['Ur'],
                            encoder['Uh'], encoder['bz'], encoder['br'], encoder['bh'], decoder['Wz'],
                            decoder['Wr'], decoder['Wh'], decoder['Uz'],
                            decoder['Ur'], decoder['Uh'], decoder['bz'], decoder['br'], decoder['bh'],
                            decoder['Wy'], decoder['by']]))

For some reason, the gradients of all the encoder weight matrices come back as nan, but the gradients for the decoder weight matrices are fine. My guess is that backpropagation is not flowing through the context vector hidden. How can I make sure backpropagation also works correctly on the encoder side? Thanks!
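For what it is worth, this is roughly how I check which gradients come back as nan, right after the tape.gradient() call above. It is a minimal sketch that reuses the gradients list from the training loop; the names list is just an illustrative set of labels mirroring the order of the variable list, not something defined elsewhere in my code.

# Check each gradient for nan values (or for being None, i.e. disconnected from the loss).
names = ['enc_Wz', 'enc_Wr', 'enc_Wh', 'enc_Uz', 'enc_Ur', 'enc_Uh',
         'enc_bz', 'enc_br', 'enc_bh',
         'dec_Wz', 'dec_Wr', 'dec_Wh', 'dec_Uz', 'dec_Ur', 'dec_Uh',
         'dec_bz', 'dec_br', 'dec_bh', 'dec_Wy', 'dec_by']
for name, grad in zip(names, gradients):
    if grad is None:
        print(name, "-> gradient is None (not connected to the loss)")
    else:
        print(name, "-> any nan:", bool(tf.reduce_any(tf.math.is_nan(grad)).numpy()))

With this, every enc_* entry reports nan while every dec_* entry looks fine.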
