I have been working on implementing a simple encoder-decoder model from scratch in TensorFlow. My goal is to understand the internal workings of the model, which is why I am implementing it from scratch. I use a GRU (Gated Recurrent Unit) as the RNN cell.
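For reference, a single textbook GRU step looks roughly like the minimal sketch below (the names gru_step and params are illustrative and not part of my code); the training loop that follows applies the same gate equations, using my own convention for the final interpolation:

import tensorflow as tf

def gru_step(x_t, h_prev, params):
    """One GRU update; params holds the gate weights Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh."""
    z_t = tf.sigmoid(tf.matmul(x_t, params['Wz']) + tf.matmul(h_prev, params['Uz']) + params['bz'])      # update gate
    r_t = tf.sigmoid(tf.matmul(x_t, params['Wr']) + tf.matmul(h_prev, params['Ur']) + params['br'])      # reset gate
    h_hat = tf.tanh(tf.matmul(x_t, params['Wh']) + r_t * tf.matmul(h_prev, params['Uh']) + params['bh']) # candidate state
    return (1 - z_t) * h_prev + z_t * h_hat                                                              # new hidden state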
for epoch_idx in range(EPOCHS):
    avg_loss = 0.0
    for batch_idx in range(num_batches):
        hidden = tf.zeros([BATCH_SIZE, HIDDEN_SIZE])
        with tf.GradientTape() as tape:
            curr_enc_batch = get_batch(en_seqs, batch_idx, BATCH_SIZE, en_vocab, en_word_index[EOS_TOKEN])
            for t in range(SEQ_LEN):
                x_t_partial = tf.squeeze(curr_enc_batch[:, t, :])  # BATCH_SIZE, en_vocab_size
                x_t = tf.concat([hidden, x_t_partial], axis=1)  # BATCH_SIZE, (HIDDEN_SIZE + en_vocab_size)
                z_t = tf.sigmoid(tf.matmul(x_t, encoder['Wz']) + tf.matmul(hidden, encoder['Uz']) + encoder['bz'])  # BATCH_SIZE, HIDDEN_SIZE
                r_t = tf.sigmoid(tf.matmul(x_t, encoder['Wr']) + tf.matmul(hidden, encoder['Ur']) + encoder['br'])  # BATCH_SIZE, HIDDEN_SIZE
                h_hat_t = tf.tanh(tf.matmul(x_t, encoder['Wh']) + r_t * tf.matmul(hidden, encoder['Uh']) + encoder['bh'])  # BATCH_SIZE, HIDDEN_SIZE
                hidden = (1 - z_t) * hidden + z_t * h_hat_t  # BATCH_SIZE, HIDDEN_SIZE

            # Now "hidden" is the context vector
            curr_dec_batch = get_batch(fr_seqs, batch_idx, BATCH_SIZE, fr_vocab, fr_word_index[EOS_TOKEN])
            dec_x_t_partial = tf.zeros([BATCH_SIZE, fr_vocab_size])  # First input to the decoder is empty
            loss = 0.0
            for t in range(SEQ_LEN):
                dec_x_t = tf.concat([hidden, dec_x_t_partial], axis=1)
                dec_z_t = tf.sigmoid(tf.matmul(dec_x_t, decoder['Wz']) + tf.matmul(hidden, decoder['Uz']) + decoder['bz'])  # BATCH_SIZE, HIDDEN_SIZE
                dec_r_t = tf.sigmoid(tf.matmul(dec_x_t, decoder['Wr']) + tf.matmul(hidden, decoder['Ur']) + decoder['br'])  # BATCH_SIZE, HIDDEN_SIZE
                dec_h_hat_t = tf.tanh(tf.matmul(dec_x_t, decoder['Wh']) + dec_r_t * tf.matmul(hidden, decoder['Uh']) + decoder['bh'])  # BATCH_SIZE, HIDDEN_SIZE
                hidden = (1 - dec_z_t) * hidden + dec_z_t * dec_h_hat_t  # BATCH_SIZE, HIDDEN_SIZE
                logit = tf.tanh(tf.matmul(hidden, decoder['Wy']) + decoder['by'])  # BATCH_SIZE, fr_vocab_size
                y_hat_t = tf.nn.softmax(logit)  # BATCH_SIZE, fr_vocab_size
                y_t = tf.squeeze(curr_dec_batch[:, t, :])
                loss += tf.keras.losses.categorical_crossentropy(y_t, y_hat_t)
                dec_x_t_partial = y_t

        print(f"Epoch: {epoch_idx}, Batch: {batch_idx}, batch_loss: {tf.reduce_mean(loss):.4f}")
        gradients = tape.gradient(loss, [encoder['Wz'], encoder['Wr'], encoder['Wh'], encoder['Uz'], encoder['Ur'],
                                         encoder['Uh'], encoder['bz'], encoder['br'], encoder['bh'], decoder['Wz'],
                                         decoder['Wr'], decoder['Wh'], decoder['Uz'],
                                         decoder['Ur'], decoder['Uh'], decoder['bz'], decoder['br'], decoder['bh'],
                                         decoder['Wy'], decoder['by']])
        optimizer.apply_gradients(
            zip(gradients, [encoder['Wz'], encoder['Wr'], encoder['Wh'], encoder['Uz'], encoder['Ur'],
                            encoder['Uh'], encoder['bz'], encoder['br'], encoder['bh'], decoder['Wz'],
                            decoder['Wr'], decoder['Wh'], decoder['Uz'],
                            decoder['Ur'], decoder['Uh'], decoder['bz'], decoder['br'], decoder['bh'],
                            decoder['Wy'], decoder['by']]))
For some reason, the gradients for all of the encoder weight matrices come out as nan, while the gradients for the decoder weight matrices are fine. My guess is that backpropagation is not flowing back through the context vector hidden. How can I make sure that backpropagation also works correctly on the encoder side? Thanks!
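As a sanity check, here is a minimal sketch of how one could pinpoint which gradients go bad first; report_nan_gradients is a hypothetical helper (not part of my code), and it assumes the encoder/decoder dictionaries hold tf.Variable objects:

import tensorflow as tf

def report_nan_gradients(gradients, variables):
    """Print which gradients are missing (None) or contain nan/inf values."""
    for grad, var in zip(gradients, variables):
        name = getattr(var, 'name', str(var))
        if grad is None:
            print(f"{name}: no gradient (is this a tf.Variable watched by the tape?)")
        elif not bool(tf.reduce_all(tf.math.is_finite(grad))):
            print(f"{name}: gradient contains nan/inf")

# Usage, right after the tape.gradient(...) call in the loop above:
# report_nan_gradients(gradients, [encoder['Wz'], ..., decoder['by']])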