python-3.x - Tensorflow gradientTape 在计算相同的梯度两次时给出不同的结果

Question

我正在尝试使用 TF 2.0。我想记录我的神经网络的梯度和权重范数。为此，我使用以下代码。

def get_weights_norm(layer, optim_iters, log=False):
    """
    Record the norm of every trainable weight of ``layer`` as a tf.summary scalar.

    The norm is computed with ``tf.norm`` using its default settings, and the
    summary tag is built from the layer name and the weight's shape.

    Args:
        layer: object exposing ``trainable_weights`` and ``name``.
        optim_iters: value forwarded as ``step`` to ``tf.summary.scalar``.
        log: when true, also print the layer name and each weight norm.
    """
    weights = layer.trainable_weights
    layer_name = layer.name
    if log:
        print("Layer " + layer_name)

    # The tag prefix is the same for every weight of the layer; only the
    # shape suffix changes.
    tag_prefix = layer_name + "_layer_norm/ shape-"
    for weight in weights:
        shape_text = str(weight.shape.as_list())
        weight_norm = tf.norm(weight.numpy(), name="norm").numpy()
        tf.summary.scalar(tag_prefix + shape_text, weight_norm, step=optim_iters)
        if not log:
            continue
        print("\tWeights norm: %s shape: %s" % (weight_norm, shape_text))

def get_grad_norm(g_tape, loss_value, layer, optim_iters, log=False):
    """
    Record the norm of the gradient of the loss with respect to each trainable
    weight of ``layer`` as a tf.summary scalar.

    The gradients are requested from ``g_tape`` inside this function, so the
    tape is queried again on every call. If the caller has already called
    ``gradient`` on the same tape, the tape must support repeated calls
    (presumably ``tf.GradientTape(persistent=True)`` -- confirm at the call site).

    Entries for which the tape returns ``None`` (no gradient available for
    that weight) are skipped instead of raising ``AttributeError``; when
    ``log`` is true a line is printed for each skipped entry.

    Args:
        g_tape: object whose ``gradient(loss_value, weights)`` returns one
            gradient (or ``None``) per weight.
        loss_value: target passed to ``g_tape.gradient``.
        layer: object exposing ``trainable_weights`` and ``name``.
        optim_iters: value forwarded as ``step`` to ``tf.summary.scalar``.
        log: when true, also print the layer name and each gradient norm.
    """
    grad = g_tape.gradient(loss_value, layer.trainable_weights)
    name = layer.name
    if log:
        print("Layer " + name)
    for g in grad:
        if g is None:
            # The tape yields None for weights it has no gradient for;
            # there is nothing to measure, so report and move on.
            if log:
                print("\tGrad norm: None (no gradient for this weight)")
            continue
        shape = str(g.shape.as_list())
        norm = tf.norm(g.numpy(), name="norm").numpy()
        s_name = name + "_layer_grad_norm/ shape-" + shape
        tf.summary.scalar(s_name, norm, step=optim_iters)
        if log:
            print("\tGrad norm: %s shape: %s" % (norm, shape))
            # Same value again in scientific notation, to make very small
            # norms readable.
            print("{:.2E}".format(norm))

这是训练循环：

# Training loop: one optimizer step per batch, followed by per-epoch
# diagnostics that print weight and gradient norms of the "output" and
# "bi_lstm" layers, first inline and then again through the
# get_weights_norm / get_grad_norm helpers.
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Open a GradientTape to record the operations run
        # during the forward pass, which enables autodifferentiation.
        # persistent=True because gradient() is called on this tape several
        # times below (here and in the per-epoch diagnostics).
        with tf.GradientTape(persistent=True) as tape:

            # Run the forward pass of the layer.
            logits = model(x_batch_train)  # Logits for this minibatch
            # Compute the loss value for this minibatch.
            loss_value = loss_fn(y_batch_train, logits)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

    # Everything below is at epoch level (outside the batch loop): `tape` and
    # `loss_value` are the ones left over from the last batch, and the
    # optimizer has already applied that batch's update to the weights.
    # NOTE(review): if train_dataset yields no batches in the first epoch,
    # `tape` and `loss_value` are undefined at this point.
    g_bidi = tape.gradient(loss_value, model.get_layer("bi_lstm").trainable_weights)
    g_out = tape.gradient(loss_value, model.get_layer("output").trainable_weights)
    # g_dense is computed but not used in the visible part of the script.
    g_dense = tape.gradient(loss_value, model.get_layer("dense").trainable_weights)

    # --- "output" layer: weight norms, then gradient norms from g_out ---
    print("Out Layer")
    w_out = model.get_layer("output").trainable_weights
    print(model.get_layer("output").name)
    print(float(tf.norm(w_out[0].numpy(), name="norm")))
    print("\tWeights norm: %s shape: %s" % (tf.norm(w_out[0].numpy(), name="norm"), w_out[0].shape))
    print("\tWeights norm: %s shape: %s" % (tf.norm(w_out[1].numpy(), name="norm"), w_out[1].shape))
    print()
    # Going by the print labels, index 0 is the kernel and index 1 the bias
    # of the output layer -- confirm against the model definition.
    print("\t ||dE/dw_out|| = %s shape: %s" % (tf.norm(g_out[0].numpy(), name='norm'), g_out[0].shape))
    print("\t ||dE/db_out|| = %s shape: %s" % (tf.norm(g_out[1].numpy(), name='norm'), g_out[1].shape))
    # The helpers log the same quantities as tf.summary scalars;
    # get_grad_norm calls tape.gradient again for this layer rather than
    # reusing g_out.
    get_weights_norm(model.get_layer("output"), optimizer.iterations, True)
    get_grad_norm(tape, loss_value, model.get_layer("output"), optimizer.iterations, True)
    print()
    print()
    # --- "bi_lstm" layer: six weights / six gradients ---
    # The index-to-role mapping (forward kernel, recurrent kernel, bias, then
    # the same three for the backward direction) is taken from the print
    # labels -- confirm against the layer definition.
    print("Bidirect")
    w_bid = model.get_layer("bi_lstm").trainable_weights
    print("\tWeights fwd norm: %s shape %s:" % (tf.norm(w_bid[0].numpy(), name="norm"), w_bid[0].shape))
    print("\tWeights fwd_rec norm: %s shape %s:" % (tf.norm(w_bid[1].numpy(), name="norm"), w_bid[1].shape))
    print("\tWeights fwd bias norm: %s shape %s:" % (tf.norm(w_bid[2].numpy(), name="norm"), w_bid[2].shape))
    print("\tWeights bwd norm: %s shape %s:" % (tf.norm(w_bid[3].numpy(), name="norm"), w_bid[3].shape))
    print("\tWeights bwd_rec norm: %s shape %s:" % (tf.norm(w_bid[4].numpy(), name="norm"), w_bid[4].shape))
    print("\tWeights bwd bias norm: %s shape %s:" % (tf.norm(w_bid[5].numpy(), name="norm"), w_bid[5].shape))
    print()
    print("\t ||dE/dw_forw|| = %s shape: %s" % (tf.norm(g_bidi[0].numpy(), name='norm'), g_bidi[0].shape))
    print("\t ||dE/dw_forw_rec|| = %s shape: %s" % (tf.norm(g_bidi[1].numpy(), name='norm'), g_bidi[1].shape))
    print("\t ||dE/dw_forw_bias|| = %s shape: %s" % (tf.norm(g_bidi[2].numpy(), name='norm'), g_bidi[2].shape))
    print("\t ||dE/dw_bckw|| = %s shape: %s" % (tf.norm(g_bidi[3].numpy(), name='norm'), g_bidi[3].shape))
    print("\t ||dE/dw_bkw_rec|| = %s shape: %s" % (tf.norm(g_bidi[4].numpy(), name='norm'), g_bidi[4].shape))
    print("\t ||dE/dw_bkw_bias|| = %s shape: %s" % (tf.norm(g_bidi[5].numpy(), name='norm'), g_bidi[5].shape))
    # Second computation of the bi_lstm gradients: get_grad_norm asks the
    # same tape for them again instead of reusing g_bidi, so its printed
    # norms can be compared with the ones printed just above.
    get_weights_norm(model.get_layer("bi_lstm"), optimizer.iterations, True)
    get_grad_norm(tape, loss_value, model.get_layer("bi_lstm"), optimizer.iterations, True)

问题：运行脚本时，对于输出层，两次计算得到的梯度范数相同；但对于双向层（bi_lstm），两次计算得到的梯度范数却不同。

这是输出：

    Weights norm: tf.Tensor(0.33847392, shape=(), dtype=float32) shape: (64, 1)
    Weights norm: tf.Tensor(88.14, shape=(), dtype=float32) shape: (1,)

     ||dE/dw_out|| = tf.Tensor(1.7349662, shape=(), dtype=float32) shape: (64, 1)
     ||dE/db_out|| = tf.Tensor(0.31759995, shape=(), dtype=float32) shape: (1,)
Layer output
    Weights norm: 0.33847392 shape: [64, 1]
    Weights norm: 88.14 shape: [1]


Bidirect
    Weights fwd norm: tf.Tensor(13.112313, shape=(), dtype=float32) shape (256, 128):
    Weights fwd_rec norm: tf.Tensor(5.691354, shape=(), dtype=float32) shape (32, 128):
    Weights fwd bias norm: tf.Tensor(11.340048, shape=(), dtype=float32) shape (128,):
    Weights bwd norm: tf.Tensor(13.147353, shape=(), dtype=float32) shape (256, 128):
    Weights bwd_rec norm: tf.Tensor(5.685838, shape=(), dtype=float32) shape (32, 128):
    Weights bwd bias norm: tf.Tensor(11.3102255, shape=(), dtype=float32) shape (128,):

     ||dE/dw_forw|| = tf.Tensor(9.418793e-07, shape=(), dtype=float32) shape: (256, 128)
     ||dE/dw_forw_rec|| = tf.Tensor(3.8971484e-06, shape=(), dtype=float32) shape: (32, 128)
     ||dE/dw_forw_bias|| = tf.Tensor(1.0172046e-06, shape=(), dtype=float32) shape: (128,)
     ||dE/dw_bckw|| = tf.Tensor(9.837944e-07, shape=(), dtype=float32) shape: (256, 128)
     ||dE/dw_bkw_rec|| = tf.Tensor(4.134917e-06, shape=(), dtype=float32) shape: (32, 128)
     ||dE/dw_bkw_bias|| = tf.Tensor(1.0577168e-06, shape=(), dtype=float32) shape: (128,)
Layer bi_lstm
    Weights norm: 13.112313 shape: [256, 128]
    Weights norm: 5.691354 shape: [32, 128]
    Weights norm: 11.340048 shape: [128]
    Weights norm: 13.147353 shape: [256, 128]
    Weights norm: 5.685838 shape: [32, 128]
    Weights norm: 11.3102255 shape: [128]
Layer bi_lstm
    Grad norm: 0.0 shape: [256, 128]
0.00E+00
    Grad norm: 0.0 shape: [32, 128]
0.00E+00
    Grad norm: 0.0 shape: [128]
0.00E+00
    Grad norm: 0.0 shape: [256, 128]
0.00E+00
    Grad norm: 0.0 shape: [32, 128]
0.00E+00
    Grad norm: 0.0 shape: [128]
0.00E+00

我在这里遗漏了什么？

提前致谢

python-3.x - Tensorflow gradientTape 在计算相同的梯度两次时给出不同的结果

0 回答 0

Related

Reference