我正在尝试使用 TF 2.0。我想记录我的神经网络的梯度和权重范数。为此,我使用以下代码。
def get_weights_norm(layer, optim_iters, log=False):
    """Record the norm of every trainable weight of ``layer`` as a scalar summary.

    For each entry of ``layer.trainable_weights`` the value of ``tf.norm``
    (with its default ``ord``) is written through ``tf.summary.scalar`` under
    the tag ``"<layer name>_layer_norm/ shape-<shape>"``.

    Args:
        layer: Object exposing ``name`` and ``trainable_weights``
            (e.g. a Keras layer).
        optim_iters: Value forwarded as ``step`` to ``tf.summary.scalar``.
        log: When true, also print the layer name and each norm to stdout.
    """
    layer_name = layer.name
    if log:
        print("Layer " + layer_name)
    for weight in layer.trainable_weights:
        shape = str(weight.shape.as_list())
        # tf.norm accepts the variable directly; converting it to a NumPy
        # array first only adds a device -> host -> device round trip.
        norm = tf.norm(weight, name="norm").numpy()
        # NOTE(review): the tag is built from the layer name and the shape
        # only, so two weights of the same shape in one layer (e.g. the
        # forward and backward kernels of a bidirectional layer) share a tag.
        tag = layer_name + "_layer_norm/ shape-" + shape
        tf.summary.scalar(tag, norm, step=optim_iters)
        if log:
            print("\tWeights norm: %s shape: %s" % (norm, shape))
def get_grad_norm(g_tape, loss_value, layer, optim_iters, log=False):
    """Record the norm of the loss gradient w.r.t. each trainable weight of ``layer``.

    The gradients are obtained with
    ``g_tape.gradient(loss_value, layer.trainable_weights)`` and the value of
    ``tf.norm`` for each of them is written through ``tf.summary.scalar``
    under the tag ``"<layer name>_layer_grad_norm/ shape-<shape>"``.

    Args:
        g_tape: Object exposing ``gradient(target, sources)``
            (e.g. a ``tf.GradientTape``). A non-persistent ``tf.GradientTape``
            supports only one ``gradient()`` call, so a persistent tape is
            needed when the same tape is also queried elsewhere.
        loss_value: Target that is differentiated.
        layer: Object exposing ``name`` and ``trainable_weights``
            (e.g. a Keras layer).
        optim_iters: Value forwarded as ``step`` to ``tf.summary.scalar``.
        log: When true, also print the layer name and each norm to stdout.
    """
    grads = g_tape.gradient(loss_value, layer.trainable_weights)
    layer_name = layer.name
    if log:
        print("Layer " + layer_name)
    for grad in grads:
        if grad is None:
            # The tape yields None for a source that is not connected to the
            # target; there is no norm to record for it.
            if log:
                print("\tGrad norm: None (no gradient for this weight)")
            continue
        # Densifies sparse (IndexedSlices) gradients; a no-op for dense tensors.
        grad = tf.convert_to_tensor(grad)
        shape = str(grad.shape.as_list())
        norm = tf.norm(grad, name="norm").numpy()
        # NOTE(review): the tag is built from the layer name and the shape
        # only, so gradients of equally shaped weights share a tag.
        tag = layer_name + "_layer_grad_norm/ shape-" + shape
        tf.summary.scalar(tag, norm, step=optim_iters)
        if log:
            print("\tGrad norm: %s shape: %s" % (norm, shape))
            print("{:.2E}".format(norm))
这是训练循环:
# NOTE(review): the indentation of this loop was lost in the source; the
# nesting below (all diagnostics run once per batch) is reconstructed - confirm.
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))
    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Open a GradientTape to record the operations run during the forward
        # pass, which enables autodifferentiation. The tape is persistent
        # because gradient() is called on it several times below.
        with tf.GradientTape(persistent=True) as tape:
            # Run the forward pass of the layer.
            logits = model(x_batch_train)  # Logits for this minibatch
            # Compute the loss value for this minibatch.
            loss_value = loss_fn(y_batch_train, logits)

        # Use the gradient tape to automatically retrieve the gradients of
        # the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Look each layer up once instead of on every use.
        out_layer = model.get_layer("output")
        bidi_layer = model.get_layer("bi_lstm")

        g_bidi = tape.gradient(loss_value, bidi_layer.trainable_weights)
        g_out = tape.gradient(loss_value, out_layer.trainable_weights)
        # Not used in the visible code; kept because it is a script-level name.
        g_dense = tape.gradient(loss_value, model.get_layer("dense").trainable_weights)

        print("Out Layer")
        w_out = out_layer.trainable_weights
        print(out_layer.name)
        print(float(tf.norm(w_out[0], name="norm")))
        for weight in w_out[:2]:
            print("\tWeights norm: %s shape: %s" % (tf.norm(weight, name="norm"), weight.shape))
        print()
        for label, grad in zip(("dE/dw_out", "dE/db_out"), g_out):
            print("\t ||%s|| = %s shape: %s" % (label, tf.norm(grad, name='norm'), grad.shape))
        get_weights_norm(out_layer, optimizer.iterations, True)
        get_grad_norm(tape, loss_value, out_layer, optimizer.iterations, True)
        print()
        print()

        print("Bidirect")
        w_bid = bidi_layer.trainable_weights
        weight_labels = ("fwd", "fwd_rec", "fwd bias", "bwd", "bwd_rec", "bwd bias")
        for label, weight in zip(weight_labels, w_bid):
            print("\tWeights %s norm: %s shape %s:" % (label, tf.norm(weight, name="norm"), weight.shape))
        print()
        grad_labels = (
            "dE/dw_forw",
            "dE/dw_forw_rec",
            "dE/dw_forw_bias",
            "dE/dw_bckw",
            "dE/dw_bkw_rec",
            "dE/dw_bkw_bias",
        )
        for label, grad in zip(grad_labels, g_bidi):
            print("\t ||%s|| = %s shape: %s" % (label, tf.norm(grad, name='norm'), grad.shape))
        get_weights_norm(bidi_layer, optimizer.iterations, True)
        get_grad_norm(tape, loss_value, bidi_layer, optimizer.iterations, True)

        # Drop the reference to the persistent tape so the resources it holds
        # can be released before the next forward pass.
        del tape
问题:运行脚本时,对于输出层,在训练循环中直接计算的梯度范数与 get_grad_norm 得到的值相同;但对于双向层(bi_lstm),两者得到的值不同(循环中约为 1e-6,而 get_grad_norm 中为 0.0)。
这是输出:
Weights norm: tf.Tensor(0.33847392, shape=(), dtype=float32) shape: (64, 1)
Weights norm: tf.Tensor(88.14, shape=(), dtype=float32) shape: (1,)
||dE/dw_out|| = tf.Tensor(1.7349662, shape=(), dtype=float32) shape: (64, 1)
||dE/db_out|| = tf.Tensor(0.31759995, shape=(), dtype=float32) shape: (1,)
Layer output
Weights norm: 0.33847392 shape: [64, 1]
Weights norm: 88.14 shape: [1]
Bidirect
Weights fwd norm: tf.Tensor(13.112313, shape=(), dtype=float32) shape (256, 128):
Weights fwd_rec norm: tf.Tensor(5.691354, shape=(), dtype=float32) shape (32, 128):
Weights fwd bias norm: tf.Tensor(11.340048, shape=(), dtype=float32) shape (128,):
Weights bwd norm: tf.Tensor(13.147353, shape=(), dtype=float32) shape (256, 128):
Weights bwd_rec norm: tf.Tensor(5.685838, shape=(), dtype=float32) shape (32, 128):
Weights bwd bias norm: tf.Tensor(11.3102255, shape=(), dtype=float32) shape (128,):
||dE/dw_forw|| = tf.Tensor(9.418793e-07, shape=(), dtype=float32) shape: (256, 128)
||dE/dw_forw_rec|| = tf.Tensor(3.8971484e-06, shape=(), dtype=float32) shape: (32, 128)
||dE/dw_forw_bias|| = tf.Tensor(1.0172046e-06, shape=(), dtype=float32) shape: (128,)
||dE/dw_bckw|| = tf.Tensor(9.837944e-07, shape=(), dtype=float32) shape: (256, 128)
||dE/dw_bkw_rec|| = tf.Tensor(4.134917e-06, shape=(), dtype=float32) shape: (32, 128)
||dE/dw_bkw_bias|| = tf.Tensor(1.0577168e-06, shape=(), dtype=float32) shape: (128,)
Layer bi_lstm
Weights norm: 13.112313 shape: [256, 128]
Weights norm: 5.691354 shape: [32, 128]
Weights norm: 11.340048 shape: [128]
Weights norm: 13.147353 shape: [256, 128]
Weights norm: 5.685838 shape: [32, 128]
Weights norm: 11.3102255 shape: [128]
Layer bi_lstm
Grad norm: 0.0 shape: [256, 128]
0.00E+00
Grad norm: 0.0 shape: [32, 128]
0.00E+00
Grad norm: 0.0 shape: [128]
0.00E+00
Grad norm: 0.0 shape: [256, 128]
0.00E+00
Grad norm: 0.0 shape: [32, 128]
0.00E+00
Grad norm: 0.0 shape: [128]
0.00E+00
我在这里遗漏了什么?
提前致谢