我发现这是我需要的解决方案:
def compute_loss(images: tf.Tensor, texts: tf.Tensor, margin: float) -> tf.Tensor:
    """Return a bidirectional hinge (triplet) ranking loss for one batch.

    ``scores[i, j]`` is the dot product of row ``i`` of *images* with row
    ``j`` of *texts*. Diagonal entries are treated as the positive pairs and
    every off-diagonal entry as a negative; each negative is penalised when
    it comes within *margin* of the positive it is compared against.

    NOTE(review): presumably both inputs are ``(batch, dim)`` embeddings in
    which row ``i`` of each describes the same item -- confirm with caller.

    Args:
        images: Image embeddings, one per row.
        texts: Text embeddings, one per row. The diagonal extraction needs
            a square score matrix, so the row count must match *images*.
        margin: Gap by which a positive score must exceed a negative score
            before that negative stops contributing to the loss.

    Returns:
        Scalar tensor: the mean over images of the summed text-negative
        costs plus the mean over texts of the summed image-negative costs.
    """
    with tf.variable_scope(name_or_scope="loss"):
        scores = tf.matmul(images, texts, transpose_b=True)
        diagonal = tf.diag_part(scores)

        # Row-wise comparison:
        #   cost_s[i, j] = max(0, margin - scores[i, i] + scores[i, j])
        # i.e. image i's positive against every contrastive text j.
        # relu(x) == max(0, x) and takes its dtype from the tensor operand.
        cost_s = tf.nn.relu(margin - tf.reshape(diagonal, [-1, 1]) + scores)

        # Column-wise comparison (the diagonal broadcasts along rows):
        #   cost_im[i, j] = max(0, margin - scores[j, j] + scores[i, j])
        # i.e. text j's positive against every contrastive image i.
        cost_im = tf.nn.relu(margin - diagonal + scores)

        # Positive pairs must not be counted as their own negatives.
        # zeros_like keeps the dtype of `scores`; set_diag rejects a diagonal
        # whose dtype differs from its input (plain tf.zeros is float32).
        no_cost = tf.zeros_like(diagonal)
        cost_s = tf.linalg.set_diag(cost_s, no_cost)
        cost_im = tf.linalg.set_diag(cost_im, no_cost)

        # For each image (row), sum over its negative texts.
        cost_s = tf.reduce_sum(cost_s, axis=1)
        # For each text (column), sum over its negative images.
        cost_im = tf.reduce_sum(cost_im, axis=0)

        triplet_loss = tf.reduce_mean(cost_s) + tf.reduce_mean(cost_im)
    return triplet_loss