tensorflow - 与 tf.gradients() 相比，用于计算雅可比的 tf.GradientTape 性能糟糕透顶

Question

下面的解决方案：

设想：

我试图在一个循环中多次计算用户定义函数的雅可比矩阵。使用 TF 2 的 GradientTape 或旧的基于会话的 tf.gradients() 方法都可以做到这一点。问题是 GradientTape 比 tf.gradients() 慢得多（慢约 100 倍）。它具有我想使用的功能（batch_jacobian、hessian 支持等），但如果它慢 100 倍，我就无法使用它。

问题：

我不清楚我是否只是在误用 GradientTape，还是它必然更慢——因为每次调用时它都必须对所提供的函数重新求导（这是我的猜测）。我希望得到修正我的 GradientTape 用法的提示，或者确认它从根本上总是比 tf.gradients 慢几个数量级。

相关问题：

重复使用 GradientTape 进行多次雅可比计算- 相同的场景，未回答
`GradientTape` 是否每次计算导数时都需要重新求导？- 相同的场景，没有答案
使用具有全局上下文的 GradientTape - 松散相关，无法将该解决方案应用于我的场景

比较 GradientTape 和 tf.gradients() 的完整、自包含的最小示例：

import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit


class FunctionCaller(object):
    """Evaluate the batch Jacobian of ``func`` via a v1 session graph or eagerly via ``tf.GradientTape``.

    In session mode the whole graph (placeholder, function output and the
    stacked per-output gradients) is built once in ``__init__`` and is only
    evaluated by ``jac``. In eager mode nothing is prepared up front: every
    ``jac`` call records a brand-new tape.
    """

    def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
        """Store the configuration and, in session mode, build the Jacobian graph.

        Args:
            func: callable applied to a tensor with ``nX`` columns; its output
                is reshaped to ``[-1, nX]``.
            nX: number of input columns, also used as the number of outputs
                that are differentiated one at a time.
            dtype: dtype of the placeholder (session mode) or of the variable
                created on every call (eager mode).
            useSessions: when True, eager execution is disabled, a
                ``tf.compat.v1.Session`` is opened and the graph is built.
        """

        if useSessions:
            # NOTE(review): presumably a process-wide switch; confirm before
            # mixing eager and session callers in one process.
            disable_eager_execution()

        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None

        if not useSessions:
            return

        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])

        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])

        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)

        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)

        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        """Return the Jacobian evaluated at ``x_i``.

        Session mode feeds ``x_i`` into the placeholder and runs the pre-built
        graph; eager mode delegates to ``_useGradientTape``.
        """
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i)

    # THIS FUNCTION IS SUPER INEFFICIENT.
    def _useGradientTape(self, x_i):
        """Record ``func`` on a fresh tape and return its batch Jacobian as a NumPy array."""
        with tf.GradientTape(persistent=True) as g:
            xTensor = tf.Variable(x_i, dtype=self.dtype)  # is this my problem??? i recreate x every time?
            y = tf.reshape(self.func(xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, xTensor)
        # del g
        return jac_x_at_i.numpy()

    def __del__(self):
        # __del__ also runs on an instance whose __init__ raised before
        # ``self.sess`` was assigned, so do not assume the attribute exists.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()


def main():
    """Time the eager (GradientTape) and session (tf.gradients) Jacobians and compare them.

    The eager caller is built and timed first, because constructing the
    session caller calls ``disable_eager_execution()``. The session warm-up
    call is made before the timer starts, so the reported sec/trial covers
    exactly ``nTrials`` calls.
    """
    @tf.function
    def Xdot(x_i):
        # only the third column feeds the outputs; the first two are ignored
        _, _, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    nT = 20
    nX = 3

    # create some trash data
    x_i = np.arange(nT*nX).reshape([-1, nX])

    nTrials = 100

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
    caller_sessions.jac(x_i)  # warm-up call, deliberately kept out of the timed region
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    # largest element-wise disagreement between the two Jacobians
    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))

# run the benchmark only when this file is executed as a script
if __name__ == "__main__":
    main()

编辑 - 解决方案：

xdurch0 在下面指出，我应该将 _useGradientTape() 包装在 @tf.function 中——此前由于其他原因，我这样做没有成功。这样做之后，我必须将 xTensor 的定义移到 @tf.function 包装的函数之外：把它设为成员变量，并使用其 assign() 方法（即代码中的 self.xTensor.assign()）来更新它的值。

完成所有这些后，我发现 GradientTape（对于这个简单的示例）现在与 tf.gradients 处于同一数量级。当运行足够多的试验（约 1E5 次）时，它的速度是 tf.gradients 的两倍。太棒了！

import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit


class FunctionCaller(object):
    """Evaluate the batch Jacobian of ``func`` via a v1 session graph or a ``tf.function``-wrapped ``tf.GradientTape``.

    In session mode the graph is built once in ``__init__`` and evaluated by
    ``jac``. In eager mode a single ``tf.Variable`` is created in ``__init__``
    and re-filled with ``assign`` on every call, so that the traced
    ``_useGradientTape`` never creates a variable itself.
    """

    def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
        """Store the configuration and prepare the mode-specific state.

        Args:
            func: callable applied to a tensor with ``nX`` columns; its output
                is reshaped to ``[-1, nX]``.
            nT: number of rows of the variable pre-allocated in eager mode;
                not used in session mode.
            nX: number of input columns, also used as the number of outputs
                that are differentiated one at a time.
            dtype: dtype of the placeholder (session mode) or of the
                pre-allocated variable (eager mode).
            useSessions: when True, eager execution is disabled, a
                ``tf.compat.v1.Session`` is opened and the graph is built.
        """

        if useSessions:
            # NOTE(review): presumably a process-wide switch; confirm before
            # mixing eager and session callers in one process.
            disable_eager_execution()

        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None

        if not useSessions:
            #  you should be able to create without an initial value, but tf is demanding one
            #  despite what the docs say. bug?
            #  tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
            # NOTE(review): later assign() calls presumably need x_i to match
            # this [nT, nX] shape -- confirm.
            self.xTensor = tf.Variable([[0]*nX]*nT, dtype=self.dtype)  # x needs to be properly sized once
            return

        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])

        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])

        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)

        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)

        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        """Return the Jacobian evaluated at ``x_i`` as a NumPy array.

        Session mode feeds ``x_i`` into the placeholder and runs the pre-built
        graph; eager mode calls the traced ``_useGradientTape`` and converts
        its result with ``.numpy()``.
        """
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i).numpy()

    @tf.function  # THIS IS CRUCIAL
    def _useGradientTape(self, x_i):
        """Assign ``x_i`` to the pre-allocated variable and return the batch Jacobian tensor."""
        with tf.GradientTape(persistent=True) as g:
            self.xTensor.assign(x_i)  # you need to create the variable once outside the graph
            y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, self.xTensor)
        # del g
        return jac_x_at_i

    def __del__(self):
        # __del__ also runs on an instance whose __init__ raised before
        # ``self.sess`` was assigned, so do not assume the attribute exists.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()


def main():
    """Time the eager and session Jacobian callers on random data and print their disagreement.

    The eager caller is created and timed before the session caller is
    constructed, because constructing the latter disables eager execution.
    Nothing is warmed up: the first call of each caller is part of the timing.
    """
    nT, nX = 20, 3
    nTrials = 1000  # i find that nTrials<=1E3, eager is slower, it's faster for >=1E4, it's TWICE as fast for >=1E5

    @tf.function
    def Xdot(x_i):
        # only the last of the three columns is used by the outputs
        x_2 = tf.split(x_i, 3, axis=1)[2]
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    # create some trash data
    x_i = np.random.random([nT, nX])

    def run_trials(caller, report):
        # evaluate the Jacobian nTrials times, print the timing through the
        # given format string and hand back the last result
        began = timeit.default_timer()
        for _ in range(nTrials):
            latest = caller.jac(x_i)
        elapsed = timeit.default_timer() - began
        print(report.format(elapsed, elapsed/nTrials))
        return latest

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
    jac_eager = run_trials(caller_eager, "eager code took {} sec: {} sec/trial")

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
    jac_session = run_trials(caller_sessions, "session code took {} sec: {} sec/trial")

    # largest element-wise disagreement between the two Jacobians
    difference = np.abs(jac_eager - jac_session)
    residual = np.max(difference)
    print('residual between eager and session trials is {}'.format(residual))

# run the benchmark only when this file is executed as a script
if __name__ == "__main__":
    main()

tensorflow - 与 tf.gradients() 相比，用于计算雅可比的 tf.GradientTape 性能糟糕透顶

编辑 - 解决方案：

0 回答 0

Related

Reference