python - 在 DNN 训练结束时返回 Hessian 逆矩阵和输入的偏导数

Question

使用 Keras 和 Tensorflow 作为后端，我构建了一个 DNN，它以恒星光谱作为输入（7213 个数据点）并输出三个恒星参数（温度、重力和金属度）。网络在我的测试集上训练得很好并且预测得很好，但是为了让结果在科学上有用，我需要能够估计我的错误。这样做的第一步是获得逆 Hessian 矩阵，这似乎仅使用 Keras 是不可能的。因此，我尝试使用 scipy 创建一个解决方法，使用 scipy.optimize.minimize 和 BFGS、L-BFGS-B 或 Netwon-CG 作为方法。其中任何一个都将返回 Hessian 逆矩阵。

这个想法是使用 Adam 优化器训练模型 100 个时期（或直到模型收敛），然后运行 BFGS 的一次迭代或函数（或其他函数之一）以返回我的模型的 Hessian 矩阵。

这是我的代码：

from scipy.optimize import minimize

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam


# Define vars
activation = 'relu'
init = 'he_normal'
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-08

input_shape = (None,n)
n_hidden = [2048,1024,512,256,128,32]
output_dim = 3

epochs = 100
lr = 0.0008
batch_size = 64
decay = 0.00

# Design DNN Layers

model = Sequential([

    Dense(n_hidden[0], batch_input_shape=input_shape, init=init, activation=activation),

    Dense(n_hidden[1], init=init, activation=activation), 

    Dense(n_hidden[2], init=init, activation=activation),

    Dense(n_hidden[3], init=init, activation=activation),

    Dense(n_hidden[4], init=init, activation=activation),

    Dense(n_hidden[5], init=init, activation=activation),

    Dense(output_dim, init=init, activation='linear'),
])


# Optimization function
optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, decay=decay)


# Compile and train network
model.compile(optimizer=optimizer, loss='mean_squared_error')

#train_X.shape = (50000,7213)
#train_Y.shape = (50000,3)
#cv_X.shape = (10000,7213)
#cv_Y.shape = (10000,3)

history = model.fit(train_X, train_Y, validation_data=(cv_X, cv_Y),
             nb_epoch=epochs, batch_size=batch_size, verbose=2)


weights = []
for layer in model.layers:
    weights.append(layer.get_weights())

def loss(W):
    weightsList = W
    weightsList = np.array(W)
    new_weights = []
    for i, layer in enumerate((weightsList)):
        new_weights.append(np.array(weightsList[i]))
    model.set_weights(np.array(new_weights))
    preds = model.predict(train_X)
    mse = np.sum(np.square(np.subtract(preds,train_Y)))/len(train_X[:,0])
    print(mse)
    return mse


x0=weights    
res = minimize(loss, x0, args=(), method = 'BFGS', options={'maxiter':1,'eps':1e-6,'disp':True})
#res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})
#res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})
inv_hess = res['hess_inv']

1）模型训练得非常好，但是当尝试使用先前训练的权重运行 scipy 最小化器进行单次迭代时，我遇到了问题。

尝试方法时输出=BFGS：

0.458706819754
0.457811632697
0.458706716791
...
0.350124572422
0.350186770445
0.350125320636

ValueErrorTraceback (most recent call last)
---> 19 res = minimize(loss, x0, args=(), method = 'BFGS', tol=1, options={'maxiter':1,'eps':1e-6,'disp':True})#,'gtol':0.1}, tol=5)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    442         return _minimize_cg(fun, x0, args, jac, callback, **options)
    443     elif meth == 'bfgs':
--> 444         return _minimize_bfgs(fun, x0, args, jac, callback, **options)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
    963         try:  # this was handled in numeric, let it remaines for more safety
--> 964             rhok = 1.0 / (numpy.dot(yk, sk))
    965         except ZeroDivisionError:
    966             rhok = 1000.0

ValueError: operands could not be broadcast together with shapes (7213,2048) (2048,1024)

尝试方法时的输出=L-BFGS-B：

ValueErrorTraceback (most recent call last)

---> 20 res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 450                                 callback=callback, **options)


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/lbfgsb.pyc in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    300         raise ValueError('maxls must be positive.')
    301 
--> 302     x = array(x0, float64)
    303     f = array(0.0, float64)
    304     g = zeros((n,), float64)

ValueError: setting an array element with a sequence.

尝试方法时的输出=Newton-CG

ValueErrorTraceback (most recent call last)

---> 21 res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    445     elif meth == 'newton-cg':
    446         return _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
--> 447                                   **options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback, xtol, eps, maxiter, disp, return_all, **unknown_options)
   1438     _check_unknown_options(unknown_options)
   1439     if jac is None:
-> 1440         raise ValueError('Jacobian is required for Newton-CG method')

ValueError: Jacobian is required for Newton-CG method

2）下一个任务是获得模型输出相对于模型输入的导数。例如，对于一个恒星参数（输出之一），比如温度，我需要找到关于 7213 输入中的每一个的偏导数。然后对 3 个输出中的每一个执行相同的操作。

所以基本上，我的第一个任务（1）是找到一种方法来返回我的模型的逆 Hessian 矩阵，接下来（2）我需要找到一种方法来返回我的输出相对于我的输入的一阶偏导数.

有人对这两个任务中的任何一个有一些见解吗？谢谢。

编辑

我正在尝试使用 theano.gradient.jacobian() 来返回我的输出的雅可比矩阵 wrt 我的输入。我已经将我的模型变成了模型权重的函数，并将该函数用作 theano.gradient.jacobian() 中的第一个参数。当我尝试使用我的模型权重和输入数据的形式的多维数组运行梯度时，我的问题就出现了。

import theano.tensor as T

weights_in_model = T.dvector('model_weights')
x = T.dvector('x')

def pred(x,weights_in_model):
    weights = T.stack((weights_in_model[0],weights_in_model[1]), axis=0)
    x = T.shape_padright(x, n_ones=1)

    prediction=T.dot(x, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[2],weights_in_model[3]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[4],weights_in_model[5]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[6],weights_in_model[7]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[8],weights_in_model[9]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[10],weights_in_model[11]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)


    weights = T.stack((weights_in_model[12],weights_in_model[13]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    T.flatten(prediction)

    return prediction


f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)
h=theano.function([x,weights_in_model],f,allow_input_downcast=True)


x = train_X
weights_in_model = model.get_weights()
h(x,weights_in_model)

最后一行给出了错误：

TypeError: ('Bad input argument to theano function with name "<ipython-input-365-a1ab256aa220>:1"  at index 0(0-based)', 'Wrong number of dimensions: expected 1, got 2 with shape (2000, 7213).')

但是当我将输入更改为：

weights_in_model = T.matrix('model_weights')
x = T.matrix('x')

我从这条线上得到一个错误：

f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)

阅读：

AssertionError: tensor.jacobian expects a 1 dimensional variable as `expression`. If not use flatten to make it a vector

关于如何解决这个问题的任何想法？

score 3 · Accepted Answer

ANSWER FOUND!：此代码用于预测模型的一个输出值。目前我正在修改它以计算 3 个雅可比矩阵；每个输出一个。

import theano
import theano.tensor as T
import theano.typed_list
theano.config.optimizer='fast_compile'
theano.config.exception_verbosity='high'

# Declare function input placeholders
weights_in_model = theano.typed_list.TypedListType(theano.tensor.dmatrix)()
x = T.matrix('x')

# Define model function
def pred(x,weights_in_model): 
    weights = T.concatenate((weights_in_model[0],weights_in_model[1]), axis=0)
    x = T.concatenate((x, T.ones((T.shape(x)[0], 1))), axis=1)

    prediction = T.dot(x, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.concatenate((weights_in_model[2],weights_in_model[3]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.concatenate((weights_in_model[4],weights_in_model[5]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.concatenate((weights_in_model[6],weights_in_model[7]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.concatenate((weights_in_model[8],weights_in_model[9]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.concatenate((weights_in_model[10],weights_in_model[11]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)


    weights = T.concatenate((weights_in_model[12],weights_in_model[13]), axis=0)
    prediction = T.concatenate((prediction, T.ones((T.shape(prediction)[0], 1))), axis=1)
    prediction = T.dot(prediction, weights)
    prediction = T.flatten(prediction)
    return prediction

# Create gradient function
f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)

# Compile function
h=theano.function([x,weights_in_model],f,allow_input_downcast=True)


# Get function inputs
weights_in_model_ = model.get_weights()
x_=train_data

# Reshape bias layers
weights_in_model_[1] = np.reshape(weights_in_model_[1], (1, 2048))
weights_in_model_[3] = np.reshape(weights_in_model_[3], (1, 1024))
weights_in_model_[5] = np.reshape(weights_in_model_[5], (1, 512))
weights_in_model_[7] = np.reshape(weights_in_model_[7], (1, 256))
weights_in_model_[9] = np.reshape(weights_in_model_[9], (1, 128))
weights_in_model_[11] = np.reshape(weights_in_model_[11], (1, 32))
weights_in_model_[13] = np.reshape(weights_in_model_[13], (1, 1))

# Compute Jacobian (returns format with a bunch of zero rows)
jacs = h(x_, weights_in_model_)

# Put Jacobian matrix in proper format (ie. shape = (number_of_input_examples, number_of_input_features)

jacobian_matrix = np.zeros((jacs.shape[0],jacs.shape[2]))
for i, jac in enumerate(jacs): 
    jacobian_matrix[i] = jac[i]

下一个任务是找到模型权重的输出的 Hessian 矩阵！

python - 在 DNN 训练结束时返回 Hessian 逆矩阵和输入的偏导数

1 回答 1

Related

Reference