I'm trying to build a deep survival model, starting with a simple proportional hazards model, using a mix of Keras and non-Keras TensorFlow. My problem is that GradientTape doesn't seem to be tracking the parameter I define with tf.Variable, only the Keras parameters.
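For context, GradientTape has no trouble with a bare tf.Variable used eagerly on its own. This is just a trivial sanity check, nothing to do with Keras:

import tensorflow as tf

v = tf.Variable(0.0)
with tf.GradientTape() as g:
    loss = tf.math.exp(v) * 2.0
print(g.gradient(loss, v))  # gradient is 2*exp(v) = 2.0, so a plain variable is tracked fine

So the issue seems specific to how the variable enters the Keras model.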
Here's my model:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D

class phmodel(tf.Module):
    def __init__(self, lr = .0001, timesteps = 100, **kwargs):
        super().__init__(**kwargs)
        self.beta = tf.Variable(0.0) # initialize to 0 = log(1)
        self.timesteps = timesteps
        self.lr = lr
        self.makemodel()
        self.opt = tf.keras.optimizers.SGD(learning_rate = lr)

    def makemodel(self):
        inp = Input((self.timesteps, 1,))
        lay = Conv1D(filters = 1, kernel_size = 1, use_bias = False)(inp) # this is equivalent to XB -- it applies the same weight to each of the 100 timesteps
        elay = tf.math.exp(lay)
        beta = tf.math.exp(self.beta) # re-exponentiate: it's trained in logs. This line is to make the code less confusing
        baseline_hazard = tf.expand_dims(1 - tf.math.exp(-beta * tf.range(0, self.timesteps, delta=1, dtype='float32')), -1)
        cumulative_hazard = tf.math.cumsum(baseline_hazard * elay, axis = 1)
        out = 1 - tf.math.exp(-cumulative_hazard)
        self.model = Model(inp, out)

    def __call__(self, x):
        return self.model(x)

    def train_step(self, x, y, w = None, verbose = False):
        with tf.GradientTape() as g:
            yhat = self.model(x)
            loss = tf.keras.losses.BinaryCrossentropy()(y, yhat, sample_weight = w)
        grad = g.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.model.trainable_variables))
        print(loss) if verbose is True else None
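To spell out the survival math that makemodel wires together, here's the same thing in plain NumPy, with made-up values for the log-beta and the conv weight (a sketch of the formulas only, not the actual model):

import numpy as np

timesteps = 100
log_beta, w = -3.2, 0.5                       # hypothetical values for self.beta and the Conv1D kernel
x = np.random.normal(size=timesteps)

xb = w * x                                    # Conv1D(kernel_size=1, no bias) = same weight at every timestep
beta = np.exp(log_beta)                       # re-exponentiate: the parameter is stored in logs
baseline_hazard = 1 - np.exp(-beta * np.arange(timesteps))
cumulative_hazard = np.cumsum(baseline_hazard * np.exp(xb))
prob = 1 - np.exp(-cumulative_hazard)         # predicted event probability by each timestep
print(prob[:5])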
The model summary looks like this:
model = phmodel(100)
model.model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_4 (InputLayer) [(None, 100, 1)] 0
_________________________________________________________________
conv1d_3 (Conv1D) (None, 100, 1) 1
_________________________________________________________________
tf.math.exp_5 (TFOpLambda) (None, 100, 1) 0
_________________________________________________________________
tf.math.multiply_2 (TFOpLamb (None, 100, 1) 0
_________________________________________________________________
tf.math.cumsum_2 (TFOpLambda (None, 100, 1) 0
_________________________________________________________________
tf.math.negative_2 (TFOpLamb (None, 100, 1) 0
_________________________________________________________________
tf.math.exp_6 (TFOpLambda) (None, 100, 1) 0
_________________________________________________________________
tf.math.subtract_2 (TFOpLamb (None, 100, 1) 0
=================================================================
Total params: 1
Trainable params: 1
Non-trainable params: 0
_________________________________________________________________
Define some dummy data:
xtr = np.random.normal(size = 100).reshape(1,100,1)
ytr = np.concatenate([np.zeros(50), np.ones(50)]).reshape(1,100,1)
Forward pass:
yhat = model(xtr)
Take a look at the trainable variables:
model.trainable_variables
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>,
<tf.Variable 'conv1d_4/kernel:0' shape=(1, 1, 1) dtype=float32, numpy=array([[[1.2811402]]], dtype=float32)>)
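For comparison, the variables the inner Keras Model itself reports (as opposed to the tf.Module wrapper above) can be printed like this. Since self.beta is set on the Module rather than on the Keras Model, I'd expect it to be missing from the second list:

print(model.trainable_variables)        # the tf.Module's variables -- includes self.beta, as shown above
print(model.model.trainable_variables)  # the inner Keras Model's variables -- presumably only the conv kernel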
The loss seems to be in order:
loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat, sample_weight = None)
loss
<tf.Tensor: shape=(), dtype=float32, numpy=6.711515>
Compute the gradients:
with tf.GradientTape() as g:
    yhat = model(xtr)
    loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat, sample_weight = None)
grad = g.gradient(loss, model.trainable_variables)
grad
(None,
<tf.Tensor: shape=(1, 1, 1), dtype=float32, numpy=array([[[0.51390326]]], dtype=float32)>)
And here's the problem: there is no gradient for Variable:0. What gives? When I go to apply the gradients, I get warnings, problems, and NaNs:
model.opt.apply_gradients(zip(grad, model.trainable_variables))
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
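One way to at least suppress the warning is to drop the (grad, var) pairs where the gradient is None before calling apply_gradients. This only hides the symptom, it doesn't make beta train:

grads_and_vars = [(gr, v) for gr, v in zip(grad, model.trainable_variables) if gr is not None]
model.opt.apply_gradients(grads_and_vars)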
If I run the training loop for more than a few steps, the gradient on the conv layer starts coming back as NaN:
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-6.2>,
<tf.Variable 'conv1d_12/kernel:0' shape=(1, 1, 1) dtype=float32, numpy=array([[[nan]]], dtype=float32)>)
What gives?
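To pin down where the NaN first shows up, something like this can be dropped into the training loop (debugging only, using the same loss and grad names as above):

tf.debugging.check_numerics(loss, "loss is non-finite")
for gr, v in zip(grad, model.trainable_variables):
    if gr is not None:
        tf.debugging.check_numerics(gr, "non-finite gradient for " + v.name)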
EDIT: I managed to fix this by taking a bunch of stuff out of the makemodel method and putting it into __call__. I have no idea why this works. Maybe it will help some future googler? I'll happily give 15 points to anyone who can explain why it works, because I don't know.
class phmodel(tf.Module):
    def __init__(self, lr = .0001, timesteps = 100, **kwargs):
        super().__init__(**kwargs)
        self.beta = tf.Variable(-10.2) # initialize to -3.2 = log(.04)
        self.timesteps = timesteps
        self.timerange = tf.range(0, self.timesteps, delta=1, dtype='float32')
        self.lr = lr
        self.makemodel()
        self.opt = tf.keras.optimizers.SGD(learning_rate = lr)

    def makemodel(self):
        inp = Input((self.timesteps, 1,))
        lay = Conv1D(filters = 1, kernel_size = 1, use_bias = False)(inp) # this is equivalent to XB -- it applies the same weight to each of the 100 timesteps
        self.model = Model(inp, lay)

    def __call__(self, x):
        mout = self.model(x)
        elay = tf.math.exp(mout)
        beta = tf.math.exp(self.beta) # re-exponentiate: it's trained in logs. This line is to make the code less confusing
        baseline_hazard = tf.expand_dims(1 - tf.math.exp(tf.multiply(-beta, self.timerange)), -1)
        cumulative_hazard = tf.math.cumsum(tf.multiply(baseline_hazard, elay), axis = 1)
        out = 1 - tf.math.exp(-cumulative_hazard)
        return out

    def train_step(self, x, y, w = None, verbose = False):
        with tf.GradientTape() as g:
            yhat = self.model(x)
            loss = tf.keras.losses.BinaryCrossentropy()(y, yhat, sample_weight = w)
        grad = g.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.model.trainable_variables))
        print(loss) if verbose is True else None
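To confirm the variable actually gets a gradient in this version, the earlier check can be rerun with the tape pointed at the Module's __call__ instead of the inner Keras model (same xtr and ytr as above):

model = phmodel(timesteps = 100)
with tf.GradientTape() as g:
    yhat = model(xtr)  # goes through __call__, so the tape sees self.beta and the hazard math
    loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat)
print(g.gradient(loss, model.trainable_variables))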