I'm trying to build a deep survival model, starting with a simple proportional hazards model, using a mix of Keras and non-Keras TensorFlow. My problem is that GradientTape doesn't seem to be tracking the parameter I define with tf.Variable, only the Keras parameters.
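For context, GradientTape has no trouble with a bare tf.Variable used eagerly on its own. This is just a trivial sanity check, nothing to do with Keras:

import tensorflow as tf

v = tf.Variable(0.0)
with tf.GradientTape() as g:
    loss = tf.math.exp(v) * 2.0
print(g.gradient(loss, v))  # gradient is 2*exp(v) = 2.0, so a plain variable is tracked fine

So the issue seems specific to how the variable enters the Keras model.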
Here's my model:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D

class phmodel(tf.Module):
    def __init__(self, lr = .0001, timesteps = 100, **kwargs):
        super().__init__(**kwargs)
        self.beta = tf.Variable(0.0) # initialize to 0 = log(1)
        self.timesteps = timesteps
        self.lr = lr
        self.makemodel()
        self.opt = tf.keras.optimizers.SGD(learning_rate = lr)

    def makemodel(self):
        inp = Input((self.timesteps, 1,))
        lay = Conv1D(filters = 1, kernel_size = 1, use_bias = False)(inp) # this is equivalent to XB -- it applies the same weight to each of the 100 timesteps
        elay = tf.math.exp(lay)
        beta = tf.math.exp(self.beta) # re-exponentiate: it's trained in logs. This line is to make the code less confusing
        baseline_hazard = tf.expand_dims(1 - tf.math.exp(-beta * tf.range(0, self.timesteps, delta=1, dtype='float32')), -1)
        cumulative_hazard = tf.math.cumsum(baseline_hazard * elay, axis = 1)
        out = 1 - tf.math.exp(-cumulative_hazard)
        self.model = Model(inp, out)

    def __call__(self, x):
        return self.model(x)

    def train_step(self, x, y, w = None, verbose = False):
        with tf.GradientTape() as g:
            yhat = self.model(x)
            loss = tf.keras.losses.BinaryCrossentropy()(y, yhat, sample_weight = w)
        grad = g.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.model.trainable_variables))
        print(loss) if verbose is True else None
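To spell out the survival math that makemodel wires together, here's the same thing in plain NumPy, with made-up values for the log-beta and the conv weight (a sketch of the formulas only, not the actual model):

import numpy as np

timesteps = 100
log_beta, w = -3.2, 0.5                       # hypothetical values for self.beta and the Conv1D kernel
x = np.random.normal(size=timesteps)

xb = w * x                                    # Conv1D(kernel_size=1, no bias) = same weight at every timestep
beta = np.exp(log_beta)                       # re-exponentiate: the parameter is stored in logs
baseline_hazard = 1 - np.exp(-beta * np.arange(timesteps))
cumulative_hazard = np.cumsum(baseline_hazard * np.exp(xb))
prob = 1 - np.exp(-cumulative_hazard)         # predicted event probability by each timestep
print(prob[:5])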
The model summary looks like this:
model = phmodel(100)
model.model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_4 (InputLayer) [(None, 100, 1)] 0
_________________________________________________________________
conv1d_3 (Conv1D) (None, 100, 1) 1
_________________________________________________________________
tf.math.exp_5 (TFOpLambda) (None, 100, 1) 0
_________________________________________________________________
tf.math.multiply_2 (TFOpLamb (None, 100, 1) 0
_________________________________________________________________
tf.math.cumsum_2 (TFOpLambda (None, 100, 1) 0
_________________________________________________________________
tf.math.negative_2 (TFOpLamb (None, 100, 1) 0
_________________________________________________________________
tf.math.exp_6 (TFOpLambda) (None, 100, 1) 0
_________________________________________________________________
tf.math.subtract_2 (TFOpLamb (None, 100, 1) 0
=================================================================
Total params: 1
Trainable params: 1
Non-trainable params: 0
_________________________________________________________________
Define some dummy data:
xtr = np.random.normal(size = 100).reshape(1,100,1)
ytr = np.concatenate([np.zeros(50), np.ones(50)]).reshape(1,100,1)
Forward pass:
yhat = model(xtr)
Take a look at the trainable variables:
model.trainable_variables
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>,
<tf.Variable 'conv1d_4/kernel:0' shape=(1, 1, 1) dtype=float32, numpy=array([[[1.2811402]]], dtype=float32)>)
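For comparison, the variables the inner Keras Model itself reports (as opposed to the tf.Module wrapper above) can be printed like this. Since self.beta is set on the Module rather than on the Keras Model, I'd expect it to be missing from the second list:

print(model.trainable_variables)        # the tf.Module's variables -- includes self.beta, as shown above
print(model.model.trainable_variables)  # the inner Keras Model's variables -- presumably only the conv kernel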
The loss seems to be in order:
loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat, sample_weight = None)
loss
<tf.Tensor: shape=(), dtype=float32, numpy=6.711515>
Compute the gradients:
with tf.GradientTape() as g:
    yhat = model(xtr)
    loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat, sample_weight = None)
grad = g.gradient(loss, model.trainable_variables)
grad
(None,
<tf.Tensor: shape=(1, 1, 1), dtype=float32, numpy=array([[[0.51390326]]], dtype=float32)>)
And here's the problem: there is no gradient for Variable:0. What gives? When I go to apply the gradients, I get warnings, problems, and NaNs:
model.opt.apply_gradients(zip(grad, model.trainable_variables))
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
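One way to at least suppress the warning is to drop the (grad, var) pairs where the gradient is None before calling apply_gradients. This only hides the symptom, it doesn't make beta train:

grads_and_vars = [(gr, v) for gr, v in zip(grad, model.trainable_variables) if gr is not None]
model.opt.apply_gradients(grads_and_vars)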
If I run the training loop for more than a few steps, the gradient on the conv layer starts coming back as NaN:
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-6.2>,
<tf.Variable 'conv1d_12/kernel:0' shape=(1, 1, 1) dtype=float32, numpy=array([[[nan]]], dtype=float32)>)
What gives?
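To pin down where the NaN first shows up, something like this can be dropped into the training loop (debugging only, using the same loss and grad names as above):

tf.debugging.check_numerics(loss, "loss is non-finite")
for gr, v in zip(grad, model.trainable_variables):
    if gr is not None:
        tf.debugging.check_numerics(gr, "non-finite gradient for " + v.name)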
EDIT: I managed to fix this by taking a bunch of stuff out of the makemodel method and putting it into __call__. I have no idea why this works. Maybe it will help some future googler? I'll happily give 15 points to anyone who can explain why it works, because I don't know.
class phmodel(tf.Module):
    def __init__(self, lr = .0001, timesteps = 100, **kwargs):
        super().__init__(**kwargs)
        self.beta = tf.Variable(-10.2) # initialize to -3.2 = log(.04)
        self.timesteps = timesteps
        self.timerange = tf.range(0, self.timesteps, delta=1, dtype='float32')
        self.lr = lr
        self.makemodel()
        self.opt = tf.keras.optimizers.SGD(learning_rate = lr)

    def makemodel(self):
        inp = Input((self.timesteps, 1,))
        lay = Conv1D(filters = 1, kernel_size = 1, use_bias = False)(inp) # this is equivalent to XB -- it applies the same weight to each of the 100 timesteps
        self.model = Model(inp, lay)

    def __call__(self, x):
        mout = self.model(x)
        elay = tf.math.exp(mout)
        beta = tf.math.exp(self.beta) # re-exponentiate: it's trained in logs. This line is to make the code less confusing
        baseline_hazard = tf.expand_dims(1 - tf.math.exp(tf.multiply(-beta, self.timerange)), -1)
        cumulative_hazard = tf.math.cumsum(tf.multiply(baseline_hazard, elay), axis = 1)
        out = 1 - tf.math.exp(-cumulative_hazard)
        return out

    def train_step(self, x, y, w = None, verbose = False):
        with tf.GradientTape() as g:
            yhat = self.model(x)
            loss = tf.keras.losses.BinaryCrossentropy()(y, yhat, sample_weight = w)
        grad = g.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.model.trainable_variables))
        print(loss) if verbose is True else None
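To confirm the variable actually gets a gradient in this version, the earlier check can be rerun with the tape pointed at the Module's __call__ instead of the inner Keras model (same xtr and ytr as above):

model = phmodel(timesteps = 100)
with tf.GradientTape() as g:
    yhat = model(xtr)  # goes through __call__, so the tape sees self.beta and the hazard math
    loss = tf.keras.losses.BinaryCrossentropy()(ytr, yhat)
print(g.gradient(loss, model.trainable_variables))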