python - Keras 中共享 LSTM 层中的状态持久性

Question

我正在尝试在 Keras 模型中使用具有状态的共享 LSTM 层，但似乎每次并行使用都会修改内部状态。这提出了两个问题：

在使用共享 LSTM 层训练模型并使用stateful=True时，并行使用是否也在训练期间更新相同的状态？
如果我的观察是有效的，有没有办法使用权重共享 LSTM，以便为每个并行使用独立存储状态？

下面的代码举例说明了三个序列共享 LSTM 的问题。将完整输入的预测与将预测输入分成两半并连续输入网络的结果进行比较。

可以观察到，a1 与 aFull 的前半部分相同，这意味着在第一次预测期间，LSTM 的各个并行使用确实拥有各自独立的状态。也就是说，z1 不受创建 z2 和 z3 的并行调用的影响。但 a2 与 aFull 的后半部分不同，因此各并行使用的状态之间存在某种相互影响。

我期望把 a1 和 a2 这两部分拼接起来的结果，与使用更长的输入序列一次性调用预测得到的结果相同，但情况似乎并非如此。另一个担忧是：既然这种相互影响会在预测时发生，那么它在训练期间是否也会发生。

import keras
import keras.backend as K
import numpy as np

# Demo dimensions: xShape is (parallel sequences, time steps, features).
# The time axis of the model input is left as None so that inputs of
# different lengths can be fed to the same model.
nOut = 3
xShape = (3, 50, 4)
inShape = (xShape[0], None, xShape[2])
batchInShape = (1, ) + inShape
x = np.random.randn(*xShape)

# construct network
xIn = keras.layers.Input(shape=inShape, batch_shape=batchInShape)

# one LSTM layer instance, reused (weights shared) for every parallel sequence
sharedLSTM = keras.layers.LSTM(units=nOut, stateful=True, return_sequences=True, return_state=False)

# split the input on the first (non-batch) axis, one slice per sequence
x1, x2, x3 = [
    keras.layers.Lambda(lambda x, seqIdx: x[:, seqIdx, :, :], arguments={'seqIdx': seqIdx})(xIn)
    for seqIdx in range(3)
]

# pass each slice through the same LSTM instance, in order
z1, z2, z3 = [sharedLSTM(branch) for branch in (x1, x2, x3)]

# re-insert a singleton dimension at axis 1 so the outputs can be stacked
y1, y2, y3 = [
    keras.layers.Lambda(lambda x: K.expand_dims(x, axis=1))(branch)
    for branch in (z1, z2, z3)
]

# combine the outputs along the re-inserted axis
y = keras.layers.Concatenate(axis=1)([y1, y2, y3])

model = keras.models.Model(inputs=xIn, outputs=y)
model.compile(loss='mse', optimizer='adam')
model.summary()

# no training: only the mechanics of the state handling are of interest

half = xShape[1] // 2
xBatch = x[np.newaxis, :, :, :]

# reference run: reset to a known state and predict for the full input
model.reset_states()
aFull = model.predict(xBatch)

# comparison run: reset again, then feed the same input as two consecutive halves
model.reset_states()
a1 = model.predict(xBatch[:, :, :half, :])
a2 = model.predict(xBatch[:, :, half:, :])
# stitch the halves back together along the time axis
aSplit = np.concatenate((a1, a2), axis=2)

# summed absolute differences: overall, first half only, second half only
fullDiff = np.sum(np.abs(aFull - aSplit))
firstHalfDiff = np.sum(np.abs(aFull[:, :, :half, :] - aSplit[:, :, :half, :]))
secondHalfDiff = np.sum(np.abs(aFull[:, :, half:, :] - aSplit[:, :, half:, :]))
print('full diff: {}, first half diff: {}, second half diff: {}'.format(str(fullDiff), str(firstHalfDiff), str(secondHalfDiff)))

更新：上述行为是在以 Tensorflow 1.14 和 1.15 作为后端的 Keras 中观察到的。使用 tf2.0（并相应调整导入）运行相同的代码会改变结果，此时 a1 不再与 aFull 的前半部分相同。不过，在实例化层时设置 stateful=False，仍然可以使二者相同。

这向我表明，我所尝试的用法——让循环层共享参数、但每个并行使用各自拥有自己的状态——实际上是无法实现的。

更新 2：似乎之前也有其他人发现缺少同样的功能：Keras 的 GitHub 上有一个已关闭但未得到回答的 issue。

作为比较，这里是 pytorch 中的一个涂鸦（我第一次尝试使用它）实现一个简单的网络，其中 N 个并行 LSTM 共享权重，但具有独立的状态。在这种情况下，状态显式存储在列表中并手动提供给 LSTM 单元。

import torch
import numpy as np

class sharedLSTM(torch.nn.Module):
    """Run one weight-shared LSTM over several parallel bands, one state per band.

    A single ``torch.nn.LSTM`` is applied to every band of the input, so all
    bands use the same weights. The recurrent state ``(h, c)`` is kept
    separately for each band and carried over between ``forward`` calls until
    ``resetStates`` is called.
    """

    def __init__(self, batchSz, nBands, nDims, outDim):
        """Create the shared LSTM and zero-initialised per-band states.

        Args:
            batchSz: batch size; fixes the shape of the stored states.
            nBands: number of parallel bands (size of axis 1 of the input).
            nDims: number of input features per time step.
            outDim: hidden size of the LSTM (features per output step).
        """
        super(sharedLSTM, self).__init__()
        self.internalLSTM = torch.nn.LSTM(input_size=nDims, hidden_size=outDim, num_layers=1, bias=True, batch_first=True)
        # One (h, c) pair per band, each shaped (num_layers=1, batchSz, outDim).
        self.allStates = [
            (torch.zeros(1, batchSz, outDim), torch.zeros(1, batchSz, outDim))
            for _ in range(nBands)
        ]
        self.nBands = nBands

    def forward(self, x):
        """Process every band with the shared LSTM, continuing from its stored state.

        Args:
            x: tensor indexed as (batchSz, nBands, nSteps, nFeats).

        Returns:
            Tensor of shape (batchSz, nBands, nSteps, outDim).
        """
        allOut = list()
        for dimIdx in range(self.nBands):
            thisSlice = x[:, dimIdx, :, :]  # (batchSz, nSteps, nFeats)
            thisY, thisState = self.internalLSTM(thisSlice, self.allStates[dimIdx])
            # Remember the final state so the next call continues this band.
            self.allStates[dimIdx] = thisState
            allOut.append(thisY[:, None, :, :])  # => (batchSz, 1, nSteps, outDim)

        return torch.cat(allOut, dim=1)  # => (batchSz, nBands, nSteps, outDim)

    def resetStates(self):
        """Reset the stored state of every band to zeros."""
        # Use the instance's own band count, not a module-level global.
        for bandIdx in range(self.nBands):
            h, c = self.allStates[bandIdx]
            # Fresh tensors instead of in-place writes: the stored states are
            # outputs of the LSTM and may still belong to an autograd graph.
            self.allStates[bandIdx] = (torch.zeros_like(h), torch.zeros_like(c))


# Demo sizes: batch, parallel bands, features per step, LSTM output width.
batchSz, nBands, nFeats, nOutDims = 5, 3, 4, 2
net = sharedLSTM(batchSz, nBands, nFeats, nOutDims).float()
print(net)

# Random input with N time steps, plus the same input cut into two halves.
N = 20
half = N // 2
x = torch.from_numpy(np.random.rand(batchSz, nBands, N, nFeats)).float()
x1, x2 = x[:, :, :half, :], x[:, :, half:, :]

# Reference: the whole sequence in one call.
aa = net.forward(x)
# Comparison: reset the per-band states, then feed the halves back to back.
net.resetStates()
a1 = net.forward(x1)
a2 = net.forward(x2)

# Summed absolute difference between each half and the matching part of the
# full-sequence output.
firstHalfDiff = torch.sum(torch.abs(a1 - aa[:, :, :half, :])).detach().numpy()
secondHalfDiff = torch.sum(torch.abs(a2 - aa[:, :, half:, :])).detach().numpy()
print('(with reset) first half abs diff: {}'.format(str(firstHalfDiff)))
print('(with reset) second half abs diff: {}'.format(str(secondHalfDiff)))

结果：无论我们是一次性还是分段进行预测，输出都是相同的。

我尝试使用子类在 Keras 中复制它，但没有成功：

import keras
import numpy as np

class sharedLSTM(keras.Model):
    """Subclassed Keras model applying one LSTM layer to several parallel bands.

    The same ``keras.layers.LSTM`` instance is called once per band, and the
    states it returns are kept in a Python list with one entry per band.

    NOTE: the surrounding post reports that this attempt did not reproduce
    the per-band state handling of the PyTorch version.
    """

    def __init__(self, batchSz, nBands, nDims, outDim):
        """Build the shared LSTM, the per-band slicing layers and the state list.

        Args:
            batchSz: batch size used when building the stateful LSTM.
            nBands: number of parallel bands (axis 2 of the input).
            nDims: number of input features per time step.
            outDim: number of LSTM units.
        """
        super(sharedLSTM, self).__init__()
        self.internalLSTM = keras.layers.LSTM(units=outDim, stateful=True, return_sequences=True, return_state=True)
        self.internalLSTM.build((batchSz, None, nDims))
        self.internalLSTM.reset_states()

        # No stored state yet for any band; None is passed as initial_state
        # on the first call.
        bandStates = [None] * nBands
        # One Lambda per band that picks index b on axis 2 of the input.
        bandSlicers = [
            keras.layers.Lambda(lambda x, b: x[:, :, b, :], arguments = {'b' : bandIdx})
            for bandIdx in range(nBands)
        ]

        self.allStates = bandStates
        self.allSlicers = bandSlicers
        self.Concat = keras.layers.Lambda(lambda x: keras.backend.concatenate(x, axis=2))

        self.nBands = nBands

    def call(self, x):
        """Run every band through the shared LSTM and stack the outputs on axis 2.

        Args:
            x: input indexed as (batch, steps, band, features).

        Returns:
            The per-band LSTM output sequences concatenated along axis 2.
        """
        bandOutputs = []
        for bandIdx in range(self.nBands):
            bandInput = self.allSlicers[bandIdx](x)
            # With return_state=True the layer yields the output sequence
            # followed by its state tensors.
            bandY, *bandState = self.internalLSTM(bandInput, initial_state=self.allStates[bandIdx])
            self.allStates[bandIdx] = list(bandState)
            # Re-insert the band axis so the outputs can be concatenated.
            bandOutputs.append(bandY[:, :, None, :])

        return self.Concat(bandOutputs)

# Demo sizes: batch, parallel bands, features per step, LSTM units, time steps.
batchSz, nBands, nFeats, nOutDims = 1, 3, 4, 2
N = 20
half = N // 2

model = sharedLSTM(batchSz, nBands, nFeats, nOutDims)
model.compile(optimizer='SGD', loss='mae')

# Random input with the time axis second, plus the same input cut in two.
x = np.random.rand(batchSz, N, nBands, nFeats)
x1, x2 = x[:, :half, :, :], x[:, half:, :, :]

# Reference: the whole sequence in one call.
aa = model.predict(x)

# Comparison: reset the states, then predict the halves back to back.
model.reset_states()
a1 = model.predict(x1)
a2 = model.predict(x2)

# Summed absolute difference between each half and the matching part of the
# full-sequence prediction.
firstHalfDiff = np.sum(np.abs(a1 - aa[:, :half, :, :]))
secondHalfDiff = np.sum(np.abs(a2 - aa[:, half:, :, :]))
print('(with reset) first half abs diff: {}'.format(str(firstHalfDiff)))
print('(with reset) second half abs diff: {}'.format(str(secondHalfDiff)))

如果您现在想问“为什么不直接用 Torch，别再抱怨了？”，答案是：周围的实验框架已经是基于 Keras 构建的，改动它所需的工作量不可忽视。

score 2 · Accepted Answer

根据我目前对 Keras 中 LSTM（以及其他 RNN）行为的理解，在 stateful=True 模式下使用共享 LSTM 层并不能像预期的那样工作：只有一个状态变量，它会被所有并行使用共同更新。所以这两个问题的答案似乎是：

是的，确实如此。处理过程先在多个并行序列中的一个上运行，在结束时存储状态，然后把该状态用作第二个并行序列的初始状态，依此类推。
是的，但它需要一些工作。详情见下文。

我设法以两种方式完成了对状态的处理。首先是从 Keras 的 LSTM 和 LSTMCell 派生子类，并重载 LSTMCell.call() 通过拆分输入、存储和恢复每个并行流的状态来处理并行数据流。这里的一个缺点是 RNN 的输入形状固定为 3D，这意味着需要将并行输入与真实特征一起重新调整为特征维度。

第二种方法是创建一个包装层，它与问题中的 sharedLSTM 模型颇为相似：对输入按并行流进行切片，为每个流以正确的状态调用内部 LSTM，并存储返回的状态。对列表中所存状态的更新，是通过在 call() 末尾插入 add_update() 调用来实现的。这个 add_update()（似乎）不能与 Model 一起工作，因此这里使用的是 Layer。但是，在 Keras < 2.3 下运行时，嵌套层的权重不会被跟踪或更新，因此需要 Keras 2.3+ 或 TF2。

python - Keras 中共享 LSTM 层中的状态持久性

1 回答 1

Related

Reference