I am implementing code to generate labeled data for natural language understanding (NLU), following the paper "Labeled Data Generation with Encoder-Decoder LSTM for Semantic Slot Filling" (https://pdfs.semanticscholar.org/7ffe/83d7dd3a474e15ccc2aef412009f100a5802.pdf). My architecture is a simple encoder-decoder LSTM, but since the sentences I generate (for both words and labels) are incorrect, I am first trying to generate exactly the same sentence (words only) as the input. Unfortunately, this is not working properly.
I am using word2vec for the word embeddings, and the embedding dimension is set to 64 (as suggested in the paper). The encoder LSTM receives the sequence in reverse order and has a dropout rate of 0.5. The decoder LSTM also has a dropout rate of 0.5, with a softmax layer applied to each output of the sequence to map it to the most likely word. The input is exactly the same as the target (the same sentence), because at first I just want to reproduce the input sentence.
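For reference, a minimal sketch of how such a 64-dimensional embedding matrix can be built with gensim (this is an illustrative assumption, not my exact preprocessing; the names sentences, w2i and w_emb are placeholders, and the gensim 4.x API is assumed):

import numpy as np
from gensim.models import Word2Vec

# sentences: list of tokenised training utterances, e.g. [['BOS', 'i', "'m", ...], ...]
w2v = Word2Vec(sentences, vector_size=64, min_count=1)  # 64-dim vectors, as in the paper

# word -> index map and the index -> vector matrix expected by the Embedding layer
w2i = {w: i for i, w in enumerate(w2v.wv.index_to_key)}
w_emb = np.zeros((len(w2i), 64))
for w, i in w2i.items():
    w_emb[i] = w2v.wv[w]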
For training I use the Adam optimizer with categorical_crossentropy as the loss. For inference I use beam search (B=3) to generate the sequence.
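Since categorical_crossentropy expects one-hot targets, the target array has shape (num_samples, maxlen, vocab_w_size). A minimal sketch of how such targets could be prepared (train_w, maxlen and vocab_w_size refer to the variables used in the code below; this is only one possible way to build them):

from keras.utils import to_categorical

# train_w: integer-encoded, padded sentences of shape (num_samples, maxlen)
# the goal is to reproduce the input, so the target is the same sentence, one-hot encoded
train_lab_w = to_categorical(train_w, num_classes=vocab_w_size)  # (num_samples, maxlen, vocab_w_size)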
My training code:
import numpy as np
from math import log
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
from keras import backend as K

def pretrained_embedding_layer(emb):
    # wrap the pretrained word2vec matrix in a frozen Embedding layer
    vocab_len = len(emb)
    emb_dim = len(emb[0])
    emb_layer = Embedding(vocab_len, emb_dim, trainable=False)
    emb_layer.build((None,))
    emb_layer.set_weights([emb])
    return emb_layer

LSTM_encoder = LSTM(1024, dropout=0.5, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, dropout=0.5, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")
K.set_learning_phase(1)

def model1_enc_dec(input_shape, w_emb):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb_layer = pretrained_embedding_layer(w_emb)
    wemb = wemb_layer(words_indices)
    # the encoder's final states initialise the decoder, which reads the same embedded sentence
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model1 = Model(inputs=[words_indices], outputs=[dec_out])
    return model1

model = model1_enc_dec((maxlen,), w_emb)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(train_w, train_lab_w, validation_data=(val_w, val_lab_w), epochs=epochs, verbose=1, shuffle=True)
My inference code:
wemb_layer = Embedding(len(w_emb), len(w_emb[0]), trainable=False)
wemb_layer.build((None,))
LSTM_encoder = LSTM(1024, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")
def target_model(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model = Model(inputs=[words_indices], outputs=[dec_out])
    return model
target_model = target_model((maxlen,))

# copy the trained weights into the freshly built inference layers
wemb_layer.set_weights(model.layers[1].get_weights())  # layer 0 is the Input layer
LSTM_encoder.set_weights(model.layers[2].get_weights())
LSTM_decoder.set_weights(model.layers[3].get_weights())
dense_w.set_weights(model.layers[4].get_weights())
def model1_enco_infe(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_model = Model(inputs=[words_indices], outputs=[enc_state_h, enc_state_c])
    return encoder_model
def model1_deco_infe(input_shape):
    dec_word_input = Input(shape=input_shape, dtype='int32')
    dec_state_input_h = Input(shape=(1024,))
    dec_state_input_c = Input(shape=(1024,))
    wemb = wemb_layer(dec_word_input)
    dec_states_input = [dec_state_input_h, dec_state_input_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=dec_states_input)
    dec_states_output = [dec_state_h, dec_state_c]
    deco_out = dense_w(dec_out)
    decoder_model = Model(inputs=[dec_word_input] + dec_states_input, outputs=[deco_out] + dec_states_output)
    return decoder_model
encoder_model = model1_enco_infe((maxlen,))
decoder_model = model1_deco_infe((1,))
def beamsearch_B(deco_w_out, beam):
    # return the indices of the `beam` most probable words at the current step
    words_index = []
    dw = deco_w_out.copy()
    for i in range(beam):
        word_index = np.argmax(dw, axis=-1)
        dw[0][0][word_index[0][0]] = 0
        words_index.append(word_index[0][0])
    return words_index
def generate_model1_add(word_seq, encoder_model, decoder_model, dec_word_input, id2word, beam):
    # encode the input sentence and use its final states to initialise the decoder
    [enc_state_h, enc_state_c] = encoder_model.predict(word_seq)
    states = [enc_state_h, enc_state_c]
    word_sentence = ''
    probs_word = []
    word_sentences = []
    dec_word_inputs = []
    states_beam = []
    stop_condition = False
    # first decoding step: expand the BOS token into `beam` candidates
    [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_input] + states)
    words_index = beamsearch_B(dec_w_out, beam)
    for i in range(beam):
        probs_word.append(-log(dec_w_out[0][0][words_index[i]]))
        word_sentences.append(id2word[words_index[i]])
        dec_word_inputs.append([words_index[i]])
        states_beam.append([dec_state_h, dec_state_c])
    n_words = 1
    endgame = []
    while not stop_condition:
        words_indexes, words_sentences, probs_words, states_b = [], [], [], []
        for k in range(beam):
            # expand each of the current `beam` hypotheses by one word
            [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_inputs[k]] + states_beam[k])
            words_index = beamsearch_B(dec_w_out, beam)
            states = [dec_state_h, dec_state_c]
            for j in range(beam):
                words_indexes.append(words_index[j])
                probs_words.append(probs_word[k] * -log(dec_w_out[0][0][words_index[j]]) + 1e-7)
                words_sentences.append(word_sentences[k] + ' ' + id2word[words_index[j]])
                states_b.append(states)
        # keep the `beam` hypotheses with the best accumulated scores
        probs = []
        for i in range(len(probs_words)):
            probs.append(1 / (probs_words[i]))
        indexes = []
        for i in range(beam):
            index = np.argmax(probs, axis=-1)
            probs[index] = 0
            indexes.append(index)
        for i in range(beam):
            probs_word[i] = probs_words[indexes[i]]
            word_sentences[i] = words_sentences[indexes[i]]
            dec_word_inputs[i] = [words_indexes[indexes[i]]]
            states_beam[i] = states_b[indexes[i]]
            if id2word[words_indexes[indexes[i]]] == 'EOS':
                endgame.append(i)
        if len(endgame) == 1:
            word_sentence = word_sentences[endgame[0]]
            stop_condition = True
        elif len(endgame) > 1:
            word_sentence = word_sentences[np.min(endgame)]
            stop_condition = True
        n_words += 1
        if n_words > 50:
            word_sentence = word_sentences[0]
            stop_condition = True
    return word_sentence
word_sentence = generate_model1_add(np.reshape(train_w[i], (1, maxlen)), encoder_model, decoder_model, [w2i['BOS']], i2w, 3)
An example of a sequence I generate:
Input sentence: BOS i 'm Fourth in flight from boston to atlanta EOS PAD PAD PAD ...
Generated sentence: BOS from from from from from from from from from from from from from from from from from ...
It looks as if the trained weights are not correct, yet during training I get loss: 0.0032 - acc: 0.9990 - val_loss: 0.0794 - val_acc: 0.9888.
All I want is to generate exactly the same sentence as the input. I hope you can help me. Thanks in advance!