1

我尝试使用 seq2seq 模型来处理视频字幕问题。您可能知道没有使用新(1.40+ 版本)tensorflow 来执行此操作的更新版本。我只是重写整个事情。为了实现注意力,我只是尝试了“Luong”和“Bahdanau”。我试图通过打印正在发生的事情来可视化培训过程。这是 10 个 epoch 后的结果。

Current caption: <bos> 这个 摇 学会 了 可以 去 跟 你 的 她 告白 哦 <eos>
Predicted caption: <bos>这个摇学会了可以去跟你的她群魔乱舞哦<eos><pad><pad><pad><pad><pad><pad><pad>***

(这是一个中文应用程序。结果相当不错)。

奇怪的点:

  1. 我的预测中根本没有“eos”标签。
  2. 它可以产生与我的 max_target_sequence_length 集一样多的内容。
  3. 尝试了不同类型的解码。完全没有区别。

这是示例代码。

def build_model(self):
    # define placeholders
    self.encoder_inputs = tf.placeholder(tf.float32, [self.batch_size, self.n_frame, self.dim_image], name='encoder_inputs')
    self.encoder_inputs_length = tf.placeholder(tf.int32, [None], name='encoder_inputs_length')
    self.decoder_targets = tf.placeholder(tf.int32, [self.batch_size, None], name='decoder_targets')
    self.decoder_targets_length = tf.placeholder(tf.int32, [None], name='decoder_targets_length')
    self.max_target_sequence_length = tf.reduce_max(self.decoder_targets_length, name='max_target_len')
    self.mask = tf.sequence_mask(self.decoder_targets_length, self.max_target_sequence_length, dtype=tf.float32, name='masks')

    # define variables
    with tf.device('/cpu:0'): 
        self.decoder_embedding = tf.get_variable('decoder_embedding', [self.n_words, self.decoder_embedding_size])
        # define lookup table earlier.
        decoder_inputs_embedded = tf.nn.embedding_lookup(self.decoder_embedding, self.decoder_targets)

        self.encoder_image_W = tf.get_variable('encoder_image_W', [self.dim_image, self.dim_hidden])
        self.encoder_image_b = tf.get_variable('encoder_image_b', [self.dim_hidden])

    ####################################### Encoder ##############################################################

    # build image embedding 
    video_flat = tf.reshape(self.encoder_inputs, [-1, self.dim_image])
    image_emb = tf.nn.xw_plus_b(video_flat, self.encoder_image_W, self.encoder_image_b)
    # print image_emb.shape
    encoder_inputs_now = tf.reshape(image_emb, [self.batch_size, self.n_frame, self.dim_hidden]) # change inputs here 
    # print encoder_inputs_now
    # exit(-1)
    if self.encoder_mode == "bi_directional":
        with tf.variable_scope('encoder'):
            # Construct forward and backward cells
            forward_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
            backward_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)

            bi_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
                forward_cell, backward_cell, encoder_inputs_now, time_major=False, dtype=tf.float32)
            encoder_outputs = tf.concat(bi_outputs, -1)

    elif self.encoder_mode == "multi_layer":
        with tf.variable_scope('encoder'):
            def create_cell():
                basic_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(self.dim_hidden), output_keep_prob=self.keep_prob)
                return basic_cell
            encoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell() for _ in range(self.num_layers)])


            # encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, self.encoder_inputs, sequence_length=self.encoder_inputs_length, dtype=tf.float32)
    ##############################################################################################################


    ####################################### Decoder ##############################################################
    with tf.variable_scope('decoder'):
        if self.beam_search:
            print "using beam search inferencing...................."
            encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=self.beam_size)
            encoder_inputs_length = tf.contrib.seq2seq.tile_batch(self.encoder_inputs_length, multiplier=self.beam_size)
            # encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size), encoder_state)
            encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, self.beam_size)
        # else: just use what is there.

        # create attention mechanism
        if self.attention == "Luong":
        # use Luong attention here
        # Create an attention mechanism
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(self.dim_hidden, encoder_outputs, memory_sequence_length=self.encoder_inputs_length)
        elif self.attention == "Bahdanau":
            # use Bahdanau attention here
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=self.dim_hidden, memory=encoder_outputs, memory_sequence_length=self.encoder_inputs_length)

        # get decoder cells ready
        # this function is crate for multiRNNCell to work. otherwise, it wont copy params while copying
        def create_cell():
            basic_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(self.dim_hidden), output_keep_prob=self.keep_prob)
            return basic_cell
        decoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell() for _ in range(self.num_layers)])
        # get wrapper to help
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell, attention_mechanism=attention_mechanism,
                                                           attention_layer_size=self.dim_hidden, name='Attention_Wrapper')


        # expand batch size if beam size is used
        # batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size
        if self.beam_search:
            decoder_initial_state = decoder_cell.zero_state(self.batch_size * self.beam_size, tf.float32)
            decoder_initial_state = decoder_initial_state.clone(cell_state = encoder_state)
        else:
            # define intial state. connect with encoder final state
            decoder_initial_state = decoder_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32).clone(cell_state=encoder_state)
        # projection layer
        output_layer = tf.layers.Dense(self.n_words, kernel_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.1))

        # get basic decoder to work first
        # decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
        if self.mode == "train":
            # define seq2seq training helper
            training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_inputs_embedded,
                                                                sequence_length=self.decoder_targets_length,
                                                                        time_major=False, name='training_helper')
            training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=training_helper,
                                                               initial_state=decoder_initial_state,
                                                               output_layer=output_layer)
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder, impute_finished=True,
                                                   maximum_iterations=self.max_target_sequence_length)
            # get logits
            self.decoder_logits_train = tf.identity(decoder_outputs.rnn_output)

            # define the loss
            self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.decoder_targets,weights=self.mask)

            # summary
            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()

            # optimizer
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(zip(clip_gradients, trainable_params))

        elif self.mode == 'inference':
            # start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.vocab_w2ix['<bos>']
            start_tokens = tf.tile(tf.constant([1], dtype=tf.int32), [self.batch_size])
            # end_token = self.vocab_w2ix['<eos>']
            end_token = tf.constant(2, dtype = tf.int32)
            # beam search decode
            if self.beam_search:
                # Replicate encoder infos beam_width times
                # decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                #     encoder_state, multiplier=self.beam_size)


                # Define a beam-search decoder
                inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=decoder_cell,
                                                                        embedding=self.decoder_embedding,
                                                                        start_tokens=start_tokens,
                                                                        end_token=end_token,
                                                                        initial_state=decoder_initial_state,
                                                                        beam_width=self.beam_size,
                                                                        output_layer=output_layer,
                                                                        length_penalty_weight=8.0)

                # Dynamic decoding
                inference_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, maximum_iterations=21, impute_finished = False)
                # self.decoder_predict_decode = inference_decoder_outputs.predicted_ids
                self.decoder_predict_decode = tf.identity(inference_decoder_outputs.predicted_ids)

            # ordinary greedy decoding
            else:

                decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.decoder_embedding, start_tokens=start_tokens,end_token=end_token)
                inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=decoding_helper,
                                                                    initial_state=decoder_initial_state, output_layer=output_layer)
                inference_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder, maximum_iterations=40, impute_finished = False)
                # self.decoder_predict_decode = tf.expand_dims(inference_decoder_outputs.sample_id, -1)
                # self.decoder_predict_decode = tf.identity(inference_decoder_outputs.sample_id)
                self.decoder_predict_decode = tf.identity(inference_decoder_outputs.rnn_output)
4

0 回答 0