我尝试使用 seq2seq 模型来处理视频字幕问题。您可能知道没有使用新(1.40+ 版本)tensorflow 来执行此操作的更新版本。我只是重写整个事情。为了实现注意力,我只是尝试了“Luong”和“Bahdanau”。我试图通过打印正在发生的事情来可视化培训过程。这是 10 个 epoch 后的结果。
Current caption: <bos> 这个 摇 学会 了 可以 去 跟 你 的 她 告白 哦 <eos>
Predicted caption: <bos>这个摇学会了可以去跟你的她群魔乱舞哦<eos><pad><pad><pad><pad><pad><pad><pad>***
(这是一个中文应用程序。结果相当不错)。
奇怪的点:
- 我的预测中根本没有“eos”标签。
- 它可以产生与我的 max_target_sequence_length 集一样多的内容。
- 尝试了不同类型的解码。完全没有区别。
这是示例代码。
def build_model(self):
# define placeholders
self.encoder_inputs = tf.placeholder(tf.float32, [self.batch_size, self.n_frame, self.dim_image], name='encoder_inputs')
self.encoder_inputs_length = tf.placeholder(tf.int32, [None], name='encoder_inputs_length')
self.decoder_targets = tf.placeholder(tf.int32, [self.batch_size, None], name='decoder_targets')
self.decoder_targets_length = tf.placeholder(tf.int32, [None], name='decoder_targets_length')
self.max_target_sequence_length = tf.reduce_max(self.decoder_targets_length, name='max_target_len')
self.mask = tf.sequence_mask(self.decoder_targets_length, self.max_target_sequence_length, dtype=tf.float32, name='masks')
# define variables
with tf.device('/cpu:0'):
self.decoder_embedding = tf.get_variable('decoder_embedding', [self.n_words, self.decoder_embedding_size])
# define lookup table earlier.
decoder_inputs_embedded = tf.nn.embedding_lookup(self.decoder_embedding, self.decoder_targets)
self.encoder_image_W = tf.get_variable('encoder_image_W', [self.dim_image, self.dim_hidden])
self.encoder_image_b = tf.get_variable('encoder_image_b', [self.dim_hidden])
####################################### Encoder ##############################################################
# build image embedding
video_flat = tf.reshape(self.encoder_inputs, [-1, self.dim_image])
image_emb = tf.nn.xw_plus_b(video_flat, self.encoder_image_W, self.encoder_image_b)
# print image_emb.shape
encoder_inputs_now = tf.reshape(image_emb, [self.batch_size, self.n_frame, self.dim_hidden]) # change inputs here
# print encoder_inputs_now
# exit(-1)
if self.encoder_mode == "bi_directional":
with tf.variable_scope('encoder'):
# Construct forward and backward cells
forward_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
backward_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
bi_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
forward_cell, backward_cell, encoder_inputs_now, time_major=False, dtype=tf.float32)
encoder_outputs = tf.concat(bi_outputs, -1)
elif self.encoder_mode == "multi_layer":
with tf.variable_scope('encoder'):
def create_cell():
basic_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(self.dim_hidden), output_keep_prob=self.keep_prob)
return basic_cell
encoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell() for _ in range(self.num_layers)])
# encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, self.encoder_inputs, sequence_length=self.encoder_inputs_length, dtype=tf.float32)
##############################################################################################################
####################################### Decoder ##############################################################
with tf.variable_scope('decoder'):
if self.beam_search:
print "using beam search inferencing...................."
encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=self.beam_size)
encoder_inputs_length = tf.contrib.seq2seq.tile_batch(self.encoder_inputs_length, multiplier=self.beam_size)
# encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size), encoder_state)
encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, self.beam_size)
# else: just use what is there.
# create attention mechanism
if self.attention == "Luong":
# use Luong attention here
# Create an attention mechanism
attention_mechanism = tf.contrib.seq2seq.LuongAttention(self.dim_hidden, encoder_outputs, memory_sequence_length=self.encoder_inputs_length)
elif self.attention == "Bahdanau":
# use Bahdanau attention here
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=self.dim_hidden, memory=encoder_outputs, memory_sequence_length=self.encoder_inputs_length)
# get decoder cells ready
# this function is crate for multiRNNCell to work. otherwise, it wont copy params while copying
def create_cell():
basic_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(self.dim_hidden), output_keep_prob=self.keep_prob)
return basic_cell
decoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell() for _ in range(self.num_layers)])
# get wrapper to help
decoder_cell = tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell, attention_mechanism=attention_mechanism,
attention_layer_size=self.dim_hidden, name='Attention_Wrapper')
# expand batch size if beam size is used
# batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size
if self.beam_search:
decoder_initial_state = decoder_cell.zero_state(self.batch_size * self.beam_size, tf.float32)
decoder_initial_state = decoder_initial_state.clone(cell_state = encoder_state)
else:
# define intial state. connect with encoder final state
decoder_initial_state = decoder_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32).clone(cell_state=encoder_state)
# projection layer
output_layer = tf.layers.Dense(self.n_words, kernel_initializer=tf.truncated_normal_initializer(
mean=0.0, stddev=0.1))
# get basic decoder to work first
# decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
if self.mode == "train":
# define seq2seq training helper
training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_inputs_embedded,
sequence_length=self.decoder_targets_length,
time_major=False, name='training_helper')
training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=training_helper,
initial_state=decoder_initial_state,
output_layer=output_layer)
decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder, impute_finished=True,
maximum_iterations=self.max_target_sequence_length)
# get logits
self.decoder_logits_train = tf.identity(decoder_outputs.rnn_output)
# define the loss
self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.decoder_targets,weights=self.mask)
# summary
tf.summary.scalar('loss', self.loss)
self.summary_op = tf.summary.merge_all()
# optimizer
optimizer = tf.train.AdamOptimizer(self.learning_rate)
trainable_params = tf.trainable_variables()
gradients = tf.gradients(self.loss, trainable_params)
clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
self.train_op = optimizer.apply_gradients(zip(clip_gradients, trainable_params))
elif self.mode == 'inference':
# start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.vocab_w2ix['<bos>']
start_tokens = tf.tile(tf.constant([1], dtype=tf.int32), [self.batch_size])
# end_token = self.vocab_w2ix['<eos>']
end_token = tf.constant(2, dtype = tf.int32)
# beam search decode
if self.beam_search:
# Replicate encoder infos beam_width times
# decoder_initial_state = tf.contrib.seq2seq.tile_batch(
# encoder_state, multiplier=self.beam_size)
# Define a beam-search decoder
inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=decoder_cell,
embedding=self.decoder_embedding,
start_tokens=start_tokens,
end_token=end_token,
initial_state=decoder_initial_state,
beam_width=self.beam_size,
output_layer=output_layer,
length_penalty_weight=8.0)
# Dynamic decoding
inference_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, maximum_iterations=21, impute_finished = False)
# self.decoder_predict_decode = inference_decoder_outputs.predicted_ids
self.decoder_predict_decode = tf.identity(inference_decoder_outputs.predicted_ids)
# ordinary greedy decoding
else:
decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.decoder_embedding, start_tokens=start_tokens,end_token=end_token)
inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=decoding_helper,
initial_state=decoder_initial_state, output_layer=output_layer)
inference_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder, maximum_iterations=40, impute_finished = False)
# self.decoder_predict_decode = tf.expand_dims(inference_decoder_outputs.sample_id, -1)
# self.decoder_predict_decode = tf.identity(inference_decoder_outputs.sample_id)
self.decoder_predict_decode = tf.identity(inference_decoder_outputs.rnn_output)