I'm trying to implement Baidu's DeepSpeech1 in Keras with the CTC loss. My code is as follows:

import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Activation, Dropout, TimeDistributed, Bidirectional, LSTM, Lambda
from keras.activations import relu
from keras.backend import ctc_batch_cost
from keras.utils import Sequence

# get_max_time, make_mfcc_shape, get_maxseq_len and get_intseq are my own
# feature-extraction helpers, defined elsewhere.

class dataGen(Sequence): # data generator for Mozilla Common Voice
    def __init__(self, audiopaths, transcripts, batch_size):
        self.x = audiopaths
        self.y = transcripts
        self.batch_size = batch_size

    def __len__(self):
        return int(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        batch_x = self.x[idx*self.batch_size : (idx+1)*self.batch_size]
        batch_y = self.y[idx*self.batch_size : (idx+1)*self.batch_size]

        x_val = [get_max_time(file_name) for file_name in batch_x]
        max_val = max(x_val)

        x_data = np.array([make_mfcc_shape(file_name, padlen=max_val) for file_name in batch_x]) # converts the audio to MFCC features, padded to max_val frames

        y_val = [get_maxseq_len(l) for l in batch_y]
        max_y = max(y_val)

        labels = np.array([get_intseq(l, max_intseq_length=max_y) for l in batch_y])

        input_length = np.array(x_val)

        label_length = np.array(y_val)

        return [x_data, labels, input_length, label_length], np.zeros((self.batch_size,)), [None] 

    def on_epoch_end(self):
        # shuffle between epochs, keeping audio paths and transcripts aligned
        i = np.arange(len(self.x))
        np.random.shuffle(i)
        self.x = self.x[i]
        self.y = self.y[i]


def clipped_relu(x):
    # ReLU clipped at 20, as in DeepSpeech
    return relu(x, max_value=20)

def ctc_lambda_func(args):
    # computes the actual CTC loss inside the graph
    y_pred, labels, input_length, label_length = args
    return ctc_batch_cost(labels, y_pred, input_length, label_length)

def ctc(y_true, y_pred):
    # dummy Keras loss: y_pred is already the CTC loss from the Lambda layer
    return y_pred

input_data = Input(name='the_input', shape=(None, 26))

inner = TimeDistributed(Dense(2048))(input_data)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
inner = TimeDistributed(Dense(2048))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
inner = TimeDistributed(Dense(2048))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)

inner = Bidirectional(LSTM(2048, return_sequences=True))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)

output = TimeDistributed(Dense(28, activation="softmax"))(inner)

labels = Input(name='the_labels', shape=[None,])
input_length = Input(name='input_length', shape=[1])
label_length = Input(name='label_length', shape=[1])

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([output, labels, input_length, label_length])

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

model.compile(optimizer='adam', loss=ctc)

This is all fairly standard, but during training my model typically reaches a loss somewhere between 100 and 200 (down from >1000) and then stops improving. When I test it (removing the Lambda layer to get the transcription output), it only outputs blank characters.
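This is roughly how I test it (a minimal sketch, not my exact script: I build a prediction model that stops at the softmax output and greedy-decode with keras.backend.ctc_decode; data_gen and int_to_text are stand-ins for my generator instance and my index-to-character mapping):

from keras import backend as K
from keras.models import Model

# prediction model: same graph, cut off before the CTC Lambda layer
pred_model = Model(inputs=input_data, outputs=output)

# data_gen: an instance of the dataGen class above (stand-in name)
[x_data, _, input_length, _], _, _ = data_gen[0]

y_pred = pred_model.predict(x_data)          # shape (batch, time, 28) softmax probabilities

# greedy CTC decoding: collapses repeats and removes blanks
decoded, _ = K.ctc_decode(y_pred, input_length, greedy=True)
sequences = K.eval(decoded[0])               # dense array, padded with -1

for seq in sequences:
    # int_to_text: stand-in for my index -> character mapping
    print(''.join(int_to_text(c) for c in seq if c != -1))   # comes out empty every time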

My theory is that it learns to output only blank characters because that gives a lower loss than random characters, but it then gets stuck in that local minimum and never actually learns to transcribe the audio.
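To check this, I look at the per-frame probabilities (a minimal sketch; I'm assuming the blank class is the last of the 28 outputs, which is the TensorFlow backend's default for ctc_batch_cost, and y_pred is the softmax output from the prediction model above):

import numpy as np

blank_index = 27                                  # assumption: blank = last of the 28 classes
argmax = y_pred.argmax(axis=-1)                   # most likely class at each frame

print('fraction of frames predicted as blank:', np.mean(argmax == blank_index))
print('mean blank probability:', y_pred[..., blank_index].mean())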

Does anyone know of any tricks to get around this?
