1

我的 RNN GAN 网络由两个 RNN 网络组成——一个生成器和一个鉴别器——用于生成音频。但它的表现与预期完全相反,这非常奇怪:鉴别器损失在持续下降,而生成器损失在持续上升。更奇怪的是,这两条损失曲线几乎是线性地下降/上升的,完全不像正常 GAN 训练时应有的震荡表现。那么,这种行为是由什么原因引起的呢?是我哪里改错了吗?

损失:https://pastebin.com/78BmS8iK

鉴别器/生成器代码:

def build_audio_generator(frame_size):
    """Build the generator model.

    Maps a noise tensor of shape (frame_size, 1) to a per-timestep
    categorical distribution over 256 audio classes, i.e. an output of
    shape (frame_size, 256).

    Args:
        frame_size: number of timesteps in one audio frame.

    Returns:
        A Keras ``Model`` taking noise (frame_size, 1) and returning
        softmax probabilities (frame_size, 256).
    """
    model = Sequential()
    model.add(LSTM(512, input_shape=(frame_size, 1), return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.3))
    # Final LSTM collapses the sequence to a single 512-vector.
    model.add(LSTM(512))
    model.add(Dense(256))
    model.add(Dropout(0.3))
    model.add(Dense(256*frame_size))
    # FIX: reshape BEFORE applying softmax. The original code applied
    # softmax to the flat 256*frame_size vector, normalizing across every
    # timestep at once (the probabilities summed to 1 over the whole clip,
    # not per sample). Reshaping first lets Keras' softmax (default
    # axis=-1) produce a proper 256-way distribution for each timestep.
    model.add(Reshape((frame_size, 256)))
    model.add(Activation('softmax'))

    model.summary()

    noise = Input(shape=(frame_size, 1))

    sound = model(noise)

    return Model(noise, sound)

def build_audio_discriminator(audio_shape):
    """Build the discriminator model.

    Consumes audio of shape ``audio_shape`` and emits a sigmoid validity
    score.

    NOTE(review): there is no Flatten layer, so the Dense layers operate
    per timestep and ``validity`` has shape
    (batch, audio_shape[0] // 2, 1) — one score per pooled timestep, not
    a single scalar per sample. Confirm this matches the training
    targets.

    Args:
        audio_shape: input shape, e.g. (frame_size, 256).

    Returns:
        A Keras ``Model`` mapping audio to per-timestep validity scores.
    """
    feature_extractor = Sequential()
    for layer in (
        Conv1D(32, kernel_size=(2), padding="same", input_shape=audio_shape),
        MaxPooling1D(pool_size=(2)),
        Dropout(0.25),
        Dense(128, activation='relu'),
        Dropout(0.25),
        Dense(128),
    ):
        feature_extractor.add(layer)

    feature_extractor.summary()

    audio = Input(shape=audio_shape)

    # Extract feature representation
    features = feature_extractor(audio)

    # Determine validity and label of the audio
    validity = Dense(1, activation="sigmoid")(features)

    return Model(audio, validity)

主要(结合两种模型):

    # One "frame" of audio is 500 timesteps; frame_shift is declared but
    # not used in this snippet.
    frame_size = 500
    frame_shift = 128

    num_classes = 1

    # Discriminator input: (timesteps, 256) — matches the generator's
    # (frame_size, 256) softmax output.
    audio_shape_disc = (frame_size,256)

    # Adam with lr=2e-4, beta_1=0.5 — the usual DCGAN settings.
    optimizer = Adam(0.0002, 0.5)
    losses = ['binary_crossentropy']

    # Build and compile the discriminator
    audio_discriminator = build_audio_discriminator(audio_shape_disc)
    audio_discriminator.compile(loss=losses, optimizer=optimizer, metrics=['accuracy'])

    # Build the generator
    audio_generator = build_audio_generator(frame_size)

    # The generator takes noise
    noise = Input(shape=(frame_size, 1))

    audio = audio_generator(noise)

    # For the combined model we will only train the generator.
    # NOTE: setting trainable AFTER the discriminator was compiled means
    # the standalone discriminator still trains; only the combined model
    # (compiled below) sees it as frozen — the standard Keras GAN idiom.
    audio_discriminator.trainable = False

    # The discriminator takes generated audio as input and determines validity
    audio_valid = audio_discriminator(audio)

    # The combined model  (stacked generator and discriminator) takes
    # noise as input => generates audio => determines validity
    audio_combined = Model(noise, audio_valid)
    audio_combined.compile(loss=losses, optimizer=optimizer)

    # `epochs` is defined outside this snippet — presumably a global or
    # an enclosing-scope variable; verify before running.
    train(audio_generator, audio_discriminator, audio_combined, epochs, frame_size, frame_shift)

训练(train()函数缩小):

      # Slide over Y in frame_size steps. NOTE(review): Y presumably has
      # shape (batch, samples, channels) — confirm against the caller.
      for s in range(0, Y.shape[1], frame_size):

            audio = Y
            epoch_counter += 1


            if epoch_counter>epochs: break;

            # NOTE(review): hard-coded 500 should be frame_size — they
            # happen to be equal here, but this breaks if frame_size changes.
            audio_frame = audio[:, s:s+500, :]
            # Drop a trailing partial frame.
            if audio_frame.shape[1]<frame_size: break;

            # Standard-normal noise, batch of 1: (1, frame_size, 1).
            noise = np.random.normal(0, 1, (1, frame_size, 1))


            # Generate a half batch of new images
            gen_audio = generator.predict(noise)

            # Labels shaped (1, frame_size/2, 1): one label per pooled
            # timestep, matching the discriminator's per-timestep output
            # (frame_size halved by its MaxPooling1D).
            valid = np.ones((1, int(frame_size/2), 1))

            fake = np.zeros((1, int(frame_size/2), 1))


            # Train the discriminator
            d_loss_real = discriminator.train_on_batch(audio_frame, valid)
            d_loss_fake = discriminator.train_on_batch(gen_audio, fake)
            # Average the real/fake losses (and accuracy metric).
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train the generator: the frozen discriminator should call
            # the generated audio "valid".
            g_loss = combined.train_on_batch(noise, valid)

            # Plot the progress
            print(str(epoch_counter) + '/' + str(epochs) + ' > Discriminator loss: ' + str(d_loss[0]) + ' | Generator loss: ' +  str(g_loss))
4

0 回答 0