My RNN GAN consists of two RNN networks, a generator and a discriminator, and is meant to generate audio. But it is doing the exact opposite of what it should, which is really strange: the discriminator loss keeps decreasing while the generator loss keeps increasing. It gets even stranger, because the losses actually decrease/increase almost perfectly linearly, as if that were the intended behavior, but it obviously isn't. So what could be causing this? Did I wire something up wrong?
Losses: https://pastebin.com/78BmS8iK
Discriminator/generator code:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Reshape
from keras.layers import LSTM, Conv1D, MaxPooling1D
from keras.optimizers import Adam
import numpy as np

def build_audio_generator(frame_size):
    model = Sequential()
    # Stacked LSTMs over the noise sequence
    model.add(LSTM(512, input_shape=(frame_size, 1), return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(512))
    model.add(Dense(256))
    model.add(Dropout(0.3))
    # Flattened output: the softmax runs over all 256*frame_size units,
    # then the result is reshaped to (frame_size, 256)
    model.add(Dense(256 * frame_size))
    model.add(Activation('softmax'))
    model.add(Reshape((frame_size, 256)))
    model.summary()

    noise = Input(shape=(frame_size, 1))
    sound = model(noise)

    return Model(noise, sound)
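To make sure I understand what this outputs, here is a quick sanity check I ran (a sketch, assuming frame_size = 500; the gen/test_noise names are just for the check):

# Quick shape check on the generator
gen = build_audio_generator(500)
test_noise = np.random.normal(0, 1, (1, 500, 1))
out = gen.predict(test_noise)

print(out.shape)  # (1, 500, 256)
# Note: the softmax is applied before the Reshape, so it normalizes over
# all 256*500 = 128000 units at once -- the entire output tensor sums to
# ~1.0, not to 1.0 per timestep:
print(out.sum())  # ~1.0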
def build_audio_discriminator(audio_shape):
    model = Sequential()
    model.add(Conv1D(32, kernel_size=2, padding="same", input_shape=audio_shape))
    model.add(MaxPooling1D(pool_size=2))  # halves the time axis
    model.add(Dropout(0.25))
    # Dense layers act per timestep when the input is 3D
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(128))
    model.summary()

    audio = Input(shape=audio_shape)
    # Extract feature representation
    features = model(audio)
    # Determine validity of the audio, one score per pooled timestep
    validity = Dense(1, activation="sigmoid")(features)

    return Model(audio, validity)
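And the same check for the discriminator, which shows it outputs one validity score per pooled timestep rather than a single scalar (this is why the labels in the training loop below are shaped (1, 250, 1)):

# Shape check on the discriminator (a sketch, assuming frame_size = 500)
disc = build_audio_discriminator((500, 256))
dummy = np.random.rand(1, 500, 256)

# Conv1D with padding="same" keeps 500 timesteps, MaxPooling1D halves them
# to 250, and the Dense layers act per timestep, so:
print(disc.predict(dummy).shape)  # (1, 250, 1)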
Main (combining the two models):
frame_size = 500
frame_shift = 128
num_classes = 1
audio_shape_disc = (frame_size, 256)

optimizer = Adam(0.0002, 0.5)
losses = ['binary_crossentropy']

# Build and compile the discriminator
audio_discriminator = build_audio_discriminator(audio_shape_disc)
audio_discriminator.compile(loss=losses, optimizer=optimizer, metrics=['accuracy'])

# Build the generator
audio_generator = build_audio_generator(frame_size)

# The generator takes noise as input
noise = Input(shape=(frame_size, 1))
audio = audio_generator(noise)

# For the combined model we will only train the generator
audio_discriminator.trainable = False

# The discriminator takes generated audio as input and determines validity
audio_valid = audio_discriminator(audio)

# The combined model (stacked generator and discriminator) takes
# noise as input => generates audio => determines validity
audio_combined = Model(noise, audio_valid)
audio_combined.compile(loss=losses, optimizer=optimizer)

train(audio_generator, audio_discriminator, audio_combined, epochs, frame_size, frame_shift)
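One thing I already double-checked: as far as I know, Keras snapshots the trainable flag at compile time, so compiling the discriminator before setting trainable = False and compiling the combined model after should give exactly the usual GAN setup. This check (a sketch relying on that Keras 2 behavior) seems to confirm it:

# The combined model was compiled with the discriminator frozen, so its
# trainable weights should be exactly the generator's weights:
print(len(audio_generator.trainable_weights))
print(len(audio_combined.trainable_weights))  # same number as above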
Training (the train() function, abridged):
for s in range(0, Y.shape[1], frame_size):
    audio = Y
    epoch_counter += 1
    if epoch_counter > epochs:
        break

    # Slice one frame_size-long window out of the training audio
    audio_frame = audio[:, s:s + frame_size, :]
    if audio_frame.shape[1] < frame_size:
        break

    noise = np.random.normal(0, 1, (1, frame_size, 1))

    # Generate a batch of fake audio
    gen_audio = generator.predict(noise)

    # The discriminator outputs one validity score per pooled timestep,
    # hence the (1, frame_size/2, 1) label shape
    valid = np.ones((1, int(frame_size / 2), 1))
    fake = np.zeros((1, int(frame_size / 2), 1))

    # Train the discriminator on real and generated frames
    d_loss_real = discriminator.train_on_batch(audio_frame, valid)
    d_loss_fake = discriminator.train_on_batch(gen_audio, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train the generator (through the combined model, discriminator frozen)
    g_loss = combined.train_on_batch(noise, valid)

    # Plot the progress
    print(str(epoch_counter) + '/' + str(epochs) + ' > Discriminator loss: ' + str(d_loss[0]) + ' | Generator loss: ' + str(g_loss))
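For what it's worth, the one thing I suspect (but haven't confirmed) is the generator's output activation: since the softmax runs over the flattened 256*frame_size vector, every output value is on the order of 1/128000, while the real one-hot frames contain exact 0s and 1s, so the discriminator might be separating them trivially. This is a variant I'm considering, with the softmax applied per timestep (the v2 name is mine, and this is not a confirmed fix):

def build_audio_generator_v2(frame_size):
    # Same as above, but reshape first so the softmax normalizes over the
    # 256 classes of each timestep instead of over all 256*frame_size units
    model = Sequential()
    model.add(LSTM(512, input_shape=(frame_size, 1), return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(512))
    model.add(Dense(256))
    model.add(Dropout(0.3))
    model.add(Dense(256 * frame_size))
    model.add(Reshape((frame_size, 256)))
    model.add(Activation('softmax'))  # softmax over the last axis (256 classes)

    noise = Input(shape=(frame_size, 1))
    return Model(noise, model(noise))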