我正在尝试使用 Clevr 数据集 ( https://cs.stanford.edu/people/jcjohns/clevr/ ) 为深度学习作业构建视觉问答模型(虽然我不能使用函数式程序表示)。
但是,我遇到了困难:网络没有正常学习,它在训练集和验证集上的准确率始终在 0.2 左右波动。此外,我第一次运行时,类似的架构曾达到过 0.4 的准确率,但我怀疑那是因为当时内存中还残留着之前试验嵌入矩阵构造和分词器(tokenizer)时留下的不同状态。
我已经尝试过更换嵌入(现在使用的是 GloVe)、更改维度,并以多种方式修改网络(我还打算尝试注意力机制和更高级的方法,但首先我希望看到它能够正常工作)。我确信其中存在某个致命错误(架构本身也很简单),但我就是找不到它。你能帮我弄清楚是哪里出了问题吗?
我把网络代码和数据输入管道的代码贴在下面,欢迎评论,也请指出我哪些地方的做法不好。很抱歉贴了这么多代码,但我真的不知道自己哪里做错了。
提前感谢。
这是网络的代码
import tensorflow as tf
# Build, compile and train the two-branch VQA classifier: a pretrained
# DenseNet201 image encoder and an LSTM question encoder whose outputs are
# concatenated and fed to a small dense classification head.
batch_size = 8
epochs = 100

# Alternative backbone that was also tried:
#arch = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
arch = tf.keras.applications.densenet.DenseNet201(
    include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))

# Freeze the first `freeze_until` backbone layers; the remaining ones stay
# trainable and are fine-tuned together with the new head.
freeze_until = 650
for layer in arch.layers[:freeze_until]:
    layer.trainable = False

# NOTE(review): data_generator feeds images scaled by 1/255 only. Check
# whether the ImageNet weights expect
# tf.keras.applications.densenet.preprocess_input instead -- TODO confirm.

# Image branch: one feature vector per image.
branch1 = arch.output
branch1 = tf.keras.layers.GlobalAveragePooling2D(data_format=None)(branch1)

# Question branch. The generators pad every question to `max_len` tokens, so
# the text input must have that length. `max_words` is the Tokenizer's
# vocabulary cap, not a sequence length, and using it here made the declared
# input shape disagree with the batches actually produced.
text_inputs = tf.keras.Input(shape=[max_len])
emb = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                input_length=max_len,
                                weights=[embedding_matrix],
                                trainable=False)(text_inputs)
# NOTE(review): questions are post-padded with index 0 and no masking is
# configured, so the LSTM also consumes the padding steps. Consider
# mask_zero=True on the Embedding layer -- TODO evaluate.

# Other question encoders that were tried:
#branch2 = tf.keras.layers.GlobalMaxPool1D()(emb)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
#branch2 = tf.keras.layers.Conv1D(128, 5, activation='relu')(emb)
branch2 = tf.keras.layers.LSTM(128)(emb)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(branch2)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)

# Fusion by concatenation, followed by the classification head.
joint = tf.keras.layers.concatenate([branch1, branch2])
joint = tf.keras.layers.Dense(512, activation='relu')(joint)
joint = tf.keras.layers.Dropout(0.2)(joint)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)

model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])
model.summary()

# The generator yields one-hot targets, hence categorical cross-entropy.
loss = tf.keras.losses.CategoricalCrossentropy()
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = []
callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True))
callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))

# Pass batch_size explicitly so the value configured above is the one the
# generators actually use, instead of their own default.
# NOTE(review): fit_generator is deprecated in recent TensorFlow releases in
# favour of model.fit, which accepts generators directly -- TODO migrate once
# the target TF version is confirmed.
model.fit_generator(data_generator('train', batch_size),
                    validation_data=data_generator('validation', batch_size),
                    steps_per_epoch=240,
                    validation_steps=120,
                    epochs=epochs,
                    callbacks=callbacks,
                    verbose=1)
这是生成器+嵌入的代码
import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# --- Data / model hyper-parameters -------------------------------------
img_h = 320           # image height used for the CNN input shape
img_w = 480           # image width used for the CNN input shape
max_words = 100       # passed to the Tokenizer as num_words
embedding_dim = 40    # number of components kept from each pretrained vector
num_classes = 13      # size of the softmax output
val_split = 0.8       # leading fraction of the questions used for training
max_len = 25          # length every tokenized question is padded/truncated to

# --- Answer vocabulary and its encoders --------------------------------
# The answers are listed in the lexicographic order of their string form.
classes = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'no', 'yes']

# String label -> integer id.
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)

# Integer id -> one-hot row; fitted on a column vector holding every class id.
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoder_.transform(classes).reshape(len(classes), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)
def data_generator(mode, batch_size=8):
    """Yield endless random batches ``([images, questions], answers)``.

    The annotated questions are read once from ``train_data.json``. The first
    ``val_split`` fraction of them is the training pool and the remainder is
    the validation pool.

    Args:
        mode: ``'train'`` samples from the training pool, ``'validation'``
            from the validation pool; any other value samples from all
            questions.
        batch_size: Number of samples per yielded batch.

    Yields:
        A tuple ``([batch_x_img, batch_x_txt], batch_y)`` where
        ``batch_x_img`` stacks the RGB images scaled by 1/255,
        ``batch_x_txt`` holds the tokenized questions post-padded to
        ``max_len``, and ``batch_y`` is the one-hot encoded answers.

    Raises:
        ValueError: If ``batch_size`` exceeds the size of the selected pool
            (raised by ``random.sample``).
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        questions = json.load(f)['questions']

    # The pool of candidate indices depends only on `mode`, so it is computed
    # once instead of on every batch.
    n_questions = len(questions)
    split = int(n_questions * val_split)
    if mode == 'validation':
        candidates = range(split, n_questions)
    elif mode == 'train':
        candidates = range(0, split)
    else:
        candidates = range(0, n_questions)

    while True:
        # Sample without replacement inside a batch.
        batch_addresses = random.sample(candidates, batch_size)

        images, texts, answers = [], [], []
        for i in batch_addresses:
            sample = questions[i]
            # The context manager closes the underlying file handle, which
            # would otherwise leak in this never-ending loop.
            with Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/' + sample['image_filename']) as img:
                images.append(np.true_divide(np.array(img.convert('RGB')), 255))
            texts.append(sample['question'])
            answers.append(sample['answer'])

        # Assemble the batch once, after all samples have been collected.
        batch_x_img = np.array(images)
        tokenized = tokenizer.texts_to_sequences(texts)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        # Answer string -> integer id -> one-hot row.
        y_c = integer_encoder_.transform(np.array(answers))
        y_c = y_c.reshape(len(y_c), 1)
        batch_y = onehot_encoder_.transform(y_c)

        yield ([batch_x_img, batch_x_txt], batch_y)
def test_generator():
    """Yield one ``([image, question], question_id)`` item per test question.

    Questions are read from ``test_data.json`` and yielded in file order, so
    the generator is exhausted after the last question. The original
    ``while i <= len(...)`` loop indexed one past the end and raised
    ``IndexError`` instead of finishing.

    Yields:
        A tuple ``([batch_x_img, batch_x_txt], question_id)`` where
        ``batch_x_img`` is a single-image batch scaled by 1/255 and
        ``batch_x_txt`` is the tokenized question post-padded to ``max_len``.
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
        questions = json.load(f)['questions']

    for sample in questions:
        # The context manager closes the image's file handle after decoding.
        with Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/' + sample['image_filename']) as img:
            input_img = np.true_divide(np.array(img.convert('RGB')), 255)
        # Add the leading batch axis expected by the model (batch of one).
        batch_x_img = np.expand_dims(input_img, 0)

        tokenized = tokenizer.texts_to_sequences([sample['question']])
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        yield ([batch_x_img, batch_x_txt], sample['question_id'])
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for *word_index* from a text vector file.

    Each line of the file is expected to be ``token v1 v2 ... vN`` separated
    by whitespace. Only the first ``embedding_dim`` components of each vector
    are kept.

    Args:
        filepath: Path to the pretrained vector file (UTF-8 text).
        word_index: Mapping from word to its integer row index (>= 1).
        embedding_dim: Number of vector components to keep per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Row 0 and the rows
        of words not found in the file are all zeros.

    Raises:
        ValueError: If a matching vector has fewer than ``embedding_dim``
            components.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Words still waiting for a vector. This replaces the old counter, which
    # stopped one word early and could leave a vocabulary word unfilled.
    missing = set(word_index)

    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not missing:
                break  # every word has its vector; skip the rest of the file
            fields = line.split()
            if not fields or fields[0] not in missing:
                continue
            try:
                values = np.array(fields[1:], dtype=np.float32)
            except ValueError:
                # Some entries have a token that itself contains whitespace
                # (e.g. "to name.domain"), so split() leaves non-numeric
                # fields in the vector part. Skip those lines.
                continue
            if values.shape[0] < embedding_dim:
                raise ValueError(
                    f"vector for {fields[0]!r} has {values.shape[0]} components, "
                    f"expected at least {embedding_dim}"
                )
            embedding_matrix[word_index[fields[0]]] = values[:embedding_dim]
            missing.discard(fields[0])

    return embedding_matrix
def create_tokens(tokenizer):
    """Fit *tokenizer* on every question of the training set and return it.

    Args:
        tokenizer: Object exposing ``fit_on_texts``; it is fitted in place.

    Returns:
        The same tokenizer instance, after fitting.
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as handle:
        dataset = json.load(handle)

    # Collect the raw question strings in file order.
    corpus = [entry['question'] for entry in dataset['questions']]
    tokenizer.fit_on_texts(corpus)
    return tokenizer
# Fit the question tokenizer: num_words is set to max_words and 'OOV' is the
# out-of-vocabulary token.
tokenizer = create_tokens(Tokenizer(num_words=max_words, oov_token='OOV'))

# Earlier experiment with a different embedding file:
#embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim)
import os

# Size of the embedding table: one row per known word plus the reserved row 0.
vocab_size = len(tokenizer.word_index) + 1

# Use the first entry returned by listdir for the GloVe dataset directory as
# the vector file.
filepath = os.path.join("../input/glove840b300dtxt/",
                        os.listdir("../input/glove840b300dtxt/")[0])
embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim)

# Training batch generator instance.
reader = data_generator('train')
PS:我原以为把 GlobalAveragePooling 层换成 Flatten 层可能会解决这个问题,但并没有。