
I'm trying to convert a notebook I built a while ago on Colab (using ImageDataGenerator) into one that uses tf.data.Dataset, because I now have a multi-GPU setup and want to learn how to train faster. The model is trained on the age/gender/race dataset from Kaggle, but in this case we're only interested in the gender and age predictions. Gender is either 0 or 1 and its loss function is binary cross-entropy, while age is an integer between 0 and about 120 and its loss function is MSE, i.e. a regression.

import tensorflow as tf
import os

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 64

#Load datasets from directories
train_gen = tf.data.Dataset.list_files(os.listdir(training_dir), shuffle = False)
valid_gen = tf.data.Dataset.list_files(os.listdir(validation_dir), shuffle = False)

def decode_img(img):
    #Convert compressed string into a 3D tensor
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    #Resize the image to the desired size
    return tf.image.resize(img, [128,128])


def get_label(file):
    
    gender = get_sex(file)  # returns either 0 or 1
    age = get_age(file)     # returns an integer between 0 and about 120
    return gender, age
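
get_sex and get_age simply parse the labels out of the file name; a minimal sketch, assuming the UTKFace-style naming age_gender_race_date.jpg used by that Kaggle dataset:

def get_sex(file):
    # Hypothetical helper: assumes names like "25_0_1_20170116174525125.jpg",
    # where the second underscore-separated field is the gender (0 or 1)
    return int(file.split('_')[1])

def get_age(file):
    # Hypothetical helper: the first underscore-separated field is the age in years
    return int(file.split('_')[0])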

def process_path(file):
    file = file.numpy()
    file_path = str(bytes.decode(file))
    file = file_path.split(' ')[-1].split("\\")[-1]
    labels = get_label(file)
    # Load data from file as a String
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    img = img / 255.0

    return img, labels


def _set_shapes(t1, t2):
    t1.set_shape((128,128,3))
    t2.set_shape((2,))
        
    return (t1,t2)


train_gen = train_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32]), num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32]), num_parallel_calls=AUTOTUNE)

train_gen = train_gen.map(_set_shapes,num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(_set_shapes, num_parallel_calls=AUTOTUNE)


train_gen = train_gen.batch(batch_size)
valid_gen = valid_gen.batch(batch_size)

train_gen

Output: <BatchDataset shapes: ((None, 128, 128, 3), (None, 2)), types: (tf.float32, tf.int32)>
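
A quick way to sanity-check what the pipeline will feed the model (not part of the original notebook) is to pull a single batch and print its shapes:

# Inspect one batch: the images and the combined label tensor
for images, labels in train_gen.take(1):
    print(images.shape)   # e.g. (64, 128, 128, 3)
    print(labels.shape)   # e.g. (64, 2) - one tensor holding both gender and age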

#configure for performance
def config_for_performance(ds):
    ds = ds.cache()
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

train_gen = config_for_performance(train_gen)
valid_gen = config_for_performance(valid_gen)

The model itself:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Dropout, Input, Activation, Flatten, BatchNormalization, PReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision

mixed_precision.set_global_policy('mixed_float16')

gpus = tf.config.list_logical_devices('GPU')
#print(gpus)
strategy = tf.distribute.MirroredStrategy(gpus,cross_device_ops=tf.distribute.ReductionToOneDevice())
with strategy.scope():

    #Define the convolution layers
    inp = Input(shape=(128,128,3))
    cl1 = Conv2D(32,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(inp)
    bn1 = BatchNormalization()(cl1)
    pr1 = PReLU(alpha_initializer='he_uniform')(bn1)
    cl2 = Conv2D(32,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(pr1)
    bn2 = BatchNormalization()(cl2)
    pr2 = PReLU(alpha_initializer='he_uniform')(bn2)
    mp1 = MaxPool2D((2,2))(pr2)
    cl3 = Conv2D(64,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp1)
    bn3 = BatchNormalization()(cl3)
    pr3 = PReLU(alpha_initializer='he_uniform')(bn3)
    cl4 = Conv2D(64,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(pr3)
    bn4 = BatchNormalization()(cl4)
    pr4 = PReLU(alpha_initializer='he_uniform')(bn4)
    mp2 = MaxPool2D((2,2))(pr4)
    cl5 = Conv2D(128,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp2)
    bn5 = BatchNormalization()(cl5)
    pr5 = PReLU(alpha_initializer='he_uniform')(bn5)
    mp3 = MaxPool2D((2,2))(pr5)
    cl6 = Conv2D(256,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp3)
    bn6 = BatchNormalization()(cl6)
    pr6 = PReLU(alpha_initializer='he_uniform')(bn6)
    mp4 = MaxPool2D((2,2))(pr6)
    cl7 = Conv2D(512,(3,3), padding='same',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp4)
    bn7 = BatchNormalization()(cl7)
    pr7 = PReLU(alpha_initializer='he_uniform')(bn7)
    mp5 = MaxPool2D((2,2))(pr7)
    flt = Flatten()(mp5)

    #This layer predicts age
    agelayer = Dense(128, activation='relu',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(flt)
    agelayer = BatchNormalization()(agelayer)
    agelayer = Dropout(0.6)(agelayer)
    agelayer = Dense(1, activation='relu', name='age_output', kernel_initializer='he_uniform', dtype='float32')(agelayer)

    #This layer predicts gender
    glayer = Dense(128, activation='relu',kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(flt)
    glayer = BatchNormalization()(glayer)
    glayer = Dropout(0.5)(glayer)
    glayer = Dense(1, activation='sigmoid', name='gender_output', kernel_initializer='he_uniform', dtype='float32')(glayer)

    modelA = Model(inputs=inp, outputs=[glayer,agelayer])

    model_folder = 'C:/Users/mm/OneDrive/Documents/Age estimation & gender classification/models'

    if not os.path.exists(model_folder):
        os.mkdir(model_folder)

    #Callback to control learning rate during training. Reduces learning rate by 5% after 3 epochs of no improvement on validation loss
    lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.95, patience=3,min_lr=0.000005)

    #Callback to stop training after 100 epochs of no improvement on validation loss, restoring the best weights
    es_callback = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True, min_delta=0.001)

    #Compile Model A
    modelA.compile(optimizer='Adam', loss={'gender_output': BinaryCrossentropy(), 'age_output': 'mse'}, metrics={'gender_output': 'accuracy', 'age_output':'mae'})

#Training Model A
history = modelA.fit(train_gen, epochs=100, validation_data=valid_gen, callbacks=[es_callback,lr_callback])
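
For reference, with two named output heads the dict keys passed to compile() have to match the output layer names, and the dataset has to provide one label per head. A quick check of both:

# The keys in the loss/metrics dicts must match these names exactly
print(modelA.output_names)     # ['gender_output', 'age_output']

# The dataset yields a single (None, 2) label tensor rather than two separate labels
print(train_gen.element_spec)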

Error message:

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Epoch 1/100
INFO:tensorflow:Error reported to Coordinator: logits and labels must have the same shape ((None, 1) vs (None, 2))
Traceback (most recent call last):
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\ops\nn_impl.py", line 130, in sigmoid_cross_entropy_with_logits
    labels.get_shape().assert_is_compatible_with(logits.get_shape())
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\framework\tensor_shape.py", line 1161, in assert_is_compatible_with
    raise ValueError("Shapes %s and %s are incompatible" % (self, other))
ValueError: Shapes (None, 2) and (None, 1) are incompatible

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\training\coordinator.py", line 297, in stop_on_exception
    yield
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\distribute\mirrored_run.py", line 346, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 692, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 382, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 463, in _call_unconverted
    return f(*args, **kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 835, in run_step
    outputs = model.train_step(data)
  ...
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\util\dispatch.py", line 206, in wrapper
    return target(*args, **kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\ops\nn_impl.py", line 132, in sigmoid_cross_entropy_with_logits
    raise ValueError("logits and labels must have the same shape (%s vs %s)" %
ValueError: logits and labels must have the same shape ((None, 1) vs (None, 2))


1 Answer


Managed to solve this with some research and trial and error. The main issues were the following (the changed pipeline lines are pulled together in a sketch after point 3):

  1. The labels were being fed into the model as one tuple instead of as separate tensors; splitting them is necessary when there are multiple output heads:
def process_path(file):
    file = file.numpy()
    file_path = str(bytes.decode(file))
    file = file_path.split("\\")[-1]
    gender, age = get_label(file)
    # Load data from file as a String
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    img = img / 255.0

    return img, gender, age

Note: I also changed how the label is extracted from the file name, because it was never coming out right:

file = file_path.split("\\")[-1]
  2. Because of the change in 1, the map calls need an extra dtype for the additional label, so they become:
train_gen = train_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32, tf.int32]), num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32, tf.int32]), num_parallel_calls=AUTOTUNE)
  3. Each label needs to be reshaped:
def _set_shapes(t1, t2, t3):
    t1.set_shape((128,128,3))
    t2.set_shape((1,))
    t3.set_shape((1,))
    t2 = tf.reshape(t2, [-1,1])
    t3 = tf.reshape(t3, [-1,1])
    return (t1,t2,t3)
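
Putting the three changes together, the data-pipeline part of the notebook now reads roughly like this (just the lines that changed, re-assembled; everything else stays as in the question):

train_gen = train_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32, tf.int32]), num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32, tf.int32]), num_parallel_calls=AUTOTUNE)

train_gen = train_gen.map(_set_shapes, num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(_set_shapes, num_parallel_calls=AUTOTUNE)

train_gen = train_gen.batch(batch_size)
valid_gen = valid_gen.batch(batch_size)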