I'm trying to convert a notebook I put together a while back on Colab (using ImageDataGenerator) into one that uses tf.data.Dataset, since I now have a multi-GPU setup and I'm trying to learn how to train faster. The model is trained on the age/gender/race dataset from Kaggle, but in this case I'm only interested in the gender and age predictions. Gender is either 0 or 1 and its loss function is binary crossentropy, while age is an integer between 0 and 120 and its loss function is mse, i.e. regression.
import tensorflow as tf
import os
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 64
#Load datasets from directories
train_gen = tf.data.Dataset.list_files(os.listdir(training_dir), shuffle = False)
valid_gen = tf.data.Dataset.list_files(os.listdir(validation_dir), shuffle = False)
def decode_img(img):
    # Convert the compressed string into a 3D tensor
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    # Resize the image to the desired size
    return tf.image.resize(img, [128,128])
def get_label(file):
    gender = get_sex(file)  # returns either 0 or 1
    age = get_age(file)     # returns an integer between 0 and about 120
    return gender, age
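# Not shown above: get_sex and get_age just pull the labels out of the file name.
# A minimal sketch of what they do, assuming UTKFace-style names such as
# "26_0_1_20170116174525125.jpg" (age_gender_race_timestamp) -- illustration only,
# not the exact helpers from my notebook:
def get_age(file):
    return int(file.split('_')[0])  # first field is the age

def get_sex(file):
    return int(file.split('_')[1])  # second field is the gender (0 or 1)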
def process_path(file):
    file = file.numpy()
    file_path = str(bytes.decode(file))
    file = file_path.split(' ')[-1].split("\\")[-1]
    labels = get_label(file)
    # Load the data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    img = img / 255.0
    return img, labels
def _set_shapes(t1, t2):
    # tf.py_function loses static shape information, so set the shapes back explicitly
    t1.set_shape((128,128,3))
    t2.set_shape((2,))
    return (t1, t2)
train_gen = train_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32]), num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(lambda x: tf.py_function(process_path, [x], [tf.float32, tf.int32]), num_parallel_calls=AUTOTUNE)
train_gen = train_gen.map(_set_shapes,num_parallel_calls=AUTOTUNE)
valid_gen = valid_gen.map(_set_shapes, num_parallel_calls=AUTOTUNE)
train_gen = train_gen.batch(batch_size)
valid_gen = valid_gen.batch(batch_size)
train_gen
Output: <BatchDataset shapes: ((None, 128, 128, 3), (None, 2)), types: (tf.float32, tf.int32)>
#configure for performance
def config_for_performance(ds):
    ds = ds.cache()
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
train_gen = config_for_performance(train_gen)
valid_gen = config_for_performance(valid_gen)
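# Quick sanity check I run on the pipeline (a sketch, not part of the notebook itself):
# each element should be a batch of images plus a (batch, 2) label tensor of (gender, age).
for imgs, labels in train_gen.take(1):
    print(imgs.shape)    # e.g. (64, 128, 128, 3)
    print(labels.shape)  # e.g. (64, 2)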
The model itself:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Dropout, Input, Activation, Flatten, BatchNormalization, PReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import BinaryCrossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
gpus = tf.config.list_logical_devices('GPU')
#print(gpus)
strategy = tf.distribute.MirroredStrategy(gpus,cross_device_ops=tf.distribute.ReductionToOneDevice())
with strategy.scope():
    #Define the convolution layers
    inp = Input(shape=(128,128,3))
    cl1 = Conv2D(32,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(inp)
    bn1 = BatchNormalization()(cl1)
    pr1 = PReLU(alpha_initializer='he_uniform')(bn1)
    cl2 = Conv2D(32,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(pr1)
    bn2 = BatchNormalization()(cl2)
    pr2 = PReLU(alpha_initializer='he_uniform')(bn2)
    mp1 = MaxPool2D((2,2))(pr2)
    cl3 = Conv2D(64,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp1)
    bn3 = BatchNormalization()(cl3)
    pr3 = PReLU(alpha_initializer='he_uniform')(bn3)
    cl4 = Conv2D(64,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(pr3)
    bn4 = BatchNormalization()(cl4)
    pr4 = PReLU(alpha_initializer='he_uniform')(bn4)
    mp2 = MaxPool2D((2,2))(pr4)
    cl5 = Conv2D(128,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp2)
    bn5 = BatchNormalization()(cl5)
    pr5 = PReLU(alpha_initializer='he_uniform')(bn5)
    mp3 = MaxPool2D((2,2))(pr5)
    cl6 = Conv2D(256,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp3)
    bn6 = BatchNormalization()(cl6)
    pr6 = PReLU(alpha_initializer='he_uniform')(bn6)
    mp4 = MaxPool2D((2,2))(pr6)
    cl7 = Conv2D(512,(3,3), padding='same', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(mp4)
    bn7 = BatchNormalization()(cl7)
    pr7 = PReLU(alpha_initializer='he_uniform')(bn7)
    mp5 = MaxPool2D((2,2))(pr7)
    flt = Flatten()(mp5)

    #This layer predicts age
    agelayer = Dense(128, activation='relu', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(flt)
    agelayer = BatchNormalization()(agelayer)
    agelayer = Dropout(0.6)(agelayer)
    agelayer = Dense(1, activation='relu', name='age_output', kernel_initializer='he_uniform', dtype='float32')(agelayer)

    #This layer predicts gender
    glayer = Dense(128, activation='relu', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(flt)
    glayer = BatchNormalization()(glayer)
    glayer = Dropout(0.5)(glayer)
    glayer = Dense(1, activation='sigmoid', name='gender_output', kernel_initializer='he_uniform', dtype='float32')(glayer)

    modelA = Model(inputs=inp, outputs=[glayer, agelayer])
model_folder = 'C:/Users/mm/OneDrive/Documents/Age estimation & gender classification/models'
if not os.path.exists(model_folder):
    os.mkdir(model_folder)
#Callback to control the learning rate during training. Reduces the learning rate by 5% after 3 epochs of no improvement on the validation loss
lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.95, patience=3, min_lr=0.000005)
#Callback to stop training after 100 epochs of no improvement on the validation loss and restore the best weights
es_callback = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True, min_delta=0.001)
#Compile Model A
modelA.compile(optimizer='Adam', loss={'gender_output': BinaryCrossentropy(), 'age_output': 'mse'}, metrics={'gender_output': 'accuracy', 'age_output':'mae'})
#Training Model A
history = modelA.fit(train_gen, epochs=100, validation_data=valid_gen, callbacks=[es_callback,lr_callback])
The error message:
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Epoch 1/100
INFO:tensorflow:Error reported to Coordinator: logits and labels must have the same shape ((None, 1) vs (None, 2))
Traceback (most recent call last):
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\ops\nn_impl.py", line 130, in sigmoid_cross_entropy_with_logits
    labels.get_shape().assert_is_compatible_with(logits.get_shape())
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\framework\tensor_shape.py", line 1161, in assert_is_compatible_with
    raise ValueError("Shapes %s and %s are incompatible" % (self, other))
ValueError: Shapes (None, 2) and (None, 1) are incompatible

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\training\coordinator.py", line 297, in stop_on_exception
    yield
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\distribute\mirrored_run.py", line 346, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 692, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 382, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\autograph\impl\api.py", line 463, in _call_unconverted
    return f(*args, **kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 835, in run_step
    outputs = model.train_step(data)
  ...
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\util\dispatch.py", line 206, in wrapper
    return target(*args, **kwargs)
  File "C:\Users\mm\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\ops\nn_impl.py", line 132, in sigmoid_cross_entropy_with_logits
    raise ValueError("logits and labels must have the same shape (%s vs %s)" %
ValueError: logits and labels must have the same shape ((None, 1) vs (None, 2))
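If it helps, the two output heads each produce shape (None, 1), while the dataset yields a single (None, 2) label tensor per batch, which seems to be what the error is complaining about. A quick check along these lines (not part of the notebook, just to show what I'm seeing):

print(modelA.output_shape)     # [(None, 1), (None, 1)] -- gender_output and age_output
print(train_gen.element_spec)  # (TensorSpec(shape=(None, 128, 128, 3), ...), TensorSpec(shape=(None, 2), ...))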