I am training an image classification model on 21,000 images. I built the input pipeline with TensorFlow's tf.data API, but training is still far too slow even with it. I also have the GPU version of TensorFlow installed. Please help. At first I thought the Keras ImageDataGenerator was what was slowing my training down, but now that I have switched to a tf.data pipeline it still does not seem to be using my GPU. My entire code is below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50, EfficientNetB3, InceptionV3, DenseNet121
from tensorflow.keras.optimizers import Adam
# ignoring warnings
import warnings
warnings.simplefilter("ignore")
import os,cv2
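# Quick sanity check (standard TF 2.x calls): confirm TensorFlow was built with
# CUDA and actually sees the GPU. If the list below is empty, training silently
# falls back to the CPU, which would explain slow epochs despite tf.data.
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Visible GPUs:", tf.config.list_physical_devices('GPU'))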
base_dir = "D:/cassava-leaf-disease-classification/"
train_csv = pd.read_csv("D:/cassava-leaf-disease-classification/train.csv")
# print(train_csv.head())
df_sample = pd.read_csv("D:/cassava-leaf-disease-classification/sample_submission.csv")
train_images = "D:/cassava-leaf-disease-classification/train_images/"+train_csv['image_id']
# print(train_images)
# print(os.listdir(train_images))
train_labels = pd.read_csv(os.path.join(base_dir, "train.csv"))
# print(train_labels)
BATCH_SIZE = 16
EPOCHS = 25
STEPS_PER_EPOCH = int(len(train_labels) * 0.8 // BATCH_SIZE)  # steps_per_epoch must be an integer
TARGET_SIZE = 300
# train_labels['label'] = train_labels.label.astype('str')
labels = train_labels.iloc[:,-1].values
# print(labels)
def build_decoder(with_labels=True, target_size=(TARGET_SIZE, TARGET_SIZE), ext='jpg'):
    def img_decode(img_path):
        file_bytes = tf.io.read_file(img_path)
        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        elif ext in ['jpg', 'jpeg']:
            img = tf.image.decode_jpeg(file_bytes, channels=3)
        else:
            raise ValueError("Image extension not supported")
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)
        return img

    def decode_with_labels(img_path, label):
        return img_decode(img_path), label

    if with_labels:
        return decode_with_labels
    else:
        return img_decode
def build_augmenter(with_labels=True):
    def augment(img):
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        img = tf.image.random_brightness(img, 0.1)
        img = tf.image.random_contrast(img, 0.9, 1.1)
        img = tf.image.random_saturation(img, 0.9, 1.1)
        return img

    def augment_with_labels(img, label):
        return augment(img), label

    if with_labels:
        return augment_with_labels
    else:
        return augment
def build_dataset(paths, labels=None, bsize=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)
    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)
    if augment_fn is None:
        augment_fn = build_augmenter(labels is not None)
    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)
    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    # dset = dset.cache(cache_dir) if cache else dset
    dset = dset.map(augment_fn, num_parallel_calls=AUTO) if augment else dset
    dset = dset.repeat() if repeat else dset
    dset = dset.shuffle(shuffle) if shuffle else dset
    dset = dset.batch(bsize).prefetch(AUTO)
    return dset
# Train test split
(train_img, valid_img, train_labels, valid_labels) = train_test_split(
    train_images, labels, train_size=0.8, random_state=0)
# print(train, valid)
# Tensorflow datasets
train_df = build_dataset(
    train_img, train_labels, bsize=BATCH_SIZE,
    cache=True)
valid_df = build_dataset(
    valid_img, valid_labels, bsize=BATCH_SIZE,
    repeat=False, shuffle=False, augment=False,
    cache=True)
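# Rough input-pipeline benchmark (a sketch, independent of model.fit): timing how
# fast batches come out of the tf.data pipeline on its own shows whether data
# loading itself is the bottleneck. The name `n_batches` is arbitrary.
import time
n_batches = 100
start = time.time()
for _ in train_df.take(n_batches):
    pass
print(f"{n_batches} batches in {time.time() - start:.1f}s")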
def create_model():
    model = models.Sequential()
    model.add(EfficientNetB3(include_top=False, weights='imagenet',
                             input_shape=(TARGET_SIZE, TARGET_SIZE, 3)))
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dense(5, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001),  # 'lr' is a deprecated alias
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
model = create_model()
model.summary()
model_save = ModelCheckpoint('C:/Users/rosha/PycharmProjects/CLDD/saved_Models/EffNetB3_300_16_best_weights.h5',
                             save_best_only=True,
                             save_weights_only=True,
                             monitor='val_accuracy',
                             mode='max',
                             verbose=1)
early_stop = EarlyStopping(monitor='val_accuracy',
                           min_delta=0.001,
                           patience=5,
                           mode='max',
                           verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy',
                              factor=0.3,
                              patience=2,
                              min_delta=0.001,
                              mode='max',
                              verbose=1)
history = model.fit(
    train_df,
    validation_data=valid_df,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    callbacks=[model_save, early_stop, reduce_lr],
    verbose=1,
)
plt.rcParams.update({'font.size': 16})
hist = pd.DataFrame(history.history)
fig, (ax1, ax2) = plt.subplots(figsize=(12, 12), nrows=2, ncols=1)
hist['loss'].plot(ax=ax1, c='k', label='training loss')
hist['val_loss'].plot(ax=ax1, c='r', linestyle='--', label='validation loss')
ax1.legend()
hist['accuracy'].plot(ax=ax2, c='k', label='training accuracy')
hist['val_accuracy'].plot(ax=ax2, c='r', linestyle='--', label='validation accuracy')
ax2.legend()
plt.show()
model.save('./EffNetB3_300_16.h5')