1

我的电脑安装了 gpu GeForce 940MX。它的内存带宽为 16.02 GB/s。我正在尝试使用以下代码使用 UNET 模型训练 LUNA 数据集。

from __future__ import print_function

import numpy as np
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from keras.layers import concatenate
from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras import backend as K


K.set_image_dim_ordering('th')  # Theano dimension ordering in this code

img_rows = 512
img_cols = 512

smooth = 1.


def dice_coef(y_true, y_pred):
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)

def dice_coef_np(y_true,y_pred):
    y_true_f = y_true.flatten()
    y_pred_f = y_pred.flatten()
    intersection = np.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (np.sum(y_true_f) + np.sum(y_pred_f) + smooth)

def dice_coef_loss(y_true, y_pred):
    return -dice_coef(y_true, y_pred)


def get_unet():
    inputs = Input((1,img_rows, img_cols))
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)

    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)

    #up6 = merge([UpSampling2D(size=(2, 2))(conv5), conv4], mode='concat', concat_axis=1)
    up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), conv4], axis=1)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)

    #up7 = merge([UpSampling2D(size=(2, 2))(conv6), conv3], mode='concat', concat_axis=1)
    up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), conv3], axis=1)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)

    #up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv2], mode='concat', concat_axis=1)
    up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), conv2], axis=1)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)

    #up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv1], mode='concat', concat_axis=1)
    up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), conv1], axis=1)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)

    conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9)

    model = Model(inputs=inputs, outputs=conv10)

    model.compile(optimizer=Adam(lr=1.0e-5), loss=dice_coef_loss, metrics=[dice_coef])

    return model


def train_and_predict(use_existing):
    print('-'*30)
    print('Loading and preprocessing train data...')
    print('-'*30)
    imgs_train = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"trainImages.npy").astype(np.float32)
    imgs_mask_train = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"trainMasks.npy").astype(np.float32)

    imgs_test = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"testImages.npy").astype(np.float32)
    imgs_mask_test_true = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"testMasks.npy").astype(np.float32)

    mean = np.mean(imgs_train)  # mean for data centering
    std = np.std(imgs_train)  # std for data normalization

    imgs_train -= mean  # images should already be standardized, but just in case
    imgs_train /= std

    print('-'*30)
    print('Creating and compiling model...')
    print('-'*30)
    model = get_unet()
    # Saving weights to unet.hdf5 at checkpoints
    model_checkpoint = ModelCheckpoint('unet.hdf5', monitor='loss', save_best_only=True)
    #
    # Should we load existing weights? 
    # Set argument for call to train_and_predict to true at end of script
    if use_existing:
        model.load_weights('./unet.hdf5')

    # 
    # The final results for this tutorial were produced using a multi-GPU
    # machine using TitanX's.
    # For a home GPU computation benchmark, on my home set up with a GTX970 
    # I was able to run 20 epochs with a training set size of 320 and 
    # batch size of 2 in about an hour. I started getting reseasonable masks 
    # after about 3 hours of training. 
    #
    print('-'*30)
    print('Fitting model...')
    print('-'*30)
    model.fit(imgs_train, imgs_mask_train, batch_size=50, epochs=10, verbose=1, shuffle=True,
              callbacks=[model_checkpoint])

    # loading best weights from training session
    print('-'*30)
    print('Loading saved weights...')
    print('-'*30)
    model.load_weights('./unet.hdf5')

    print('-'*30)
    print('Predicting masks on test data...')
    print('-'*30)
    num_test = len(imgs_test)
    imgs_mask_test = np.ndarray([num_test,1,512,512],dtype=np.float32)
    for i in range(num_test):
        imgs_mask_test[i] = model.predict([imgs_test[i:i+1]], verbose=0)[0]
    np.save('masksTestPredicted.npy', imgs_mask_test)
    mean = 0.0
    for i in range(num_test):
        mean+=dice_coef_np(imgs_mask_test_true[i,0], imgs_mask_test[i,0])
    mean/=num_test
    print("Mean Dice Coeff : ",mean)

if __name__ == '__main__':
    train_and_predict(False)

但是当使用 GPU 运行它时,出现以下错误。

Warning (from warnings module):
  File "C:\Research\Python_installation\lib\site-packages\h5py\__init__.py", line 36
    from ._conv import register_converters as _register_converters
FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
Using TensorFlow backend.
------------------------------
Loading and preprocessing train data...
------------------------------
------------------------------
Creating and compiling model...
------------------------------
------------------------------
Fitting model...
------------------------------
Epoch 1/10
Traceback (most recent call last):
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1327, in _do_call
    return fn(*args)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1306, in _run_fn
    status, run_metadata)
  File "C:\Research\Python_installation\lib\contextlib.py", line 66, in __exit__
    next(self.gen)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
    train_and_predict(False)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 127, in train_and_predict
    callbacks=[model_checkpoint])
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1657, in fit
    validation_steps=validation_steps)
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1213, in _fit_loop
    outs = f(ins_batch)
  File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 2357, in __call__
    **self.session_kwargs)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 895, in run
    run_metadata_ptr)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1124, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1321, in _do_run
    options, run_metadata)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'conv2d_1/convolution', defined at:
  File "<string>", line 1, in <module>
  File "C:\Research\Python_installation\lib\idlelib\run.py", line 124, in main
    ret = method(*args, **kwargs)
  File "C:\Research\Python_installation\lib\idlelib\run.py", line 351, in runcode
    exec(code, self.locals)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
    train_and_predict(False)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 106, in train_and_predict
    model = get_unet()
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 39, in get_unet
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Research\Python_installation\lib\site-packages\keras\layers\convolutional.py", line 164, in call
    dilation_rate=self.dilation_rate)
  File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 3195, in conv2d
    data_format=tf_data_format)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 672, in convolution
    op=op)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 664, in op
    name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 131, in _non_atrous_convolution
    name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 397, in conv2d
    data_format=data_format, name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

有人可以请解释一下这个错误背后的原因,ResourceExhaustedError。是不是因为 GPU 的内存不足以加载数据集。这在没有 GPU 的情况下工作得很好。但是花了大约 6 个小时才完成一个 epoch

4

0 回答 0