
I have been struggling with this problem for five days and have read several posts on StackOverflow, but I still have no clear idea how to solve it. People who resolved it only suggest trying different NVIDIA driver versions until you hit a lucky one that matches the CUDA version (mostly 10.1) for your specific GPU card.

I have an NVIDIA GeForce GTX 1050 Ti on one desktop (Windows 10, 64-bit) and an NVIDIA GeForce RTX 2080 Ti on another (Windows 10, 64-bit). Following the hardware requirements on the TensorFlow website, I installed the GPU driver (tried versions 418.81 and 457.09 for the 1050 Ti, and versions 432.00 and 457.30 for the 2080 Ti), the CUDA Toolkit (10.1 on both desktops), and cuDNN (7.6.0 on both desktops), and finally modified the PATH environment variable. The TensorFlow version is 2.3.0 and the Python version is 3.7.9.
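For reference, a minimal sketch to confirm that TensorFlow actually sees the GPU and that the build is CUDA-enabled (standard TF 2.x calls):

import tensorflow as tf

# Devices TensorFlow can actually see; the GTX/RTX card should be listed
print(tf.config.list_physical_devices('GPU'))
# True if this TensorFlow build was compiled against CUDA
print(tf.test.is_built_with_cuda())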

With the sample code from the TensorFlow website, everything works fine on the MNIST training dataset. But when I run some custom code (I have a custom model that inherits from Keras.Model), I always hit the error below on both PCs:

I am not using TensorFlow for conventional neural-network training; I only use its automatic differentiation to solve an optimization problem.

I don't think there is anything wrong with my custom code, because it runs fine on Google Colab. The same code also runs fine on my friend's Linux system.

Code to reproduce the error (it runs without problems on Google Colab):

# -*- coding: utf-8 -*-
## This code runs well in the Google Colab GPU runtime
## Yuanhang Zhang & Zheyuan Zhu, 12/1/2020, CREOL, UCF, Copyright reserved
## please contact yuanhangzhang@knights.ucf.edu if you want to use the code for research or publications
## all length units are in mm

import tensorflow as tf
import numpy as np
print('tensorflow version:',tf.__version__)

#%% ASM method
dx=np.float32(5e-3) # pixel size
N_obj= 64 # 512 

def tf_fft2d(x):
    with tf.name_scope('tf_fft2d'): # add name_scope, check in tensorboard
        x_shift = tf.signal.ifftshift(x)
        x_fft = tf.signal.fft2d(x_shift)
        y = tf.signal.fftshift(x_fft)
        return y

def tf_ifft2d(x):
    with tf.name_scope('tf_ifft2d'):
        x_shift = tf.signal.ifftshift(x)
        x_ifft = tf.signal.ifft2d(x_shift)
        y = tf.signal.fftshift(x_ifft)
        return y

# angular spectrum method (ASM), not band-limited
# @tf.function
def prop_ASM(Ein,z,wavelength,N_obj,dx):
    freq_obj = np.arange(-N_obj//2,N_obj//2,1)*(1/(dx*N_obj))
    kx = 2*np.pi*freq_obj
    ky = kx.copy()
    KX,KY = np.meshgrid(kx,ky)
    k0 = 2*np.pi/wavelength
    KZ_square = k0**2-KX**2-KY**2
    KZ_square[KZ_square<0] = 0
    Q = np.exp(-1j*z*np.sqrt(KZ_square)) # transfer function of freespace
    with tf.name_scope('prop_ASM'):
      FFT_obj = tf_fft2d(Ein)
      Q_tf = tf.constant(Q,dtype=tf.complex64)
      Eout = tf_ifft2d(FFT_obj*Q_tf)
      return Eout

print('N_obj:',N_obj)

import matplotlib.pyplot as plt
import shutil
shutil.rmtree('__pycache__',ignore_errors=True) # Delete an entire directory tree
import os
os.environ["CUDA_VISIBLE_DEVICES"]='0' 

save_model_path='./models' 
save_mat_folder='./results' 
log_path='./tensorboard_log' # path to log training process
load_model_path = save_model_path

#%% inputs/outputs for the optimization
x = (np.arange(N_obj,dtype = np.float32)-N_obj/2)*dx
y = (np.arange(N_obj,dtype = np.float32)-N_obj/2)*dx
x_c, y_c = np.meshgrid(x,y)

# input: Gaussian mode
e_in = np.zeros((N_obj, N_obj),dtype = np.float32)  # initialize input field
w_in = np.float32(5e-2)   # beam width

e = np.exp(-((x_c)**2+(y_c)**2)/w_in**2) # Gaussian beam spots array
I = np.sum(np.abs(e)**2)
e_in = e/np.sqrt(I) # normalize power

fig, ax = plt.subplots()
im=ax.imshow(e_in)
cbar=plt.colorbar(im)  
print('e_in shape:',e_in.shape)

# output: Hermite mode
e_out = np.zeros((N_obj, N_obj),dtype = np.float32)
w_out = np.float32(5e-2) # 30e-2
c = np.array([[0,0],[0,1]])
e = np.polynomial.hermite.hermgrid2d(np.sqrt(2)*x/w_out, np.sqrt(2)*y/w_out, c)*np.exp(-(x_c**2+y_c**2)/w_out**2)
e = np.float32(e)
I = np.sum(np.abs(e)**2)
e_out = e/np.sqrt(I) # power normalized

fig, ax = plt.subplots()
im=ax.imshow(e_out)
cbar=plt.colorbar(im)

print('e_out shape:',e_out.shape)

#%% optimization by GradientTape
z = 20 # propagating distance
lambda_design_list = np.array([1.550e-3],dtype = np.float32)

Ein = tf.constant(e_in, name = 'Ein', dtype = tf.complex64) # a 2D tensor
Eout = tf.constant(e_out, name = 'Eout', dtype = tf.complex64)

phi1 = tf.Variable(np.float32(np.ones((N_obj,N_obj))),name='phi1') # dtype: float32
phi2 = tf.Variable(np.float32(np.ones((N_obj,N_obj))),name='phi2')


def forward_propagate(Ein,z,lambda_design_list,N_obj,dx):
    E1_1 = prop_ASM(Ein,z,lambda_design_list[0],N_obj,dx) # used tf.signal.fft2d
    E1_mod_1 = E1_1*tf.exp(tf.complex(real=tf.zeros_like(phi1,dtype='float32'),imag=phi1))
    # E1_mod_1 = tf.math.multiply(E1_1,tf.exp(1j*phi1)) # element-wise multiply ?? not working !!
    E2_1 = prop_ASM(E1_mod_1,z,lambda_design_list[0],N_obj,dx)
    E2_mod_1 = E2_1*tf.exp(tf.complex(real=tf.zeros_like(phi2,dtype='float32'),imag=phi2)) 
    E_out = prop_ASM(E2_mod_1,z,lambda_design_list[0],N_obj,dx)
    # E_out = tf.math.multiply(E2_1,tf.exp(1j*phi2))
    return E_out

def loss_single(E_out, Eout): 
    coupling_eff = tf.sqrt(
        (tf.square(tf.reduce_sum(tf.math.real(E_out)*tf.math.real(Eout)+tf.math.imag(E_out)*tf.math.imag(Eout))) +
         tf.square(tf.reduce_sum(tf.math.imag(E_out)*tf.math.real(Eout)-tf.math.real(E_out)*tf.math.imag(Eout))) ))
    # or something simpler:
    # coupling_eff = tf.abs(tf.reduce_sum((tf.math.multiply(E_out,Eout))))
    loss = - coupling_eff
    return loss

variables = [phi1, phi2] # write variables in a list to optimize

# define optimizer
optimizer =  tf.keras.optimizers.Adam(learning_rate= 1e-2)
epoch_num = 20

for ii in tf.range(epoch_num):
  with tf.GradientTape() as tape:
    # this forward_propagate() function must be in the tape context! otherwise grads is None !!
    # the tape needs to record the complete forward propagation
    E_out = forward_propagate(Ein,z,lambda_design_list,N_obj,dx) 
    loss = loss_single(E_out, Eout)  
    tf.print('ii =:',ii,'coupling_eff =:',-loss)
    # print('watched variables in tape:',[var.name for var in tape.watched_variables()])

  # print("\n ===== calculate gradients now ====ERROR in NEXT LINE!!======\n\n")
  grads = tape.gradient(loss, variables) ## auto-differentiation
  # print(grads)

  # TensorFlow will update parameters automatically
  optimizer.apply_gradients(grads_and_vars=zip(grads, variables))

The kernel dies at grads = tape.gradient(loss, variables).

The error on both PCs:

2020-11-29 20:41:57.457271: E tensorflow/stream_executor/cuda/cuda_event.cc:29] Error polling for event status: failed to query event: CUDA_ERROR_LAUNCH_FAILED: unspecified launch failure
2020-11-29 20:41:57.457480: F tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc:220] Unexpected Event status: 1
[I 20:42:05.512 NotebookApp] KernelRestarter: restarting kernel (1/5), keep random ports
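
To check whether the crash is really GPU-specific, the same script can be rerun with the GPU hidden (a minimal sketch; the environment variable must be set before TensorFlow initializes):

import os
# Hide all GPUs so TensorFlow falls back to the CPU; if the
# optimization loop then finishes, the failure is in the GPU path
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf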

Can anyone tell me how to solve this problem? Is blindly trying different driver versions the only way to make it work?
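
One mitigation that comes up in similar reports is enabling memory growth, so that TensorFlow allocates GPU memory on demand instead of grabbing it all at startup; a minimal sketch (untested here, and it must run before any op touches the GPU):

import tensorflow as tf

# Ask TensorFlow to grow GPU memory allocation as needed
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)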

The strange thing is that if I run neural-network training on the PCs with the Keras API, as in this example, there is no such error. And if I write some very simple GradientTape code to compute gradients, following this linear regression example, there is no error either... So the drivers seem to be installed correctly... Really confusing.
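
The kind of simple GradientTape check that does work is sketched below, a single linear-regression style gradient step:

import tensorflow as tf

w = tf.Variable(2.0)
x = tf.constant([1.0, 2.0, 3.0])
y = tf.constant([2.0, 4.0, 6.0])
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(w * x - y))
# a gradient step of this kind runs without the CUDA error
print(tape.gradient(loss, w))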
