CUDA_ERROR_ILLEGAL_ADDRESS
尝试运行用于计算Buddhabrot 分形轨道的内核时出现异常。
extern "C"
__global__ void exec(int iterations, int size,
float* inputR, float* inputI, // Real/Imaginary input
int* output // Output image in one dimension
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
float cR = inputR[i];
float cI = inputI[i];
float x = 0;
float y = 0;
float outX[1000];
float outY[1000];
for (int j = 0; j < iterations; j++) {
outX[j] = x;
outY[j] = y;
float xNew = (x * x) - (y * y) + cR;
float yNew = (2 * x * y) + cI;
if (xNew * xNew + yNew * yNew > 4) {
for (int k = 1; k < j; k++) {
int curX = (outX[k] + 2 ) * size / 4;
int curY = (outY[k] + 2 ) * size / 4;
int idx = curX + size * curY;
output[idx]++; // <- exception here
}
return;
}
x = xNew;
y = yNew;
}
}
我现在已经尝试了多种方法,与我最初的想法相反,该错误甚至似乎都不是源于数组。例如,
output[0] = 0;
会工作得很好。但是,当我尝试调试时idx
(记得我首先认为错误与数组有关),我发现我不能像这样分配 idx
output[0] = idx;
也不在 printf 语句中使用它
if (i == 0) {
printf("%d\n", idx);
}
我已经尝试过同样的方法curX
并且curY
也拒绝工作,但是cR
例如可以正常工作而不会出现任何错误。在最里面的循环内分配的变量似乎有问题(我也不能分配k
),所以我尝试idx
在函数开始时在所有循环之外声明,但无济于事。还是同样的错误。
堆栈跟踪:
Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
at controller.Controller.<init>(Controller.java:10)
at Main.main(Main.java:8)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
常数:
block size 512*1*1
grid size 64 *1*1
iterations 1000
size 256
inputR, inputI length 64*512
output length 256*256
MCVE:
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;
import java.io.File;
import java.util.Random;
import static jcuda.driver.JCudaDriver.*;
public class Stackoverflow {
public static final int SIZE = 256;
public static final long NUM_POINTS = 128 * 128 * 128;
public static final int ITERATIONS = 10000;
public static final int BLOCK_SIZE = 512;
public static final int SIM_THREADS = BLOCK_SIZE * 64;
public static final Random random = new Random();
public static void main(String[] args) {
File ptxFile = new File("Buddha.ptx");
setExceptionsEnabled(true);
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFile.getAbsolutePath());
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "exec");
cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);
float[] inR = new float[SIM_THREADS];
float[] inI = new float[SIM_THREADS];
int[] out = new int[SIZE * SIZE];
CUdeviceptr deviceInputR = new CUdeviceptr();
cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
CUdeviceptr deviceInputI = new CUdeviceptr();
cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, out.length * Sizeof.INT);
for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
for (int j = 0; j < SIM_THREADS; j++) {
inR[j] = random.nextFloat() * 4f - 2f;
inI[j] = random.nextFloat() * 4f - 2f;
}
System.out.println("GPU START");
cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{ITERATIONS}),
Pointer.to(new int[]{SIZE}),
Pointer.to(deviceInputR),
Pointer.to(deviceInputI),
Pointer.to(deviceOutput)
);
int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE);
cuLaunchKernel(function,
gridSize, 1, 1,
BLOCK_SIZE, 1, 1,
0, null,
kernelParameters, null
);
cuCtxSynchronize();
System.out.println("GPU END");
}
cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
}
}