我正在尝试在 __global__ 内核函数中调用一个 __device__ 设备函数 1000000 次或更多次。但是，我总是遇到以下错误：Microsoft C++ exception: cudaError_enum at memory location 0x0031fc24。代码本身很简单。执行从设备端返回主机端时，是否有可能异步锁定了某些资源？如代码所示，变量并没有发生溢出，那么到底发生了什么？
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "cuda.h"
#include "curand_kernel.h"
/* Problem-size constants. DIMPAR is the element count used for every host
 * and device array in this file. */
#define NDIM 30
#define NPAR 3
/* Parenthesized so the product stays a single operand when the macro is used
 * next to a higher- or equal-precedence operator (e.g. `x / DIMPAR`,
 * `x % DIMPAR`). */
#define DIMPAR (NDIM * NPAR)
/*
 * Returns the sum of the first 15 elements of `inputs`, each scaled by 0.0001.
 *
 * inputs: pointer to at least 15 readable floats; the data is only read.
 *
 * All arithmetic uses float literals so the accumulation is not silently
 * promoted to double precision. The function is marked __host__ __device__
 * so the same code can also be called on the CPU to produce a reference
 * value; existing device-side callers are unaffected.
 */
__host__ __device__ float f(const float *inputs){
    const int kTerms = 15;          /* number of leading elements that are summed */
    const float kScale = 0.0001f;   /* per-element weight */
    float sum = 0.0f;
    for (int i = 0; i < kTerms; ++i)
        sum += inputs[i] * kScale;
    return sum;
}
/*
 * Each thread whose flat 1D global index is below DIMPAR evaluates f(pbest)
 * 1000000 times; all other threads return immediately.
 *
 * pos:   not accessed by this kernel.
 * pbest: device pointer forwarded unchanged to f() on every iteration.
 *
 * The value returned by f() is kept only in a local variable and is never
 * written to memory, so the kernel produces no output.
 * NOTE(review): because nothing is stored, an optimizing build may remove the
 * loop entirely, while an unoptimized build runs all iterations -- confirm
 * which result this kernel is meant to publish.
 */
__global__ void kernel(float *pos, float *pbest){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    /* Threads beyond the problem size have nothing to do. */
    if (tid >= DIMPAR)
        return;

    float tpbest = 0.0f;
    /* 1000000 iterations; the counter stays far below the int limit. */
    for (int iter = 0; iter < 1000000; ++iter) {
        tpbest = f(pbest);
    }
}
/*
 * Prints a readable message and terminates the program when a CUDA runtime
 * call did not return cudaSuccess.
 *
 * status: return value of the CUDA call being checked.
 * what:   short description of the call, included in the message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocates two host and two device arrays of DIMPAR floats, fills the host
 * arrays with 1, copies them to the device, launches `kernel`, and copies
 * d_pos back into h_pos.
 *
 * Every CUDA call is checked so that a failure is reported with its CUDA
 * error string instead of surfacing later as an unexplained exception.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * error.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int count = DIMPAR;
    const size_t bytes = sizeof(float) * count;

    float *d_pos = NULL;
    float *d_pbest = NULL;
    float *h_pos = (float *) malloc(bytes);
    float *h_pbest = (float *) malloc(bytes);
    if (h_pos == NULL || h_pbest == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(h_pos);
        free(h_pbest);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&d_pos, bytes), "cudaMalloc(d_pos)");
    checkCuda(cudaMalloc((void**)&d_pbest, bytes), "cudaMalloc(d_pbest)");

    /* Round the block count up so every element is covered by a thread. */
    const int numthreadsperblock = 512;
    const int numblocks = (count + numthreadsperblock - 1) / numthreadsperblock;
    printf("numthreadsperblock: %i;; numblocks:%i\n", numthreadsperblock, numblocks);

    /* Fill the host arrays. */
    for (int i = 0; i < count; i++) {
        h_pos[i] = 1;
        h_pbest[i] = 1;
    }

    /* Transfer to device memory. */
    checkCuda(cudaMemcpy(d_pos, h_pos, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_pos -> d_pos)");
    checkCuda(cudaMemcpy(d_pbest, h_pbest, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_pbest -> d_pbest)");

    kernel<<<numblocks, numthreadsperblock>>>(d_pos, d_pbest);
    /* A launch does not return an error itself: configuration errors are
     * read with cudaGetLastError(), and faults raised while the kernel runs
     * only become visible at the next synchronizing call. */
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(h_pos, d_pos, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_pos -> h_pos)");

    /* Release device and host buffers. */
    checkCuda(cudaFree(d_pos), "cudaFree(d_pos)");
    checkCuda(cudaFree(d_pbest), "cudaFree(d_pbest)");
    free(h_pos);
    free(h_pbest);
    return 0;
}