编辑:我已根据解决方案修改了下面的代码,并补充了一些代码,以帮助其他想要实现嵌套 for 循环的人。
所以我想在 cuda 内核中实现以下嵌套 for 循环:
// Reference (serial) form of the iteration space the kernel should cover:
// `in` visits 1..N and `ie` visits only the odd values 1, 3, 5, ... <= N-1.
// NOTE(review): with 1-based counters, x[in] reaches index N; if x holds N
// zero-based elements the intended access is presumably x[in-1] - confirm.
for (unsigned in=1;in<=N;in++){
    for (unsigned ie=1;ie<=N-1;ie+=2){
        // do some stuff with x[in and/or ie] and z[in and/or ie]
    }
}
其中 x 和 z 是从主机复制的简单一维数组。
现在,就嵌套 for 循环的实现而言,据我所知这应该相对简单(如下面内核中的 i 和 j 索引,即 idx 和 idy 所示;有人可以确认吗?)。我的另一个问题是:如何在内核中不使用任何 if 语句或 % 运算就让 ie 每次增加 2?我担心使用它们会影响性能。
// Maps the nested loop
//     for (in = 1; in <= N; in++) for (ie = 1; ie <= N-1; ie += 2)
// onto a 2D launch: the x dimension enumerates `in`, the y dimension
// enumerates the odd `ie` values. Each thread handles one (in, ie) pair.
//
// x, z : device arrays; z is written at [i*N + j] and so needs at least
//        N*N elements. x is not accessed by the code shown here.
// N    : upper bound of the loop counters.
__global__ void myKernel(double* x, double* z, int N){
    // zero-based position of this thread in the launch grid (storage indices)
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    // loop counters derived from the storage indices
    const int in = i + 1;     // 1-based indexing: 1, 2, ..., N
    const int ie = 2 * j + 1; // stride of 2 without any % test: 1, 3, 5, ...

    // The grid is rounded up to whole blocks, so some threads fall outside
    // the loop bounds; they must exit before touching memory.
    if (in > N || ie > N - 1) {
        return;
    }

    z[i*N+j] = ie; // stores odd indices [ie]
    // do some stuff with (in-ie), x[ie and/or in], z[ie and/or in]...
}
int main(){
int N = 5; //arbitrary size of arrays
int threadsPerBlock = 8; // semi-arbitrary, max 512 but depends on GPU device
dim3 block(threadsPerBlock, threadsPerBlock/2 + 1); // launch N/2 + 1 threads in the second dimension since we are only interested in odd numbered indices.
dim3 grid( (N-1+threadsPerBlock)/threadsPerBlock, (N-1+threadsPerBlock)/threadsPerBlock);
std::vector<double> x;
std::vector<double> z;
z.resize(N*N);
x.resize(N);
//copy to device(d_x, d_z);
//call the kernel
myKernel<<<grid, block>>>(d_x, d_z, N);
// retrieve from device
return 0;
}
谢谢