Robert 已经圆满地回答了这个问题。我实现了下面的完整代码,演示如何对一个完整 warp(32 个线程)中的数据进行任意排列(permutation)。
#include <stdio.h>
#include <stdlib.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
/*
 * gpuErrchk(ans): evaluates a CUDA runtime call and forwards its status, together
 * with the file and line of the call site, to gpuAssert().
 * The do { ... } while (0) wrapper makes the macro behave as one statement, so it
 * can be used in an unbraced if/else without breaking the else branch.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * code  - status returned by the CUDA runtime call
 * file  - source file of the call site; const because __FILE__ is a string literal
 * line  - source line of the call site
 * abort - when true (the default), blocks on getchar() and then terminates the
 *         process with `code` as the exit status; when false, only prints
 *
 * Does nothing when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        // Waits for input before exiting (presumably so a console window stays open
        // long enough to read the message).
        getchar();
        exit(code);
    }
}
/*
 * Permutes the values held by the lanes of one warp using a register shuffle.
 *
 * Each thread loads d_data[threadIdx.x] and then fetches the value held by lane
 * d_perm[threadIdx.x], so that d_result[t] = d_data[d_perm[t]].
 *
 * d_data   - input values, indexed by threadIdx.x (read-only)
 * d_result - output values, indexed by threadIdx.x
 * d_perm   - source lane for each thread, indexed by threadIdx.x (read-only)
 *
 * Launch layout: indexing uses threadIdx.x only, and the shuffle names all 32
 * lanes, so the kernel is meant to be launched with one full warp (32 threads)
 * per block.
 */
__global__ void shufflingKernel(const double *d_data, double *d_result, const int *d_perm){
    // Every lane of the warp must take part in the exchange, so the participation
    // mask is stated explicitly. __activemask() only reports the lanes that happen
    // to be converged at the time of the call, which is not a statement about which
    // lanes are supposed to participate.
    const unsigned fullWarpMask = 0xFFFFFFFFu;

    const int tid = threadIdx.x;
    const int srcLane = d_perm[tid];
    const double var = d_data[tid];

    d_result[tid] = __shfl_sync(fullWarpMask, var, srcLane);
}
/*
 * Host driver: uploads 32 values and a 32-entry permutation, runs shufflingKernel
 * on a single warp, and prints each original value, the shuffled value and the
 * value expected from the host-side permutation (h_data[h_perm[k]]).
 *
 * Returns 0 on success and 1 if the host buffer cannot be allocated. CUDA failures
 * are handled by gpuErrchk.
 */
int main(){
    const int N = 32;   // number of elements; also used as the block size (one warp)

    double h_data[32] = { 3.4, 42.2, 2., -1., 10., 11., 2., -1., 10., 33., 2.3, 11., 44., 0., -33., -21.,
                          4.4, 43.2, 3., -2., 13., 15., 222., -90., 17., 30., 11.3, 7., 22., 100., -30., -91. };
    int h_perm[32] = { 6, 11, 9, 2, 5, 23, 31, 0, 3, 27, 29, 1, 28, 30, 17, 13, 10, 8, 4, 22, 7, 18, 24, 12, 20,
                       19, 16, 26, 21, 15, 25, 14 };

    double *h_result = (double *)malloc(N * sizeof(double));
    if (h_result == NULL) {
        fprintf(stderr, "Host allocation of %d doubles failed\n", N);
        return 1;
    }

    // Device buffers for the permutation, the input and the output.
    int *d_perm = NULL;
    double *d_data = NULL;
    double *d_result = NULL;
    gpuErrchk(cudaMalloc(&d_perm, N * sizeof(int)));
    gpuErrchk(cudaMalloc(&d_data, N * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_result, N * sizeof(double)));

    gpuErrchk(cudaMemcpy(d_perm, &h_perm[0], N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_data, &h_data[0], N * sizeof(double), cudaMemcpyHostToDevice));

    // One block of N threads, so launch size and data size cannot drift apart.
    shufflingKernel<<<1, N>>>(d_data, d_result, d_perm);
    gpuErrchk(cudaPeekAtLastError());      // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());    // errors raised while the kernel ran

    gpuErrchk(cudaMemcpy(h_result, d_result, N * sizeof(double), cudaMemcpyDeviceToHost));

    for (int k = 0; k < N; k++) {
        printf("k = %d; Original = %f; New = %f; Check = %f\n", k, h_data[k], h_result[k], h_data[h_perm[k]]);
    }

    // Release everything that was allocated above.
    gpuErrchk(cudaFree(d_result));
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_perm));
    free(h_result);

    return 0;
}
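请注意,按照《Shuffle instruction in CUDA not working》(CUDA 中的 Shuffle 指令不工作)这一问题中的说法,在 warp 级原语中使用活动线程的掩码 __activemask() 比直接使用 0xFFFFFFFF 更安全。
不过需要留意:__activemask() 只返回调用瞬间恰好处于收敛状态的线程,并不保证包含所有应当参与的线程;当已知整个 warp 的 32 个线程都必须参与时,显式指定掩码(例如 0xFFFFFFFF)更为可靠。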