我需要转置一个方阵。我用矩阵测试程序:a[i][j] = 0 if i>j, a[i][j] = if i<=j,
但结果表明并非所有元素都在正确的位置。
这是代码(main() 除外):
#include <stdio.h>
#include <stdlib.h>
__global__ void transpose_kernel (float *a, float *b, int n) {
unsigned int ax = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int ay = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int aIdx = ax + n * ay;
unsigned int bIdx = ay + n * ax;
b[bIdx] = a[aIdx];
}
int transpose_host (float *a, float *b, int n) {
int size = n * n * sizeof (float);
float *aDev = NULL, *bDev = NULL;
cudaError_t cuerr = cudaMalloc ((void**)&aDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&bDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
dim3 blockSize = dim3 (16, 16, 1);
dim3 gridSize = dim3 (n/16 + 1, n/16 + 1, 1);
cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
transpose_kernel <<< gridSize, blockSize >>> (aDev, bDev, n);
cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMemcpy (b, bDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from b to bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaFree (aDev);
cudaFree (bDev);
return (0);
}
为什么我的数组转置不正确?