在这个简短的示例中,我试图struct在 cuda 设备内存中传递一个带有 init 指针的表。复制到主机-> 设备,设备-> 主机似乎有效,但在`_ global _ function nothing works. Values fordA` 中为空,我无法更改它们。
我不知道如何将值从Ato 复制到dA. 如果我使用这样的基本表,      fcomplex A[N][N]它可以工作,但这不是我想要做的。这是代码:
#include<assert.h>
#include <cuda.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#define N 5// side of matrix containing data
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
typedef struct {float re,im;} fcomplex; 
__global__ void kernel(fcomplex * da)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    int i = (N*y) + x;
    //da[i].re += 2;
    printf("%f \n",da[i].re);
}
int main(int argc, char * argv[])
{
 fcomplex  *dA,**A,**B;
 A= (fcomplex **)malloc(N * sizeof(fcomplex* ));
 B=(fcomplex **)malloc(N * sizeof( fcomplex*  ));
 for (int i = 0; i < N; i++){
    A[i] = (fcomplex *)malloc(N * sizeof(fcomplex ));
    B[i] = (fcomplex *)malloc(N * sizeof(fcomplex ));
 }
 for (int i = 0; i < N; i++)
 {   for (int d= 0; d < N; d++)
     {
    A[i][d].re = i*d;
    A[i][d].im = i*d;
     }
 }
 checkCudaErrors(cudaMalloc((void **)&dA, (size_t)(sizeof(fcomplex)*N*N)));
 checkCudaErrors(cudaMemcpy(dA,A,N*N*sizeof(fcomplex),cudaMemcpyHostToDevice));
 const dim3 blockSize(N,N);
 const dim3 gridSize(1,1);
 kernel<<<gridSize,blockSize>>>(dA);
 checkCudaErrors(cudaThreadSynchronize());
 checkCudaErrors(cudaGetLastError());
 checkCudaErrors(cudaMemcpy(B, dA, sizeof(fcomplex)*N*N, cudaMemcpyDeviceToHost));
 for (int i = 0; i < N; i++)
 {  for (int d= 0; d < N; d++)
    {
     printf("%f-%f\n",A[i][d].re,B[i][d].re);
     printf("%f-%f\n",A[i][d].im,B[i][d].im);
    }
 }
 //verify(A,B,N);
 free(A);
 free(B);
 cudaFree(dA);
 //cudaFree(dB);
}
void verify(fcomplex ** A, fcomplex ** B, int size)
{
 for (int i = 0; i < size; i++)
 {  for (int d= 0; d < size; d++)
    {
     assert(A[i][d].re==B[i][d].re);
    }
 }
 printf("Correct!");
}