0

请看这段代码:

#include <stdlib.h>
#include <stdio.h>

int N, L, I;
float * inputs;
float * temp;

// first kernel
__global__ void mulKernel ( float * output, float * inputs)///, float * weights)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   output [idx] = inputs [idx] * 3;//weights [idx];
   //weights [idx] = 4;

   //__syncthreads();
}

//second kernel
__global__ void sumKernel ( float * output, float * input)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   output [idx] = input[idx]*2;

   __syncthreads();
}

void printVector (const float *p, const int N) {
    for (int i=0; i<N; i++)
    printf("%f\n",p[i]);
}

int main(int argc, char *argv[])
{
    if(argc < 3)
        printf("Usage: cuda <layers> <inputs>\n");
    else
    {
        L = atoi(argv[1]);
        N = atoi(argv[2]);
        I = atoi(argv[2]);
        inputs = (float*)malloc(I*sizeof(float));
        float * weights = (float*)malloc(I*sizeof(float));

        // and fill with some arbitrary values
        for (int i=0; i<I; i++)
        {
            inputs[i] = 1;
        }
        for (int i=0; i<I; i++)
        {
            weights[i] = 1.5;
        }

        // allocate device memory
        float * devInputs = NULL;
        float * devTemp = NULL;
        float * devWeights = NULL;

        cudaMalloc ( (void**)&devInputs, I*sizeof(float) );
        cudaMalloc ( (void**)&devTemp, I*sizeof(float) );
        cudaMalloc ( (void**)&devWeights, I*sizeof(float) );

        // set kernel launch configuration
        dim3 threadsMul = dim3(512, 1);
        int blocksCount = floor(I / threadsMul.x) + 1;
        dim3 blocksMul  = dim3(blocksCount, 1);

        dim3 threadsSum = dim3(512, 1);
        blocksCount = floor(I / threadsSum.x) + 1;
        dim3 blocksSum  = dim3(blocksCount, 1);

        cudaMemcpy      ( devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice );
        cudaMemcpy      ( devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice );

        //kernels calling in this cycle
        for(int j=0;j<L;j++)
        {
            // copying data to see that's ok
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

            // print it
          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

            // running first kernel
          mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights);

            // copying and printing data. We can see thats array weights contains a wrong values
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp);

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");
        }

        cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );

        cudaFree         ( devInputs   );
        cudaFree         ( devTemp   );
        cudaFree         ( devWeights   );

        printVector (inputs, N);

        free(inputs);
        free(weights);
    }
    return 0;
}

并查看输出。在调用第一个内核之后,devWeights 数组丢失了数据。但它没有在任何地方使用。我只是将它复制到内存,运行内核(不影响它)并复制回主机。在输出中,我看到它发生了变化。为什么?我究竟做错了什么?

在 main 函数中,您可以看到循环。在其中我运行两个内核:sumKernel 和 mulKernel。在运行内核之前、之后和同步线程之后,我将数组复制到主机并打印它。所以,我在调用内核后看到了错误的数据。请参阅代码中的注释。

我没有看到任何错误(只有 cudaSuccess)。

4

1 回答 1

0

Oh, I found the error. I forgot to use if(idx < N) in my kernels and CUDA didn't print error when gone out array dimensions. So, when I changed inputs array, I also changed data that situated in memory after inputs.

于 2012-05-01T13:52:01.510 回答