请看这段代码:
#include <stdlib.h>
#include <stdio.h>
int N, L, I;
float * inputs;
float * temp;
// first kernel
__global__ void mulKernel ( float * output, float * inputs)///, float * weights)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
output [idx] = inputs [idx] * 3;//weights [idx];
//weights [idx] = 4;
//__syncthreads();
}
//second kernel
__global__ void sumKernel ( float * output, float * input)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
output [idx] = input[idx]*2;
__syncthreads();
}
void printVector (const float *p, const int N) {
for (int i=0; i<N; i++)
printf("%f\n",p[i]);
}
int main(int argc, char *argv[])
{
if(argc < 3)
printf("Usage: cuda <layers> <inputs>\n");
else
{
L = atoi(argv[1]);
N = atoi(argv[2]);
I = atoi(argv[2]);
inputs = (float*)malloc(I*sizeof(float));
float * weights = (float*)malloc(I*sizeof(float));
// and fill with some arbitrary values
for (int i=0; i<I; i++)
{
inputs[i] = 1;
}
for (int i=0; i<I; i++)
{
weights[i] = 1.5;
}
// allocate device memory
float * devInputs = NULL;
float * devTemp = NULL;
float * devWeights = NULL;
cudaMalloc ( (void**)&devInputs, I*sizeof(float) );
cudaMalloc ( (void**)&devTemp, I*sizeof(float) );
cudaMalloc ( (void**)&devWeights, I*sizeof(float) );
// set kernel launch configuration
dim3 threadsMul = dim3(512, 1);
int blocksCount = floor(I / threadsMul.x) + 1;
dim3 blocksMul = dim3(blocksCount, 1);
dim3 threadsSum = dim3(512, 1);
blocksCount = floor(I / threadsSum.x) + 1;
dim3 blocksSum = dim3(blocksCount, 1);
cudaMemcpy ( devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy ( devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice );
//kernels calling in this cycle
for(int j=0;j<L;j++)
{
// copying data to see that's ok
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
// print it
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
// running first kernel
mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights);
// copying and printing data. We can see thats array weights contains a wrong values
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
if(cudaDeviceSynchronize() == cudaSuccess)
printf("threads syncronized\n");
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp);
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n\n");
if(cudaDeviceSynchronize() == cudaSuccess)
printf("threads syncronized\n");
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n\n");
}
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaFree ( devInputs );
cudaFree ( devTemp );
cudaFree ( devWeights );
printVector (inputs, N);
free(inputs);
free(weights);
}
return 0;
}
并查看输出。在调用第一个内核之后,devWeights 数组丢失了数据。但它没有在任何地方使用。我只是将它复制到内存,运行内核(不影响它)并复制回主机。在输出中,我看到它发生了变化。为什么?我究竟做错了什么?
在 main 函数中,您可以看到循环。在其中我运行两个内核:sumKernel 和 mulKernel。在运行内核之前、之后和同步线程之后,我将数组复制到主机并打印它。所以,我在调用内核后看到了错误的数据。请参阅代码中的注释。
我没有看到任何错误(只有 cudaSuccess)。