我试图总结两个矩阵 a_h_1 和 a_h_2,并将结果写回 a_h_1。但是由于某种原因,我的内核函数不会更改除前 N 个元素之外的数组成员。例如,即使我写了 a[8] = 45,当它被复制回主机时,它也会打印为 8。怎么了?
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void matrix_summation(float *a, float *b, int M, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<M*N)
{
a[idx] = blockIdx.x;
}
}
// main routine that executes on the host
int main(void)
{
float *a_h_1,*a_h_2, *a_d_1,*a_d_2; // Pointer to host & device arrays
const int N = 5;
const int M = 5;
// Number of elements in arrays
size_t size = (N * M) * sizeof(float);
a_h_1 = (float *)malloc(size); // Allocate array1 on host
a_h_2 = (float *)malloc(size); // Allocate array2 on host
cudaMalloc((void **) &a_d_1, size); // Allocate array1 on device
cudaMalloc((void **) &a_d_2, size); // Allocate array2 on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N*M; i++){
a_h_1[i] = (float)i;
a_h_2[i] = (float)i;
}
cudaMemcpy(a_d_1, a_h_1, size, cudaMemcpyHostToDevice);
cudaMemcpy(a_d_2, a_h_2, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = M;
int n_blocks = (M*N)/block_size;
matrix_summation <<< n_blocks, block_size >>> ( a_d_1,a_d_2, M, N));
// Retrieve result from device and store it in host array
cudaMemcpy(a_h_1, a_d_1, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
printf("\n\nROW 1 \n");
for (int i=0; i<(M*N); i++)
{
printf(" %f ", a_h_1[i]);
if((i+1)%N == 0)
{
printf("\nROW %d \n", ((i+1)/N)+1);
}
}
// Cleanup
free(a_h_1);
free(a_h_2);
cudaFree(a_d_1);
cudaFree(a_d_2);
system("pause");
}
这是输出:
ROW 1
0.0 2.0 4.0 6.0 8.0 < this line is correct but others are not
ROW 2
5.0 6.0 7.0 8.0 9.0
ROW 3
10.0 11.0 12.0 13.0 14.0
ROW 4
15.0 16.0 17.0 18.0 19.0
ROW 5
20.0 21.0 22.0 23.0 24.0