nvprof 配置文件的 API 就好了。但是说没有内核被分析。它显示这 2 条警告消息“ ==525867== 警告:由于设备缓冲区空间不足,4 条记录的时间戳无效。您可以使用选项 --device-buffer-size 配置缓冲区空间。==525867== 警告:由于信号量池大小不足,1 条记录的时间戳无效。您可以使用选项 --profiling-semaphore-pool-size 配置池大小。==525867== 分析结果:未分析内核。" 我正在使用 NVIDIA GeForce GPU。
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <cuda_profiler_api.h>
__global__ void matrixInit(float *m, int N_1, int N_2, int value){
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int strideX = blockDim.x * gridDim.x;
unsigned int strideY = blockDim.y * gridDim.y;
for(int j=iy; j<N_2; j+=strideY){
for(int i=ix; i<N_1; i+=strideX){
m[j*N_1+i] = value;
}
}
}
__global__ void matrixAdd(float *d_A, float *d_B, float *d_C, int N_1, int N_2){
// indexes and strides in 2d
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int strideX = blockDim.x * gridDim.x;
unsigned int strideY = blockDim.y * gridDim.y;
for(int j=iy; j<N_2; j+=strideY){
for(int i=ix; i<N_1; i+=strideX){
d_C[i] = d_A[j*N_1+i]+d_B[j*N_1+i];
}
}
}
int main() {
int N_1 = 1 << 12;
int N_2 = 1 << 15;
//Size
int N_1_2 = N_1 * N_2;
// host memory pointers
float *A, *B, *C;
// device memory pointers
float *d_A, *d_B, *d_C;
clock_t t = clock();
size_t bytes = N_1_2*sizeof(float);
// allocate host memory
A = (float*)malloc(bytes);
B = (float*)malloc(bytes);
C = (float*)malloc(bytes);
//set dimensions for 1d
int threadsPerBlock=32;
dim3 threads(threadsPerBlock,threadsPerBlock);
dim3 numBlocks( N_1/threads.x, N_2/threads.y);
printf(" Grid Size of X: %d Grid Size of Y: %d \n ",threads.x,threads.y);
//Initialize
matrixInit<<<numBlocks,threads>>>(A,N_1, N_2, 1.0f);
matrixInit<<<numBlocks,threads>>>(B,N_1, N_2, 2.0f);
matrixInit<<<numBlocks,threads>>>(C,N_1, N_2, 0.0f);
//allocated device memory
cudaMalloc(&d_A, bytes);
cudaMalloc(&d_B, bytes);
cudaMalloc(&d_C, bytes);
//copy to device
cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);
matrixAdd<<<numBlocks,threads>>>(d_A, d_B, d_C, N_1, N_2);
//copy back to host
cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);
t = clock() - t;
printf("Program executed at %f seconds", ((float)t) / CLOCKS_PER_SEC);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaProfilerStop();
return 0;
}
在 cuda c 中使用矩阵加法,代码执行但在使用 nvprof 对其进行分析时。它说没有内核分析。