0

nvprof 配置文件的 API 就好了。但是说没有内核被分析。它显示这 2 条警告消息“ ==525867== 警告:由于设备缓冲区空间不足,4 条记录的时间戳无效。您可以使用选项 --device-buffer-size 配置缓冲区空间。==525867== 警告:由于信号量池大小不足,1 条记录的时间戳无效。您可以使用选项 --profiling-semaphore-pool-size 配置池大小。==525867== 分析结果:未分析内核。" 我正在使用 NVIDIA GeForce GPU。

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <cuda_profiler_api.h>



__global__ void matrixInit(float *m, int N_1, int N_2, int value){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            m[j*N_1+i] = value;
        }
    }
}


__global__ void matrixAdd(float *d_A, float *d_B, float *d_C, int N_1, int N_2){
    // indexes and strides in 2d

    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            d_C[i] = d_A[j*N_1+i]+d_B[j*N_1+i];
        }
    }
}

int main() {


    int N_1 = 1 << 12;
    int N_2 = 1 << 15;


    //Size

 int N_1_2 = N_1 * N_2;

 // host memory pointers
    float *A, *B, *C;

 // device memory pointers
    float *d_A, *d_B, *d_C;

  clock_t t = clock();

  size_t bytes = N_1_2*sizeof(float);

// allocate host memory
    A = (float*)malloc(bytes);
    B = (float*)malloc(bytes);
    C = (float*)malloc(bytes);


//set  dimensions for 1d

int threadsPerBlock=32;
dim3 threads(threadsPerBlock,threadsPerBlock);
dim3 numBlocks( N_1/threads.x, N_2/threads.y);
printf(" Grid Size of X: %d Grid Size of Y: %d \n ",threads.x,threads.y);


//Initialize
    matrixInit<<<numBlocks,threads>>>(A,N_1, N_2, 1.0f);
    matrixInit<<<numBlocks,threads>>>(B,N_1, N_2, 2.0f);
    matrixInit<<<numBlocks,threads>>>(C,N_1, N_2, 0.0f);



   //allocated device memory


    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    //copy to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);


    matrixAdd<<<numBlocks,threads>>>(d_A, d_B, d_C, N_1, N_2);

    //copy back to host
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    t = clock() - t;


    printf("Program executed at %f seconds", ((float)t) / CLOCKS_PER_SEC);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

cudaProfilerStop();


    return 0;
    }
 

在 cuda c 中使用矩阵加法,代码执行但在使用 nvprof 对其进行分析时。它说没有内核分析。

4

0 回答 0