cuda - 在 CUDA npp 中使用多个流

Question

我想让代码同时旋转图像。
首先，我使用 nppiRotate_8u_C1R 检查了代码在默认流版本中运行良好，并将其更改为 4 流版本。32 个图像被复制到 4 个流中并使用 nppiRotate_8u_C1R_Ctx 旋转。

输入图像的大小为 2048*2048，但结果图像看起来与默认流版本不同。
我用 Nsight 系统分析了这两个代码，发现内核启动配置彼此不同。

在默认流版本中，线程数等于像素数（左）。
但是，在 4 流版本中，线程数为 128*128，远小于输入图像的大小（右）。
旋转后的图像仅出现在左上角的 128*128 区域。

我该如何解决？或者有没有人遇到过和我一样的问题？

操作系统：Windows 10
GPU：RTX 3090
CUDA 版本：11.1

#include <cstdlib>
#include <iostream>
#include <stdio.h>

#include "cuda_runtime.h"
#include "helper_cuda.h"
#include "helper_functions.h"

#include <opencv2/opencv.hpp>

#include <nppi.h>
#include <nppcore.h>
#include <nppdefs.h>

using namespace std;
using namespace cv;

// Forward declaration; the definition follows main().
// Reads the image at `filename` and reports its width and height through `w` and `h`.
Mat read_BMP_opencv(char* filename, int& w, int& h);

int main()
{
    int f_width, f_height;
    char buf[256];

    cudaEvent_t start, stop;
    float  elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int img_num = 32; 
    const int stream_num = 4;
    int n_iter = 50; 

    cudaStream_t stream[stream_num];
    NppStreamContext nppStreamContext[stream_num];
    cudaDeviceProp oDeviceProperties;

    for (int n = 0; n < stream_num; n++)
    {
        cudaStreamCreate(&stream[n]);
        nppStreamContext[n].hStream = stream[n];

        cudaDeviceGetAttribute(&nppStreamContext[n].nCudaDevAttrComputeCapabilityMajor,
            cudaDevAttrComputeCapabilityMajor,
            nppStreamContext[n].nCudaDeviceId);

        cudaDeviceGetAttribute(&nppStreamContext[n].nCudaDevAttrComputeCapabilityMinor,
            cudaDevAttrComputeCapabilityMinor,
            nppStreamContext[n].nCudaDeviceId);

        cudaStreamGetFlags(nppStreamContext[n].hStream, &nppStreamContext[n].nStreamFlags);

        cudaGetDevice(&nppStreamContext[n].nCudaDeviceId);
        cudaGetDeviceProperties(&oDeviceProperties, nppStreamContext[n].nCudaDeviceId);

        nppStreamContext[n].nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
        nppStreamContext[n].nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
        nppStreamContext[n].nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
        nppStreamContext[n].nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
    }

    cv::Mat::setDefaultAllocator(cv::cuda::HostMem::getAllocator(cv::cuda::HostMem::AllocType::PAGE_LOCKED));

    double angle = 3.8;

    ///////////////////////////*********************** Memory Allocation ***********************/////////////////////////

    NppiSize srcSize = { f_width, f_height };
    NppiRect srcROI = { 0, 0, f_width, f_height };
    NppiRect dstROI = { 0, 0, f_width, f_height };

    Mat* img = new Mat[img_num];
    Mat* result = new Mat[img_num];

    Npp8u* *data = new Npp8u*[img_num];
    Npp8u* *d_data = new Npp8u*[img_num];
    Npp8u* *d_rotated = new Npp8u*[img_num];
    Npp8u* *h_result = new Npp8u*[img_num];



    for (int i = 0; i < img_num; i++)
    {
        img[i] = read_BMP_opencv("input_images/test_2048_2.bmp", f_width, f_height);
        data[i] = img[i].data;
    }

    for (int i = 0; i < img_num; i++)
    {
        cudaMalloc((void**)&d_data[i], sizeof(Npp8u)*f_width*f_height);
        cudaMalloc((void**)&d_rotated[i], sizeof(Npp8u)*f_width*f_height);
        cudaMallocHost((void**)&h_result[i], sizeof(Npp8u)*f_width*f_height);

    }

    ///////////////////////////*********************** NPP rotation ***********************/////////////////////////
    cudaEventRecord(start, 0);
    for (int n = 0; n < n_iter; n++)
    {
        for (int i = 0; i < int(img_num / stream_num); i++)
        {
            for (int j = 0; j < stream_num; j++)
            {
                cudaMemcpyAsync(d_data[i*stream_num+j], data[i*stream_num + j], sizeof(Npp8u)*f_width*f_height, cudaMemcpyHostToDevice, stream[j]);

                // NPP 10.2 and beyond contain an additional element in the NppStreamContext structure
                nppiRotate_8u_C1R_Ctx(d_data[i*stream_num + j], srcSize, f_width, srcROI, d_rotated[i*stream_num + j], f_width, dstROI, angle, 0, 0, NPPI_INTER_LINEAR, nppStreamContext[j]);

                cudaMemcpyAsync(h_result[i*stream_num + j], d_rotated[i*stream_num + j], sizeof(Npp8u)*f_width*f_height, cudaMemcpyDeviceToHost, stream[j]);
            }
        }
    }
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    printf("Average Rotation Time: %3.1f ms\n", elapsedTime / n_iter);

    for (int i = 0; i < img_num; i++)
    {
        result[i] = Mat(f_height, f_width, CV_8UC1);
        result[i].data = h_result[i];

        sprintf(buf, "output_images/test2048_rotated_nppi_%d.bmp", i);
        imwrite(buf, result[i]);
    }

    return 0;

}

// Loads the image at `filename` through imread with flag 0 and returns it.
// On success, `w` receives the image's column count and `h` its row count.
// Throws the C string "Argument Exception" when imread yields an empty image;
// `w` and `h` are left untouched in that case.
Mat read_BMP_opencv(char* filename, int& w, int& h)
{
    Mat loaded = imread(filename, 0);

    if (!loaded.empty())
    {
        // Report the dimensions straight from the decoded matrix.
        w = loaded.cols;
        h = loaded.rows;
        return loaded;
    }

    throw "Argument Exception";
}

cuda - 在 CUDA npp 中使用多个流

0 回答 0

Related

Reference