
I want to understand and reproduce in plain C++ the results of the CUDA NPP functions that perform bilinear interpolation, but no matter what I try I cannot match them. I have read about the special fixed-point fraction format that textures use for interpolation, but I don't think it applies here, because emulating it gives me even worse results. Trying different formulations of the interpolation itself doesn't help either. How does NPP do its bilinear interpolation? Some code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"

#include <cmath>
#include <iostream>
#include <iomanip>
#include <vector>

int main() {
    const NppLibraryVersion* libVer = nppGetLibVersion();
    std::cout << "NPP Library Version " << libVer->major << "." << libVer->minor << "." << libVer->build << std::endl;

    //sizes and dimensions
    int w = 10;
    int h = 4;
    int wbytes = w * sizeof(double);
    size_t siz = wbytes * h;

    //setup test data
    std::vector<double> input(w * h);
    std::vector<double> xmap(w * h);
    std::vector<double> ymap(w * h);
    for (int i = 0; i < input.size(); i++) {
        input[i] = (double) (i) * 10.0;
        xmap[i] = (double) (i % w) / 10.0;
        ymap[i] = (double) (i / w) / 10.0;
    }
    std::vector<double> output(w * h);

    //prepare device memory
    double* d_input;
    cudaMalloc(&d_input, siz);
    cudaMemcpy(d_input, input.data(), siz, cudaMemcpyDefault);
    double* d_xmap;
    cudaMalloc(&d_xmap, siz);
    cudaMemcpy(d_xmap, xmap.data(), siz, cudaMemcpyDefault);
    double* d_ymap;
    cudaMalloc(&d_ymap, siz);
    cudaMemcpy(d_ymap, ymap.data(), siz, cudaMemcpyDefault);
    double* d_output;
    cudaMalloc(&d_output, siz);

    //NPPI remap function
    NppiSize roi = { w, h };
    NppiRect rect = { 0, 0, w, h };
    nppiRemap_64f_C1R(d_input, roi, wbytes, rect, d_xmap, wbytes, d_ymap, wbytes, d_output, wbytes, roi, NPPI_INTER_LINEAR);
    cudaMemcpy(output.data(), d_output, siz, cudaMemcpyDefault);

    //check results
    std::cout << std::setprecision(18);
    for (int k = 0; k < h; k++) {
        for (int i = 0; i < w; i++) {
            int idx = k * w + i;
            double x = xmap[idx];
            double y = ymap[idx];

            double flx = std::floor(x), fly = std::floor(y);
            double dx = x - flx, dy = y - fly;
            int ix = (int) flx, iy = (int) fly;

            //four neighbouring samples around (x, y)
            double z00 = input[iy * w + ix];
            double z01 = input[iy * w + ix + 1];
            double z10 = input[(iy + 1) * w + ix];
            double z11 = input[(iy + 1) * w + ix + 1];
            //bilinear blend of the four samples in double precision
            double resultHost = z00 + dy * (z10 - z00) + dx * (z01 - z00) + dx * dy * (z00 + z11 - z10 - z01);
            double resultDevice = output[idx];
            double delta = std::abs(resultHost - resultDevice);

            if (delta != 0.0) {
                std::cout << "[" << k << " " << i << "] " << delta << std::endl;
                std::cout << "x " << x << std::endl << "y " << y << std::endl;
                std::cout << "NPP " << resultDevice << std::endl << "CPU " << resultHost << std::endl;
            }
        }
    }

    cudaError_t err = cudaGetLastError();
    std::cout << std::endl << "cuda error: " << cudaGetErrorString(err) << std::endl;
    return err;
}

which produces the following output:

NPP Library Version 11.5.1
[2 4] 1.9073486328125e-06
x 0.400000000000000022
y 0.200000000000000011
NPP 24.0000019073486328
CPU 24
[2 9] 1.9073486328125e-06
x 0.900000000000000022
y 0.200000000000000011
NPP 29.0000019073486328
CPU 29
[3 0] 1.9073486328125e-06
x 0
y 0.299999999999999989
NPP 30.0000019073486328
CPU 30
[3 1] 1.9073486328125e-06
x 0.100000000000000006
y 0.299999999999999989
NPP 31.0000019073486328
CPU 31
[3 7] 3.814697265625e-06
x 0.699999999999999956
y 0.299999999999999989
NPP 37.0000038146972656
CPU 37

cuda error: no error
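For completeness, this is roughly what I mean by the fixed fraction format: my own attempt at emulating it, based on the CUDA Programming Guide's statement that texture linear filtering stores the weights in 9-bit fixed point with 8 fractional bits. Whether NPP uses anything like that for 64f data is exactly what I don't know; this standalone check only reproduces the [2 4] pixel from the output above.

#include <cmath>
#include <iomanip>
#include <iostream>

//quantize an interpolation weight to 8 fractional bits, as texture filtering is documented to do
static double quantizeFrac(double f) {
    return std::floor(f * 256.0 + 0.5) / 256.0;
}

int main() {
    //same neighbourhood as pixel [2 4] above: x = 0.4, y = 0.2, samples 0, 10, 100, 110
    double dx = quantizeFrac(0.4), dy = quantizeFrac(0.2);
    double z00 = 0.0, z01 = 10.0, z10 = 100.0, z11 = 110.0;
    double r = z00 + dy * (z10 - z00) + dx * (z01 - z00) + dx * dy * (z00 + z11 - z10 - z01);
    std::cout << std::setprecision(10) << r << std::endl; //prints 23.90625 instead of the expected 24
}

That lands at 23.90625 for this pixel, i.e. off by almost 0.1, far worse than NPP's 24.0000019, which is why I wrote above that this approach only makes things worse.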
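And to illustrate what I mean by trying different formulations of the interpolation itself: regrouping the blend into a plain weighted sum of the four neighbours, for example, only changes the result by double rounding error, nowhere near the ~2e-6 offsets (again a minimal standalone check of the same [2 4] pixel):

#include <iomanip>
#include <iostream>

int main() {
    //same neighbourhood as pixel [2 4] above: x = 0.4, y = 0.2, samples 0, 10, 100, 110
    double dx = 0.4, dy = 0.2;
    double z00 = 0.0, z01 = 10.0, z10 = 100.0, z11 = 110.0;
    //weighted sum form instead of the incremental form used in the check loop
    double r = (1.0 - dy) * ((1.0 - dx) * z00 + dx * z01)
             +        dy  * ((1.0 - dx) * z10 + dx * z11);
    std::cout << std::setprecision(18) << r << std::endl; //agrees with 24 to machine precision
}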
