我想在 C++ 中理解并重现 CUDA NPP 中双线性插值函数的计算结果,但无论怎么尝试,都无法得到完全一致的结果。我读到过纹理插值使用一种特殊的定点小数格式,但我认为它在这里并不适用,因为按这种方式计算得到的结果反而更差。尝试用不同的公式来计算插值本身也没有帮助。NPP 究竟是如何进行双线性插值的?下面是测试代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"
#include <cmath>
#include <iomanip>
#include <iostream>
#include <vector>
int main() {
const NppLibraryVersion* libVer = nppGetLibVersion();
std::cout << "NPP Library Version " << libVer->major << "." << libVer->minor << "." << libVer->build << std::endl;
//sizes and dimensions
int w = 10;
int h = 4;
int wbytes = w * sizeof(double);
size_t siz = wbytes * h;
//setup test data
std::vector<double> input(w * h);
std::vector<double> xmap(w * h);
std::vector<double> ymap(w * h);
for (int i = 0; i < input.size(); i++) {
input[i] = (double) (i) * 10.0;
xmap[i] = (double) (i % w) / 10.0;
ymap[i] = (double) (i / w) / 10.0;
}
std::vector<double> output(w * h);
//prepare device memory
double* d_input;
cudaMalloc(&d_input, siz);
cudaMemcpy(d_input, input.data(), siz, cudaMemcpyDefault);
double* d_xmap;
cudaMalloc(&d_xmap, siz);
cudaMemcpy(d_xmap, xmap.data(), siz, cudaMemcpyDefault);
double* d_ymap;
cudaMalloc(&d_ymap, siz);
cudaMemcpy(d_ymap, ymap.data(), siz, cudaMemcpyDefault);
double* d_output;
cudaMalloc(&d_output, siz);
//NPPI remap function
NppiSize roi = { w, h };
NppiRect rect = { 0, 0, w, h };
nppiRemap_64f_C1R(d_input, roi, wbytes, rect, d_xmap, wbytes, d_ymap, wbytes, d_output, wbytes, roi, NPPI_INTER_LINEAR);
cudaMemcpy(output.data(), d_output, siz, cudaMemcpyDefault);
//check results
std::cout << std::setprecision(18);
for (int k = 0; k < h; k++) {
for (int i = 0; i < w; i++) {
int idx = k * w + i;
double x = xmap[idx];
double y = ymap[idx];
double flx = std::floor(x), fly = std::floor(y);
double dx = x - flx, dy = y - fly;
int ix = (int) flx, iy = (int) fly;
double z00 = input[iy * w + ix];
double z01 = input[iy * w + ix + 1];
double z10 = input[iy * w + w + ix];
double z11 = input[iy * w + w + ix + 1];
double resultHost = z00 + dy * (z10 - z00) + dx * (z01 - z00) + dx * dy * (z00 + z11 - z10 - z01);
double resultDevice = output[idx];
double delta = std::abs(resultHost - resultDevice);
if (delta != 0.0) {
std::cout << "[" << k << " " << i << "] " << delta << std::endl;
std::cout << "x " << x << std::endl << "y " << y << std::endl;
std::cout << "NPP " << resultDevice << std::endl << "CPU " << resultHost << std::endl;
}
}
}
cudaError_t err = cudaGetLastError();
std::cout << std::endl << "cuda error: " << cudaGetErrorString(err) << std::endl;
return err;
}
程序输出如下:
NPP Library Version 11.5.1
[2 4] 1.9073486328125e-06
x 0.400000000000000022
y 0.200000000000000011
NPP 24.0000019073486328
CPU 24
[2 9] 1.9073486328125e-06
x 0.900000000000000022
y 0.200000000000000011
NPP 29.0000019073486328
CPU 29
[3 0] 1.9073486328125e-06
x 0
y 0.299999999999999989
NPP 30.0000019073486328
CPU 30
[3 1] 1.9073486328125e-06
x 0.100000000000000006
y 0.299999999999999989
NPP 31.0000019073486328
CPU 31
[3 7] 3.814697265625e-06
x 0.699999999999999956
y 0.299999999999999989
NPP 37.0000038146972656
CPU 37
cuda error: no error