最终更新:问题已解决。WDDM 超时也是原因之一。解决方案见:WDDM timeout fix。谢谢罗伯特。
更新:感谢罗伯特指出过滤器的中心不是 0,0。不幸的是,如果增大过滤器尺寸(例如 17x17),您发布的代码在我这里仍会出错。这可能是因为您没有考虑图像“两侧”的边框。无论如何,下面是最新的代码,但它仍然表现出与以前相同的问题......
//npp
#include "npp.h"
#include "nppi.h"
#include "device_launch_parameters.h"
#include <iostream>
int main() {
//Image size.
int imageWidth = 6592;
int imageHeight = 4400;
//Misc.
int bytesPerPixel = 2;
int totalPixels = imageWidth*imageHeight;
int filterSize = 17;
int halfFilter = filterSize/2;
cudaError success2;
NppStatus success1;
//Mask & Origin for CUDA.
NppiSize cudaMask;
cudaMask.height = filterSize;
cudaMask.width = filterSize;
NppiPoint cudaAnchor;
cudaAnchor.x = halfFilter;
cudaAnchor.y = halfFilter;
//ROI for CUDA.
int left = halfFilter;
int right = (imageWidth-1) - halfFilter;
int top = halfFilter;
int bot = (imageHeight-1) - halfFilter;
NppiSize cudaROI;
cudaROI.height = bot - top;
cudaROI.width = right - left;
//Step size.
int step = imageWidth * bytesPerPixel;
//Create a new "image".
unsigned short* image = new unsigned short[totalPixels];
for(int i=0; i<imageWidth; i++)
for(int j=0; j<imageHeight; j++)
image[j*imageWidth+i] = 10;
//Allocate mem on device.
Npp16u *dSrc, *dDst;
Npp8u *dBuf;
Npp32u bufferSize;
//This call always returns a bufferSize==0. That doesn't seem right...
success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
std::cout << bufferSize << std::endl;
success2 = cudaMalloc( (void**)&dBuf, bufferSize);
std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;
//Copy host image to device.
success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;
//Copy source to destination.
success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, cudaROI);
std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;
//Filter.
Npp32u offset = top*step + left*bytesPerPixel;
success1 = nppiFilterMedian_16u_C1R( dSrc + offset,
step,
dDst + offset,
step,
cudaROI, cudaMask, cudaAnchor, dBuf);
std::cout << "npp Filter returned: " << (int)success1 << std::endl;
//Copy resultant back to host.
success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;
//Clean.
success2 = cudaFree(dDst);
success2 = cudaFree(dBuf);
success2 = cudaFree(dSrc);
delete image;
system("pause");
return 0;
}
我正在尝试对一幅 29 MP 的图像做中值滤波。过滤器尺寸为 13x13。图像的宽度和高度如下所示。由于未知原因,以下代码会崩溃,我想问是否有人知道为什么?
我注意到的奇怪的事情:
错误发生在 nppiFilterMedian_16u_C1R() 处。该函数本身返回的状态是“无错误”,但紧随其后的 cudaMemcpy() 会报错。如果不执行滤波,cudaMemcpy() 工作正常。
此外,获取 16 位过滤器的缓冲区大小始终返回大小为 0。我测试了 8 位和 32 位,它们返回非零值...
我认为这可能是 NPPI 库的一个错误(?)。它似乎与尺寸有关(如果减小图像的宽度/高度,13x13 的过滤器尺寸就可以正常工作)。但是,我的过滤器尺寸需要达到 31x31。
其他重要信息:Windows x64 应用程序、CUDA 运行时 7.5、NPP 版本 7.5。GPU 设备是 Quadro K2200(4GB 全局内存)。