我用 NPP 写了一个 BoxFilter 的例子,但是输出的图像看起来坏了。这是我的代码:
#include <stdio.h>
#include <string.h>
#include <ImagesCPU.h>
#include <ImagesNPP.h>
#include <Exceptions.h>
#include <npp.h>
#include "utils.h"
void boxfilter1_transform( Npp8u *data, int width, int height ){
size_t size = width * height * 4;
// declare a host image object for an 8-bit RGBA image
npp::ImageCPU_8u_C4 oHostSrc(width, height);
Npp8u *nDstData = oHostSrc.data();
memcpy(nDstData, data, size * sizeof(Npp8u));
// declare a device image and copy construct from the host image,
// i.e. upload host to device
npp::ImageNPP_8u_C4 oDeviceSrc(oHostSrc);
// create struct with box-filter mask size
NppiSize oMaskSize = {3, 3};
// Allocate memory for pKernel
Npp32s hostKernel[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
Npp32s *pKernel;
checkCudaErrors( cudaMalloc((void**)&pKernel, oMaskSize.width * oMaskSize.height * sizeof(Npp32s)) );
checkCudaErrors( cudaMemcpy(pKernel, hostKernel, oMaskSize.width * oMaskSize.height * sizeof(Npp32s),
cudaMemcpyHostToDevice) );
Npp32s nDivisor = 9;
// create struct with ROI size given the current mask
NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1, oDeviceSrc.height() - oMaskSize.height + 1};
// allocate device image of appropriatedly reduced size
npp::ImageNPP_8u_C4 oDeviceDst(oSizeROI.width, oSizeROI.height);
// set anchor point inside the mask
NppiPoint oAnchor = {2, 2};
// run box filter
NppStatus eStatusNPP;
eStatusNPP = nppiFilter_8u_C4R(oDeviceSrc.data(), oDeviceSrc.pitch(),
oDeviceDst.data(), oDeviceDst.pitch(),
oSizeROI, pKernel, oMaskSize, oAnchor, nDivisor);
//printf("NppiFilter error status %d\n", eStatusNPP);
NPP_DEBUG_ASSERT(NPP_NO_ERROR == eStatusNPP);
// declare a host image for the result
npp::ImageCPU_8u_C4 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
memcpy(data, oHostDst.data(), size * sizeof(Npp8u));
return;
}
大部分代码是从示例 boxFilterNPP.cpp 复制而来的。和输出图像:http: //img153.imageshack.us/img153/7716/o8z.png
为什么会这样?