我正在编写快速的 JPEG 读取代码,它将作为一个更大项目的一部分。我决定在这项任务中同时使用 CUDA 和 NPP,因为 NPP 已经实现了所需的全部编码和解码功能。一切正常,直到我尝试使用 nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R_NEW 函数执行逆 DCT(IDCT)。这似乎破坏了程序的状态:运行 IDCT 之后,多个 cudaFree 调用都返回 cudaErrorLaunchFailure。使用 NSIGHT CUDA 调试器时,我可以看到启动 IDCT 函数时报告了 CUDA Grid launch failed 错误。可能的原因是什么?如果不使用 NSIGHT CUDA 调试器,IDCT 函数会返回 NPP_NO_ERROR,但设备指针仍然会被损坏。我附上了我认为相关的代码片段,如有需要可以提供更多。我感觉自己可能在某个地方把指针弄混了。不过,我已经花了大量时间在 VS 调试器中检查和调试主机端内存。
实际IDCT部分:
/**
 * Uploads the quantization tables and the host-side DCT coefficients
 * (m_hostDCT) to the device, then runs the NPP inverse DCT once per component.
 *
 * @param file       Parsed JPEG; supplies m_quantizationTables and the
 *                   per-component m_frameHeader.quantizationSelector values.
 * @param pDCTState  NPP DCT state forwarded unchanged to every IDCT call.
 *                   NOTE(review): presumably created with nppiDCTInitAlloc —
 *                   confirm it is initialised before this function runs.
 * @param dataNPP    Array with at least m_numComponent entries holding the
 *                   device buffers, steps and ROI size of each component.
 *
 * The function returns void, so a failure cannot be propagated to the caller.
 * Instead the work stops at the first failing CUDA/NPP call (nothing further
 * is launched on top of a failed allocation or copy), and the temporary
 * device table buffer is always released. The status locals stay in scope
 * until the end so they can be inspected in a debugger.
 */
void CJPEGDecoder::InverseDCT(CJPGFile* file, NppiDCTState* pDCTState, CJPEGDeviceData* dataNPP)
{
    const auto tableCount = file->m_quantizationTables.size();
    if (tableCount == 0)
    {
        // No table to dequantize with; a zero-byte allocation would only
        // yield a pointer that must not be handed to the IDCT.
        return;
    }

    Npp8u* deviceQuantizationTables = nullptr;
    cudaError_t cudaStatus = cudaMalloc(&deviceQuantizationTables, 64 * tableCount);
    if (cudaStatus != cudaSuccess)
    {
        return;
    }

    // Each table is copied as 64 bytes; tables are packed back to back.
    for (unsigned int t = 0; cudaStatus == cudaSuccess && t < tableCount; ++t)
    {
        cudaStatus = cudaMemcpyAsync(deviceQuantizationTables + t * 64,
                                     file->m_quantizationTables.at(t).aTable,
                                     64, cudaMemcpyHostToDevice);
    }

    // Upload the DCT coefficients of every component.
    for (int i = 0; cudaStatus == cudaSuccess && i < m_numComponent; i++)
    {
        const int blockHeight = dataNPP[i].m_srcSize.height / 8;
        cudaStatus = cudaMemcpyAsync(dataNPP[i].m_devDCT, m_hostDCT[i],
                                     dataNPP[i].m_DCTStep * blockHeight,
                                     cudaMemcpyHostToDevice);
    }

    // Inverse DCT. Negative NppStatus values are errors; positive ones are
    // warnings and do not stop the loop.
    NppStatus DCTstatus = NPP_NO_ERROR;
    for (int i = 0; cudaStatus == cudaSuccess && DCTstatus >= NPP_NO_ERROR && i < m_numComponent; i++)
    {
        // The selector becomes an offset into the device buffer; an
        // out-of-range value would make the kernel read past the allocation.
        const unsigned int selector = static_cast<unsigned int>(file->m_frameHeader.quantizationSelector[i]);
        if (selector >= tableCount)
        {
            DCTstatus = NPP_BAD_ARGUMENT_ERROR;
            break;
        }

        DCTstatus = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R_NEW(dataNPP[i].m_devDCT, dataNPP[i].m_DCTStep,
                                                            dataNPP[i].m_srcImage, dataNPP[i].m_srcImageStep,
                                                            deviceQuantizationTables + selector * 64,
                                                            dataNPP[i].m_srcSize,
                                                            pDCTState);
    }

    // Wait for the queued copies/kernels so that an asynchronous launch
    // failure is reported here rather than by an unrelated later call.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    const cudaError_t freeStatus = cudaFree(deviceQuantizationTables);

    // Kept only for debugger inspection (see function comment).
    (void)DCTstatus;
    (void)syncStatus;
    (void)freeStatus;
}
释放霍夫曼表时报告错误:
void CJPEGDecoder::HuffmanDealloc()
{
NppStatus DCerror, ACerror;
cudaError_t error;
for (int i = 0; i < m_numComponent; i++)
{
DCerror = nppiDecodeHuffmanSpecFreeHost_JPEG(apHuffmanDCTable[i]); //NPP_OK
ACerror = nppiDecodeHuffmanSpecFreeHost_JPEG(apHuffmanACTable[i]); //NPP_OK
error = cudaFreeHost(m_hostDCT[i]); // cudaErrorLaunchFailure if DCT was launched, cudaSuccess otherwise
}
}
销毁 CJPEGDeviceData 时报告的错误:
void CJPEGDeviceData::ClearData()
{
cudaError_t errorDCT, errorImg;
m_DCTStep = 0;
m_srcImageStep = 0;
errorDCT = cudaFree(m_devDCT); // cudaErrorLaunchFailure if DCT was launched, cudaSuccess otherwise
errorImg = cudaFree(m_srcImage); // cudaErrorLaunchFailure if DCT was launched, cudaSuccess otherwise
m_allocated = false;
}
DCT 计算的实际调用及其上下文:
/**
 * Drives one JPEG decode: allocates one CJPEGDeviceData per component,
 * determines the maximum horizontal/vertical sampling factors, configures the
 * decoder, then runs Huffman allocation, Huffman decoding, the inverse DCT
 * and the Huffman deallocation, in that order.
 */
void CJPEGWrapper::DecodeJPG()
{
    const int numComponents = m_JPGFile->m_frameHeader.numberOfComponents;
    m_deviceData = new CJPEGDeviceData[numComponents];

    // In the JPEG frame header the horizontal sampling factor is the HIGH
    // nibble of the sampling byte and the vertical factor is the LOW nibble.
    // Reading them the other way round yields swapped maxima for asymmetric
    // subsampling (e.g. 4:2:2), which in turn mis-sizes the MCU geometry.
    // NOTE(review): assumes samplingFactor[i] is the raw byte from the SOF
    // segment — confirm against the parser and DecodeMCU.
    uint8_t maxV{ 0 }, maxH{ 0 };
    for (int i = 0; i < numComponents; i++)
    {
        const uint8_t compH = m_JPGFile->m_frameHeader.samplingFactor[i] >> 4;
        const uint8_t compV = m_JPGFile->m_frameHeader.samplingFactor[i] & 0x0F;
        if (compH > maxH)
            maxH = compH;
        if (compV > maxV)
            maxV = compV;
    }

    m_JPGdecoder.SetImgSize(m_JPGFile->m_frameHeader.width, m_JPGFile->m_frameHeader.height, numComponents);
    m_JPGdecoder.SetMaxMCUSize(maxH, maxV);
    for (int i = 0; i < numComponents; i++)
    {
        m_JPGdecoder.DecodeMCU(m_JPGFile->m_frameHeader.samplingFactor[i], m_deviceData[i]);
    }

    m_JPGdecoder.HuffmanAlloc(m_JPGFile);
    m_JPGdecoder.HuffmanDecode(m_JPGFile, m_deviceData);
    m_JPGdecoder.InverseDCT(m_JPGFile, m_pDCTState, m_deviceData); // IDCT is launched here
    m_JPGdecoder.HuffmanDealloc();
}
CJPEGDeviceData 类:
// Per-component bundle of device buffers and their geometry used by the
// JPEG decoder (one instance per colour component, see DecodeJPG).
//
// NOTE(review): a copy constructor, move constructor and destructor are
// declared but no assignment operators. If the destructor releases the
// device pointers, the implicitly generated copy assignment would share them
// between two objects — confirm assignment is never used, or declare it.
class CJPEGDeviceData
{
public:
// NOTE(review): not used in the visible code; presumably the component size in 8x8 blocks — confirm.
NppiSize m_blockSize;
// Size passed to the inverse DCT call; height / 8 is used as the number of block rows.
NppiSize m_srcSize;
// DCT coefficient buffer: IDCT input, released with cudaFree in ClearData().
Npp16s* m_devDCT;
// Step passed with m_devDCT to the IDCT; also multiplied by the block-row count to size the upload.
Npp32s m_DCTStep;
// Image buffer written by the IDCT, released with cudaFree in ClearData().
Npp8u* m_srcImage;
// Step passed with m_srcImage to the IDCT.
Npp32s m_srcImageStep;
public:
CJPEGDeviceData();
CJPEGDeviceData(const CJPEGDeviceData& object);
CJPEGDeviceData(CJPEGDeviceData&& object);
~CJPEGDeviceData();
// NOTE(review): presumably allocates m_devDCT / m_srcImage for the given block count — definition not shown.
void AllocDevicePointers(NppiSize blocksSize);
// Frees both device buffers, zeroes the steps and clears the allocated flag.
void ClearData();
bool IsAllocated() const;
private:
// Set to false by ClearData(); presumably set to true by AllocDevicePointers().
bool m_allocated;
};
谁能帮我弄清楚发生了什么,以及我哪里做错了?即使运行了有问题的 IDCT 部分,cuda-memcheck 也不会报告任何错误,我只能在 VS 调试器中观察到错误。我相信文件读取本身工作正常——我已经对它做了很多测试,所以初始数据应该没有问题,问题是从设备端数据开始出现的。另外补充一点:在启用 IDCT 的情况下启动 CUDA 分析器会使应用程序崩溃,并以非零退出代码结束;不启用 IDCT 时则运行正常。