我的程序使用 CUDA 基数排序类。从 CUDA 4.0 更新到 4.2 后,类辅助初始化函数正在崩溃,并显示消息“围绕变量 'devprop' 的堆栈已损坏”。我已经隔离了注释一些函数代码的问题,发现 cudaGetDeviceProperties 正在破坏 devprop。我只是不知道为什么会发生这种情况以及如何解决问题。我的设置是 CUDA 4.2,开发驱动程序 301.32,Nsight 2.2,Windows 7 64 位,为 Win32 编译。以下代码段具有崩溃的 initDeviceParameters() 辅助函数:
namespace nvRadixSort
{
#include "radixsort.h"
#include "cudpp/cudpp.h"
#include <stdio.h>
#include <assert.h>
bool bManualCoalesce = false;
bool bUsePersistentCTAs = false;
void initDeviceParameters(bool keysOnly)
{
int deviceID = -1;
if(cudaSuccess == cudaGetDevice(&deviceID))
{
cudaDeviceProp devprop;
cudaGetDeviceProperties(&devprop, deviceID);
int smVersion = devprop.major * 10 + devprop.minor;
// sm_12 and later devices don't need help with coalesce in reorderData kernel
bManualCoalesce = (smVersion < 12);
bUsePersistentCTAs = (smVersion < 20);
if(bUsePersistentCTAs)
{
//Irrelevant. My setup is 2.1
}
}
}
}
这是相关的类代码:
#include <cuda_runtime_api.h>
#include "cudpp/cudpp.h"
namespace nvRadixSort
{
class RadixSort
{
public:
RadixSort(unsigned int maxElements, bool keysOnly = false)
: mScanPlan(0),
mNumElements(0),
mTempKeys(0),
mTempValues(0),
mCounters(0),
mCountersSum(0),
mBlockOffsets(0)
{
// Allocate temporary storage
initialize(maxElements, keysOnly);
}
protected: // data
CUDPPHandle mCudppContext;
CUDPPHandle mScanPlan; // CUDPP plan handle for prefix sum
unsigned int mNumElements; // Number of elements of temp storage allocated
unsigned int *mTempKeys; // Intermediate storage for keys
unsigned int *mTempValues; // Intermediate storage for values
unsigned int *mCounters; // Counter for each radix
unsigned int *mCountersSum; // Prefix sum of radix counters
unsigned int *mBlockOffsets; // Global offsets of each radix in each block
protected: // methods
void initialize(unsigned int numElements, bool keysOnly)
{
// initialize parameters based on present CUDA device
initDeviceParameters(keysOnly);
// Allocate temporary storage
mNumElements = numElements;
unsigned int numBlocks = ((numElements % (CTA_SIZE * 4)) == 0) ?
(numElements / (CTA_SIZE * 4)) : (numElements / (CTA_SIZE * 4) + 1);
unsigned int numBlocks2 = ((numElements % (CTA_SIZE * 2)) == 0) ?
(numElements / (CTA_SIZE * 2)) : (numElements / (CTA_SIZE * 2) + 1);
// Initialize scan
cudppCreate(&mCudppContext);
CUDPPConfiguration scanConfig;
scanConfig.algorithm = CUDPP_SCAN;
scanConfig.datatype = CUDPP_UINT;
scanConfig.op = CUDPP_ADD;
scanConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
cudppPlan(mCudppContext , &mScanPlan, scanConfig, 16 * numBlocks2, 1, 0);
cudaMalloc((void **)&mTempKeys, numElements * sizeof(unsigned int));
if(!keysOnly)
cudaMalloc((void **)&mTempValues, numElements * sizeof(unsigned int));
cudaMalloc((void **)&mCounters, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
cudaMalloc((void **)&mCountersSum, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
cudaMalloc((void **)&mBlockOffsets, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
checkCudaError("RadixSort::initialize()");
}
}