当我将 cublasIsamax 与常规内存分配器一起使用时 - 它工作正常。
int FindMaxIndex( const float* pVector, const size_t length )
{
int result = 0;
float* pDevVector = nullptr;
if( CUBLAS_STATUS_SUCCESS != ::cudaMalloc( (void**)&pDevVector, length * sizeof(float) ) )
{
return -1;
}
if( CUBLAS_STATUS_SUCCESS != ::cudaMemcpy( pDevVector, pVector, length * (int)sizeof(float), cudaMemcpyHostToDevice) )
{
return -2;
}
::cublasIsamax_v2( g_handle, length, pDevVector, 1, &result);
if( nullptr != pDevVector )
{
::cudaFree( pDevVector );
}
return result;
}
但是,如果尝试使用常量内存,它会失败并出现未知错误 N14。怎么了?复制到常量内存成功,但执行失败。
__constant__ float c_pIndex[ 255 ] = {0x00};
// the same function as GetIsMax but using CUBLAS function cublasIsamax_v2
int FindMaxIndexConst( const float* pVector, const size_t length, pfnMsg fnMsg )
{
int result = 0;
cudaError_t code = ::cudaMemcpyToSymbol( c_pIndex, pVector, length * sizeof(float), 0, cudaMemcpyHostToDevice );
if( cudaSuccess != code )
{
const char* szMsg = ::cudaGetErrorString ( code );
LogError3( L"[%d] [%hs] Could not allocate CUDA memory: %I64d pDevA", code, szMsg, (__int64)(length * sizeof(float)));
}
cublasStatus_t status = ::cublasIsamax_v2( g_handle, length, c_pIndex, 1, &result);
if( CUBLAS_STATUS_SUCCESS != status )
{
LogError2( L" [%d] Failed to execute <cublasIsamax_v2> : %I64d", status, (__int64)length );
}
return result;
}