
When configured to run on multiple cards (I have a 4-GPU system with NVIDIA Titans), the following code reliably produces a segmentation fault after a while. It is a minimal example of a problem I tried to solve earlier (see here: Segmentation fault in __pthread_getspecific called from libcuda.so.1).

A description of what the code does:

First it allocates a large amount of memory on each card (on the first N cards, where N is 1-4); the idea is to put as much pressure on the cards as possible. It then launches 16 threads, each of which runs a few kernels on a card chosen at random. Each thread repeats this a few times, picking a card at random each time, and then exits. The main thread waits for all 16 threads to finish and then repeats the whole process indefinitely, or until one of the threads fails for some reason.

After a few iterations of the main loop (typically around 20, but it can be more or fewer), it segfaults. The stack usually looks like this:

#0  0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1  0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2  0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3  0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4  0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5  0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6  0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7  0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8  0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9  0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208

Full source code:

main.cpp:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <cuda_runtime.h>

class CriticalSection
{
    pthread_mutex_t cs;

public:

    CriticalSection();
    ~CriticalSection();

    void Lock( void );
    void Unlock( void );
};

CriticalSection::CriticalSection()
{
    assert( pthread_mutex_init( &cs, NULL ) == 0 );
}

CriticalSection::~CriticalSection()
{
    assert( pthread_mutex_destroy( &cs ) == 0 );
}

void CriticalSection::Lock( void )
{
    assert( pthread_mutex_lock( &cs ) == 0 );
}

void CriticalSection::Unlock( void )
{
    assert( pthread_mutex_unlock( &cs ) == 0 );
}

class DeviceWrapper
{
protected:

    CriticalSection m_cs;

public:

    int32_t     m_i32DeviceId;

    uint32_t*   m_pdu32Data;
    uint16_t*   m_pdu16Res;
    uint32_t    m_u32Count;

    DeviceWrapper();
    ~DeviceWrapper();

    void Lock( void );
    void Unlock( void );

    bool Init( const int32_t i32DevId, const uint32_t u32Count );
    bool Free();    
};

DeviceWrapper::DeviceWrapper()
{
    m_i32DeviceId = 0;
    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
}

DeviceWrapper::~DeviceWrapper()
{

}

void DeviceWrapper::Lock( void )
{
    m_cs.Lock();
}

void DeviceWrapper::Unlock( void )
{
    m_cs.Unlock();
}

bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
    if ( cudaSetDevice( i32DevId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to set device %d\n", i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %d\n", u32Count, i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %d\n", u32Count, i32DevId );
        return false;
    }

    m_u32Count = u32Count;
    m_i32DeviceId = i32DevId;   
    return true;
}

bool DeviceWrapper::Free()
{
    if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to set device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu32Data ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu16Res ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
    m_i32DeviceId = 0;
    return true;
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );

void* DoWork( void* pArg );

static bool bRun = true;

static DeviceWrapper devices[4];

int main( int argc, char* argv[] )
{
    if ( argc != 2 )
    {
        printf( "Usage: %s <number of cards to use>\n", argv[0] );
        return 1;
    }

    uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );

    if ( !u32CardsToUse || u32CardsToUse > 4 )
    {
        printf( "Invalid argument, must be in range 1-4\n" );
        return 2;
    }

    for ( int32_t i = 0; i < u32CardsToUse; i++ )
    {
        if ( !devices[i].Init( i, 0x20000000 ) )
        {
            for ( uint32_t j = 0; j < i; j++ )
            {
                devices[j].Free();
            }

            printf( "Failed to init device %d\n", i );
            return 3;
        }
    }

    uint32_t u32IterationsCompleted = 0;

    while ( bRun )
    {
        pthread_t pWorkers[ 16 ];
        memset( pWorkers, 0, 16 * sizeof( pthread_t ) );

        for ( uint32_t i = 0; i < 16; i++ )
        {       
            int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );

            if ( iReturnValue != 0 )
            {
                printf( "Error calling pthread_create: %d\n", iReturnValue );
                return 4;
            }
        }

        for ( uint32_t i = 0; i < 16; i++ )
        {
            pthread_join( pWorkers[i], NULL );
        }

        printf( "Iterations completed: %u\n", ++u32IterationsCompleted );
    }

    printf( "Finished\n" );
    fflush( stdout );
    return 0;
}

void* DoWork( void* pArg )
{
    uint32_t u32CardsToUse = (uint32_t)(uintptr_t)pArg;   // round-trip the value packed into the void* in main()

    uint32_t u32TestCount = (rand() % 4) + 4;

    for ( uint32_t i = 0; i < u32TestCount; i++ )
    {
        int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );

        devices[ i32DeviceId ].Lock();

        if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
        {
            printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)\n", i32DeviceId, i );
            bRun = false;
            devices[ i32DeviceId ].Unlock();
            return NULL;
        }

        devices[ i32DeviceId ].Unlock();
    }

    return NULL;
}

cuda_test.cu:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32File = pu32Data[gidx];                  // 32-bit value to find the log2 of 
        uint32_t u32FileLog = 0;                            // result of log2 will go here
        uint32_t u32Shift = 0;

        u32FileLog = (u32File > 0xFFFF) << 4;
        u32File >>= u32FileLog;
        u32Shift = (u32File > 0xFF) << 3;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0xF) << 2;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0x3) << 1;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32FileLog |= (u32File >> 1);

        uint32_t u32Other = gidx;           // 32-bit value to find the log2 of 
        uint32_t u32OtherLog = 0;                           // result of log2 will go here
        u32Shift = 0;

        u32OtherLog = (u32Other > 0xFFFF) << 4;
        u32Other >>= u32OtherLog;
        u32Shift = (u32Other > 0xFF) << 3;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0xF) << 2;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0x3) << 1;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32OtherLog |= (u32Other >> 1);

        if ( u32FileLog >= u32OtherLog )
        {
            pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
        }
        else
        {
            pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
        }

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32Value = 0;

        if ( pu32Data[gidx] >= gidx )
        {
            u32Value = pu32Data[gidx] - gidx;
        }
        else
        {
            u32Value = gidx - pu32Data[gidx];
        }

        uint32_t u32Log = 0;        // result of log2 will go here
        uint32_t u32Shift = 0;

        u32Log = (u32Value > 0xFFFF) << 4;
        u32Value >>= u32Log;
        u32Shift = (u32Value > 0xFF) << 3;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0xF) << 2;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0x3) << 1;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Log |= (u32Value >> 1);

        pu16Results[ gidx ] += (uint16_t)u32Log;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
        gidx += blockDim.x * gridDim.x;
    }
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{   
    for ( uint32_t i = 0; i < u32Iterations; i++ )
    {
        if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
        {
            return false;
        }

        if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
        {
            return false;
        }

        for ( uint32_t j = 0; j < 3; j++ )
        {
            HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
        }

        cudaDeviceSynchronize();
    }

    return true;
}

Makefile:

IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda

all:
    nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
    $(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
    @echo DONE TEST_CUDA BUILD

clean:
    rm -f *.o test_cuda.out

I am using CUDA 5.5 with the latest driver (319.32) on Ubuntu Linux (64-bit).

My questions are: Is there a bug in the code that could cause this behaviour? Why do I not see the crash when using only one card? Is the crash simply much less likely then? (It seems that using more cards makes the crash appear sooner than using fewer.) Bonus: has anyone else seen the crash with this code?
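
(For completeness: the kernel launches and the cudaDeviceSynchronize at the end of TestCUDA are not error-checked in this reproducer. A minimal checking wrapper along the lines below, where CUDA_CHECK is just an illustrative name and not part of the CUDA API, could be dropped in to confirm whether the runtime reports an error before the segfault.)

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative helper, not present in the code above: print a message and
// abort if a CUDA runtime call (or a preceding kernel launch) reported an error.
#define CUDA_CHECK( call )                                              \
    do                                                                  \
    {                                                                   \
        cudaError_t err = (call);                                       \
        if ( err != cudaSuccess )                                       \
        {                                                               \
            printf( "CUDA error '%s' at %s:%d\n",                       \
                    cudaGetErrorString( err ), __FILE__, __LINE__ );    \
            exit( 1 );                                                  \
        }                                                               \
    } while ( 0 )

// Usage after each launch in TestCUDA:
//     HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
//     CUDA_CHECK( cudaGetLastError() );
//     ...
//     CUDA_CHECK( cudaDeviceSynchronize() );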


1 Answer


I was able to find a workaround for the crash:

The idea is to have exactly one thread per GPU in use; the other worker threads submit their work to these per-GPU threads, which then run it on the GPU. With this change I was able to run the test application on the machine for a whole day without any problems (it completed more than 6000 iterations).
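
A minimal sketch of what this looks like follows (the GpuWorker and WorkItem names and the queueing details are illustrative, not the exact code I ended up with). Each GPU in use gets one dedicated dispatcher thread that calls cudaSetDevice once and then drains a work queue; the 16 worker threads only enqueue requests instead of calling into the CUDA runtime themselves:

#include <stdint.h>
#include <queue>
#include <pthread.h>
#include <cuda_runtime.h>

// Forward declaration of the existing test entry point from cuda_test.cu.
bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );

// One work request, i.e. the arguments of a single TestCUDA call.
struct WorkItem
{
    uint32_t        u32Iterations;
    uint32_t        u32Count;
    const uint32_t* pdu32Data;
    uint16_t*       pdu16Res;
};

// One dispatcher per GPU: the only thread that ever touches the CUDA
// runtime for that device.
class GpuWorker
{
    int32_t               m_i32DeviceId;
    pthread_t             m_thread;
    pthread_mutex_t       m_mutex;
    pthread_cond_t        m_cond;
    std::queue<WorkItem>  m_queue;
    bool                  m_bStop;

    static void* ThreadMain( void* pArg )
    {
        GpuWorker* pSelf = (GpuWorker*)pArg;

        // Bind the device once, in this thread only.
        cudaSetDevice( pSelf->m_i32DeviceId );

        for ( ;; )
        {
            pthread_mutex_lock( &pSelf->m_mutex );

            while ( pSelf->m_queue.empty() && !pSelf->m_bStop )
            {
                pthread_cond_wait( &pSelf->m_cond, &pSelf->m_mutex );
            }

            if ( pSelf->m_queue.empty() )
            {
                pthread_mutex_unlock( &pSelf->m_mutex );
                return NULL;
            }

            WorkItem item = pSelf->m_queue.front();
            pSelf->m_queue.pop();
            pthread_mutex_unlock( &pSelf->m_mutex );

            // All kernel launches and cudaDeviceSynchronize happen here.
            TestCUDA( pSelf->m_i32DeviceId, item.u32Iterations, item.u32Count, item.pdu32Data, item.pdu16Res );
        }
    }

public:

    explicit GpuWorker( int32_t i32DeviceId ) : m_i32DeviceId( i32DeviceId ), m_bStop( false )
    {
        pthread_mutex_init( &m_mutex, NULL );
        pthread_cond_init( &m_cond, NULL );
        pthread_create( &m_thread, NULL, &ThreadMain, this );
    }

    ~GpuWorker()
    {
        pthread_mutex_lock( &m_mutex );
        m_bStop = true;
        pthread_cond_signal( &m_cond );
        pthread_mutex_unlock( &m_mutex );
        pthread_join( m_thread, NULL );
        pthread_cond_destroy( &m_cond );
        pthread_mutex_destroy( &m_mutex );
    }

    // Called by the 16 worker threads instead of calling TestCUDA directly.
    void Submit( const WorkItem& item )
    {
        pthread_mutex_lock( &m_mutex );
        m_queue.push( item );
        pthread_cond_signal( &m_cond );
        pthread_mutex_unlock( &m_mutex );
    }
};

In the real workaround the submitting threads also wait for the result (for example via a second condition variable or a completion flag in the work item), so that a TestCUDA failure can still stop the main loop.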

answered 2013-07-02T12:59:55.560