
I multiply each row of pB by each row of pA and write the maximum value into pC. The problem is that in the inner loop only the last row of receptors is taken as the "max value", so the rightmost column of the result is completely wrong.

void TestCalcDotMax_2x5x3()
{
    const size_t m = 2;  // nReceptors
    const size_t k = 5;  // nSources
    const size_t n = 3;  // nChemicals

float pA[m * k] =   { 1, 2, 3, 4, 5
                    , 2, 4, 6, 8, 2};

float pB[k * n] =   { 9, 8, 7, 6, 5
                    , 4, 3, 2, 1, 9
                    , 8, 7, 6, 5, 4 };

float expected[k * n] = { 18, 32, 42, 48, 25
                        , 8, 12, 12,  8, 45
                        ,16, 28, 36, 40, 20 };

// pC holds the wrong result described above: only the last column differs from expected
float pC[k * n] =   { 18, 32, 42, 48, 10
                    , 8, 12, 12,  8, 18
                    ,16, 28, 36, 40,  8 };

int rst = ::CalcDotMax( pA, pB, m, k, n, pC );

    CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing",  0, rst ); 
}
// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC ) 
{ 
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];

            if( value > pDevC[j] )
            {
                pDevC[j] = value;
            }
        }
    }
}

__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    int nbrCtas = m;
    int threadsPerCta = 64;

    if( nbrCtas >= 32 ) 
    {
         nbrCtas = 32;
         threadsPerCta = 64;
    }
    float* pDevA = nullptr;
    float* pDevB = nullptr;
    float* pDevC = nullptr;

    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );

    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );

    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice); 
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice); 
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice); 

    for( size_t index = 0; index < n * k; index += k )
    {
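        // one launch per chemical: each iteration handles one k-sized slice
        // of B and the matching slice of C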
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );

    return 0;
}

2 Answers


Sorry, I missed at some point that you had edited your code.

The problem you are having is a race condition. In the failing case you are launching 2 blocks. The design of your algorithm is such that each block operates on the same set of output elements (in pDevC). Therefore, since both blocks can execute simultaneously, both blocks can write to the same output elements simultaneously. This is a collision, and there are two ways you can avoid it:

  1. Redesign your algorithm to partition the work differently between blocks. Instead of each block checking all (or the same set of) the output elements against a particular set of inputs, make each block responsible for only a portion of the output elements, checked against all the inputs. This is a common refactoring when converting a sequential/serial algorithm to one that runs in parallel.
  2. Use atomic operations to prevent the collisions. If your algorithm only produces a small number of these collisions, it may be convenient and not very costly to use atomics. But when the algorithm uses atomics for every output element (perhaps multiple times, as in this case), it's probably better for performance to refactor the code as in method 1 above.

What follows is some code where I illustrate the second approach (because it is easier for me to write). There is no atomic function that provides an atomicMax operation on float, so I crafted my own, following the template given in the atomic functions documentation for creating arbitrary atomic operations using atomicCAS. That is what atomicMaxf is.

If you elect to use the first approach (recommended), I would point out that calling the kernel in a loop is probably not necessary for your algorithm. I would craft a new kernel that assigns one thread to each output point, and then computes all the necessary max operations on the various input points, in a loop (or nested loops) in the kernel. Since each thread is writing to one and only one unique output point, there is no possibility for write collisions between threads.
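For illustration, here is a minimal sketch of such a kernel (the name is mine; it assumes pDevB and pDevC point at one k-sized slice, as in your launch loop, and that pDevC arrives pre-initialized): thread j owns output element j and scans all m rows of A, so no two threads ever write the same element.

__global__ void KernelDotMaxPerOutput( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    if( j < k )
    {
        float best = pDevC[j];  // start from the host-provided initial value

        for( size_t i = 0; i < m; i++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];

            if( value > best )
            {
                best = value;
            }
        }
        pDevC[j] = best;  // thread j is the only writer of element j
    }
}

It would be launched with enough threads to cover the k outputs, e.g. <<<(k+63)/64, 64>>>, rather than covering m.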

This code should provide correct results, anyway:

#include <stdio.h>

// atomic max on float built from atomicCAS: keep retrying while this thread's
// val is still greater than the value stored at address (the comparison is done
// in float; NaN inputs are not handled)
__device__ float atomicMaxf(float* address, float val)
{
    int *address_as_int =(int*)address;
    int old = *address_as_int, assumed;
    while (val > __int_as_float(old)) {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val));
        }
    return __int_as_float(old);
}

// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];

            atomicMaxf(pDevC+j, value);
//            if( value > pDevC[j] )
//            {
//                pDevC[j] = value;
//            }
        }
    }
}


__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    int nbrCtas = m;
    int threadsPerCta = 64;

    if( nbrCtas >= 32 )
    {
         nbrCtas = 32;
         threadsPerCta = 64;
    }
    float* pDevA = NULL;
    float* pDevB = NULL;
    float* pDevC = NULL;

    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );

    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );

    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);

    for( size_t index = 0; index < n * k; index += k )
    {
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );

    return 0;
}


void TestCalcDotMax_2x5x3()
{
    const size_t m = 2;  // nReceptors
    const size_t k = 5;  // nSources
    const size_t n = 3;  // nChemicals

  float pA[m * k] =   { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f
                    , 2.0f, 4.0f, 6.0f, 8.0f, 2.0f};

  float pB[k * n] =   { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f
                    , 4.0f, 3.0f, 2.0f, 1.0f, 9.0f
                    , 8.0f, 7.0f, 6.0f, 5.0f, 4.0f };

  float expected[k * n] = { 18.0f, 32.0f, 42.0f, 48.0f, 25.0f
                        , 8.0f, 12.0f, 12.0f,  8.0f, 45.0f
                        ,16.0f, 28.0f, 36.0f, 40.0f, 20.0f };

  float pC[k * n] =   { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                    , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                    , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
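  // note: pC seeds the device-side running max via the cudaMemcpy into pDevC,
  // so it must start at or below the smallest possible product (all zeros works
  // here because every product is non-negative)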

  int rst = ::CalcDotMax( pA, pB, m, k, n, pC );

  printf("passed processing: %d \n",  rst );
  for (int i=0; i<(k*n); i++)
    if (pC[i] != expected[i]) printf("mismatch at %d, should be: %f was: %f\n", i, expected[i], pC[i]);
}

int main(){
  TestCalcDotMax_2x5x3();
  return 0;
}
Answered 2013-05-29T00:50:26.497

Thanks a lot - it works now. Is it possible to keep the index of the iteration [idx] at the moment of the comparison? Like this:

struct ValIndex_t
{
    float  value;
    int    index;
};

__device__ float atomicMaxPare( float* address, float val, int* index, int idx )
{
    int *address_as_int = reinterpret_cast<int*>( address ); // assumes float and int are both 32 bits
    int old = *address_as_int, assumed;

    while( val > ::__int_as_float(old) ) 
    {
        assumed = old;
        old     = ::atomicCAS( address_as_int, assumed, ::__float_as_int(val) );
        *index  = idx; // caution: this store runs even when the CAS fails and is
                       // not atomic with it, so value and index can mismatch
    }
    return ::__int_as_float(old);
}

__global__ void CudaPareDotMax( float* pDevA, const float* pDevB, ValIndex_t* pDevC, const size_t m, const size_t k, const size_t n ) 
{ 
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if( idx < m )
    {
        for( size_t row = 0; row < n; row++ )
        {
            for( size_t col = 0; col < k; col++ )
            {
                const size_t slice = col + row * k;
                const size_t index = slice + k * n * idx;

                pDevA[index] *= pDevB[ col + k * idx ];

                float& prvalue = (pDevC +  slice )->value;
                int&   prindex = (pDevC + slice  )->index;

                ::atomicMaxPare( &prvalue, pDevA[ index ], &prindex, idx );
            }
        }
    }
}

Or do I have to use another atomic function for the exchange? I don't quite understand how to attach the index exactly at the moment the value becomes the max. Thanks again.
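For reference, here is one way to keep the pair consistent (a sketch with my own names packValIdx/atomicMaxValIdx, not from the answer above; it assumes the index fits in 32 bits and does not handle NaN inputs): pack the float bits and the index into one 64-bit word, so a single atomicCAS installs both together.

__device__ unsigned long long packValIdx( float value, int index )
{
    // value in the high 32 bits, index in the low 32 bits
    return ( static_cast<unsigned long long>( ::__float_as_uint( value ) ) << 32 )
         | static_cast<unsigned int>( index );
}

__device__ void atomicMaxValIdx( unsigned long long* address, float val, int idx )
{
    unsigned long long old = *address, assumed;

    // retry while our value is still greater than the stored one; a winning
    // CAS replaces value and index in one indivisible step, so they never mismatch
    while( val > ::__uint_as_float( static_cast<unsigned int>( old >> 32 ) ) )
    {
        assumed = old;
        old     = ::atomicCAS( address, assumed, packValIdx( val, idx ) );
    }
}

The output array would then hold one unsigned long long per element (seeded with, say, packValIdx( 0.0f, -1 )) instead of a ValIndex_t.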

Answered 2013-05-30T21:00:18.273