I multiplay each row from pB to each row from pA and put max value to pC. The problem is: in internal loop the only last row of receptors taken as "max value". As result the right column is totally wrong.
void TestCalcDotMax_2x5x3()
{
const size_t m = 2; // nReceptors
const size_t k = 5; // nSources
const size_t n = 3; // nChemicals
float pA[m * k] = { 1, 2, 3, 4, 5
, 2, 4, 6, 8, 2};
float pB[k * n] = { 9, 8, 7, 6, 5
, 4, 3, 2, 1, 9
, 8, 7, 6, 5, 4 };
float expected[k * n] = { 18, 32, 42, 48, 25
, 8, 12, 12, 8, 45
,16, 28, 36, 40, 20 };
float pC[k * n] = { 18, 32, 42, 48, 10
, 8, 12, 12, 8, 18
,16, 28, 36, 40, 8 };
int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing", 0, rst );
}
// pDevB and pDevC nave the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if( i < m )
{
for( size_t j = 0; j < k; j++ )
{
const float value = pDevA[ i * k + j ] * pDevB[j];
if( value > pDevC[j] )
{
pDevC[j] = value;
}
}
}
}
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC, pfnMsg fnMsg )
{
int nbrCtas = m;
int threadsPerCta = 64;
if( nbrCtas >= 32 )
{
nbrCtas = 32;
threadsPerCta = 64;
}
float* pDevA = nullptr;
float* pDevB = nullptr;
float* pDevC = nullptr;
cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);
for( size_t index = 0; index < n * k; index += k )
{
KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
}
code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
code = ::cudaFree( pDevA );
code = ::cudaFree( pDevB );
code = ::cudaFree( pDevC );
return 0;
}