cuda - Simple multiplication on cuda GPU giving error

Question

I am doing the following on the GPU

float decPart = valAtIndex - (int)valAtIndex;
int docID = decPart * numDocs;

where valAtIndex is of type float and numDocs is also of type float. In my case, decPart was 0.2 and numDocs was 10. However, when I print docID, it is printed as 1 (it should be 2). Can somebody please tell me where I am making a mistake?

Below is the full method if it helps

// Kernel: one thread per element of scSortedCounts (guarded by
// index < lengthStreamCompacted). Each thread splits its value into an
// integer part (actualCount) and a fractional part (decPart), scales the
// fractional part by numDocs to get docID, and writes one entry of finalNc
// at position maxCountEx[docID] + actualCount.
//
// The parameters val, docIndex, acV, ptwrite, diff and posIndex are not
// referenced anywhere in the body.
__global__ 
void finalNc(float* scSortedCounts, int* pos, int* maxCountEx, float numDocs, 
             int lengthStreamCompacted, int* finalNc, int actualLengthPos, 
             float* val, int* docIndex, int* acV ,int* ptwrite,int* diff,
             int* posIndex)
{ 
    // Flat 1D global thread index.
    int index = blockDim.x * blockIdx.x + threadIdx.x; 
    if(index < lengthStreamCompacted){ 
        float valAtIndex = scSortedCounts[index]; 
        // Fractional part: the (int) cast truncates toward zero.
        float decPart = valAtIndex - (int)valAtIndex; 
        // The float product is converted to int by truncation, not rounding:
        // a product that lands just below an integer (e.g. 1.9999999f)
        // yields the lower integer.
        int docID = decPart * numDocs; 
        int actualCount = (int)valAtIndex; 
        // No range check is performed on docID or placeToWrite here.
        int placeToWrite = maxCountEx[docID] + actualCount; 
        if( index == (lengthStreamCompacted -1 )){ 
            // Last element: there is no pos[index + 1], so actualLengthPos
            // is used in its place.
            finalNc[placeToWrite] = actualLengthPos - pos[index]; 
        }else{ 
            // NOTE(review): this branch ADDS the two positions while the
            // last-element branch SUBTRACTS -- confirm that '+' is intended.
            finalNc[placeToWrite] = pos[index + 1] + pos[index]; 
        } 
    } 
}

score 3 · Accepted Answer

For the case you have mentioned, it appears that the result you are seeing is correct, and the source of your confusion is down to IEEE single precision representation of the intermediate results in your calculations (and perhaps rounding in printing or displaying of those intermediate results).

For the example you provide, a value of 0.2f is not exactly representable as a binary32 value. The two possible values are either

3E4CCCCC (1.99999988079071044921875E-1)

or

3E4CCCCD (2.0000000298023223876953125E-1)

If the first value was the actual value of decPart, then the intermediate calculation you mention should produce 1 (which is what you observed): the product 0.199999988 × 10 = 1.99999988 is truncated to 1 when converted to int. If it was the second value, the result would be 2. This is absolutely expected behaviour.

To illustrate the effect of IEEE rounding modes and help put your mind at ease that there is no error here, have a look at the following example code, which performs the calculation you are asking about with one of three possible float to integer conversions - plain truncation, IEEE 754 round towards minus infinity, and IEEE 754 round towards plus infinity. I have templated the kernel and run it for each case on a set of random float values between 1 and 10. You can compile and run it for yourself and verify that the plain truncation (as in your code) really does work as intended, as well as the behaviour of the IEEE compliant "round up" and "round down" conversions on the output.

#include <thrust/copy.h>
#include <thrust/device_free.h>
#include <thrust/device_malloc.h>
#include <thrust/device_ptr.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <ctime>

// For each of the first N elements of `inputs`, converts the value to an
// integer using the conversion selected by the `version` template argument,
// subtracts that integer from the value, multiplies the remainder by numDocs
// and stores the product, converted to int, in `outputs` at the same index.
//
//   version == 2 : __float2int_ru (round towards plus infinity)
//   version == 1 : __float2int_rd (round towards minus infinity)
//   otherwise    : plain int() conversion (truncation)
//
// Uses 1D indexing; threads whose index is >= N do nothing.
template<int version>
__global__
void kernel(const float *inputs, int *outputs, int numDocs, int N)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= N)
        return;

    const float x = inputs[tid];

    // `version` is a compile-time constant, so only one branch is ever taken
    // for a given instantiation.
    int whole;
    if (version == 2)
        whole = __float2int_ru(x);
    else if (version == 1)
        whole = __float2int_rd(x);
    else
        whole = int(x);

    const float remainder = x - whole;
    // The float product is converted to int on assignment (truncation).
    const int scaled = remainder * numDocs;
    outputs[tid] = scaled;
}

// Returns a pseudo-random float in [1, 10], drawn from rand().
inline float frand(){
    const float unit = (float)rand() / (float)RAND_MAX;  // in [0, 1]
    return 1.0f + 9.0f * unit;
}

// Driver: generates N random floats on the host, copies them to the device,
// runs the three instantiations of kernel<> (truncate, round-down, round-up)
// over the same input, copies the 3*N results back and prints one line per
// input value: the value followed by the three results.
//
// Returns 0 on success, 1 if a kernel launch or execution error is reported.
int main(void)
{
    const size_t N = 100;
    const int numdocs = 10;

    // The kernel takes its element count as int; convert once, explicitly.
    const int n = static_cast<int>(N);

    // Launch configuration: enough 128-thread blocks to cover all N elements
    // (ceil-division), rather than assuming N fits in a single block.
    const int threads = 128;
    const int blocks = (n + threads - 1) / threads;

    srand(static_cast<unsigned int>(time(NULL)));

    thrust::host_vector<float> values(N);
    thrust::host_vector<int> outputs(3*N);
    std::generate(values.begin(), values.end(), frand);

    thrust::device_ptr<float> in = thrust::device_malloc<float>(N);
    float * _in = thrust::raw_pointer_cast(in);
    thrust::copy(values.begin(), values.end(), in);

    // One output buffer holding three consecutive N-element result segments,
    // one per kernel version.
    thrust::device_ptr<int> out = thrust::device_malloc<int>(3*N);
    int * _out = thrust::raw_pointer_cast(out);

    kernel<0><<<blocks, threads>>>(_in, _out, numdocs, n);
    kernel<1><<<blocks, threads>>>(_in, _out+N, numdocs, n);
    kernel<2><<<blocks, threads>>>(_in, _out+(2*N), numdocs, n);

    // Kernel launches do not return a status: launch errors are picked up
    // with cudaGetLastError(), execution errors at the synchronize.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    if (status == cudaSuccess) {
        thrust::copy(out, out+3*N, outputs.begin());

        for(size_t i=0; i<N; i++)
            printf("%.10f %d %d %d\n", 
                    values[i], outputs[i], outputs[N+i], outputs[2*N+i]);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // device_malloc'd storage is not owned by any container; release it
    // explicitly on both the success and the error path.
    thrust::device_free(out);
    thrust::device_free(in);

    return (status == cudaSuccess) ? 0 : 1;
}

cuda - Simple multiplication on cuda GPU giving error

1 回答 1

Related

Reference