According to this article, sum reduction with the CUB library should be one of the fastest ways to perform a parallel reduction. As you can see in the code fragment below, the execution time is measured excluding the first call to cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, in, out, N, cub::Sum());
I assume that this first call is connected with memory preparation, and that when we reduce the same data several times it isn't necessary to repeat it every time. But when I have many different arrays with the same number of elements and the same data type, do I have to make that call every time? If the answer is yes, it means that using the CUB library becomes pointless.
// Scratch space used by cub::DeviceReduce. The byte count starts at 0 and the
// pointer at NULL so that the first Reduce call below acts purely as a size
// query and never hands an indeterminate size to cudaMalloc.
size_t temp_storage_bytes = 0;
// The buffer is opaque to the caller, so it is held as void* (the type CUB and
// cudaMalloc take) rather than as a typed pointer.
void* temp_storage = NULL;

// Size query: per the CUB documentation, when the temp-storage pointer is NULL
// the call only writes the required number of bytes into temp_storage_bytes
// and performs no reduction.
// NOTE(review): newer CUB releases appear to require an extra `init` argument
// for Reduce (cub::DeviceReduce::Sum is the sum-specific entry point) — confirm
// against the CUB version in use.
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, in, out, N, cub::Sum());

// Allocate the scratch buffer once, outside the timed region.
// NOTE(review): the required size presumably depends only on the argument
// types and N, so the same buffer should be reusable for other arrays of the
// same type and length — verify against the CUB documentation.
cudaMalloc(&temp_storage, temp_storage_bytes);
cudaDeviceSynchronize();
// cudaCheckError is defined elsewhere; presumably it reports any pending CUDA
// error from the calls above — confirm.
cudaCheckError();

// Timed region: REPEAT real reductions, bracketed by the start/stop events.
cudaEventRecord(start);
for (int i = 0; i < REPEAT; i++) {
    // With a non-NULL temp_storage this call performs the actual reduction.
    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, in, out, N, cub::Sum());
}
cudaEventRecord(stop);
// Block the host until all queued work, including the stop event, has finished.
cudaDeviceSynchronize();