If d_in is a pointer to some GPU (device) memory, how do I initialize the d_out pointer?

You use cudaMalloc, similar to how you would initialize the d_in pointer.
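
For example, a minimal sketch (using the same names as the complete example below, error checking omitted):

cub::KeyValuePair<int, int> *d_out;                       // holds one index/value pair for the result
cudaMalloc(&d_out, sizeof(cub::KeyValuePair<int, int>));  // device allocation, just like d_in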
If the ArgMin() operation finishes on the device (GPU), how can I copy the result back to the CPU?

You use cudaMemcpy, similar to how you would copy the d_in data from host to device, except now you are copying the d_out data from device to host. The KeyValuePair is a C++ object that has key and value members.
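
For instance, assuming d_out holds the single result pair on the device (as in the complete example below):

cub::KeyValuePair<int, int> h_result;                                    // host-side copy of the result
cudaMemcpy(&h_result, d_out, sizeof(h_result), cudaMemcpyDeviceToHost);  // device -> host
int min_index = h_result.key;    // index of the minimum element
int min_value = h_result.value;  // the minimum value itself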
Here is a complete example:
$ cat t37.cu
#include <cub/cub.cuh>
#include <iostream>
int main(){
  // Declare, allocate, and initialize device-accessible pointers for input and output
  int num_items = 32;
  int *d_in;
  cub::KeyValuePair<int, int> *d_out;
  int *h_in = new int[num_items];
  cub::KeyValuePair<int, int> *h_out = new cub::KeyValuePair<int, int>;
  cudaMalloc(&d_in, num_items*sizeof(d_in[0]));
  cudaMalloc(&d_out, sizeof(cub::KeyValuePair<int, int>));
  for (int i = 0; i < num_items; i++) h_in[i] = 4;
  h_in[12] = 2; // so we expect our return tuple to be 12,2
  cudaMemcpy(d_in, h_in, num_items*sizeof(d_in[0]), cudaMemcpyHostToDevice);
  // Determine temporary device storage requirements
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  // Allocate temporary storage
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Run argmin-reduction
  cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  // Copy the single result pair back to the host and print it
  cudaMemcpy(h_out, d_out, sizeof(cub::KeyValuePair<int, int>), cudaMemcpyDeviceToHost);
  std::cout << "minimum value: " << h_out[0].value << std::endl;
  std::cout << "index of min: " << h_out[0].key << std::endl;
}
$ nvcc -o t37 t37.cu -arch=sm_35 -std=c++14 -Wno-deprecated-gpu-targets
$ ./t37
minimum value: 2
index of min: 12
$
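
For completeness, a real application would also release the allocations once the result has been consumed; a minimal sketch (matching the names used in the example above):

cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_temp_storage);
delete[] h_in;
delete h_out;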