前段时间我写了下面的代码做类似的事情:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
/* Aborts with a diagnostic when a CUDA runtime call reports an error. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Parallel linear search for x in y: one thread per element.
 *
 * The thread whose element equals x stores its global index in g_found[0];
 * every other thread writes nothing.
 *
 * Preconditions:
 *  - 1-D launch whose total thread count equals the number of elements in y
 *    (there is no length parameter, so no bounds check is possible here).
 *  - g_found[0] must be zero before the launch, so that 0 can be read back
 *    as "not found". A match at index 0 is therefore indistinguishable from
 *    "not found"; main() rules that case out on the host before launching.
 *  - y should hold unique values, otherwise several threads store different
 *    indices to g_found[0] and an arbitrary one of them survives.
 *
 * The original form, g_found[i * (1 - pos)] = i * pos, made non-matching
 * thread 0 store 0 into g_found[0] concurrently with the matching thread's
 * store, so the result depended on which write landed last.
 */
__global__ void fast_finder(unsigned int *g_found, float x, float *y)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (x == y[i]) {
        g_found[0] = i;
    }
}

/*
 * Demo driver: fills y with N descending values, searches for x on the GPU
 * and prints the zero-based index at which it was found.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int N = 65536;
    const int nThreads = 1024;
    const int nBlocks = N / nThreads;   /* N is an exact multiple of nThreads */
    const float x = 5.0f;

    unsigned int h_found = 0;
    unsigned int *d_found = NULL;
    float *d_y = NULL;

    float *h_y = (float *)malloc(N * sizeof(float));
    if (h_y == NULL) {
        fprintf(stderr, "host allocation of %d floats failed\n", N);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; ++i) h_y[i] = (float)(N - i - 1);

    /* Index 0 cannot be reported by the kernel (0 means "not found"),
       so the first element is tested on the host. */
    if (x != h_y[0]) {
        CUDA_CHECK(cudaSetDevice(0));
        CUDA_CHECK(cudaMalloc((void **)&d_found, N * sizeof(unsigned int)));
        CUDA_CHECK(cudaMalloc((void **)&d_y, N * sizeof(float)));
        CUDA_CHECK(cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice));

        /* The kernel only writes on a match, so the result buffer must start at zero. */
        CUDA_CHECK(cudaMemset(d_found, 0, N * sizeof(unsigned int)));

        fast_finder<<<nBlocks, nThreads>>>(d_found, x, d_y);
        CUDA_CHECK(cudaGetLastError());        /* launch-configuration errors */
        CUDA_CHECK(cudaDeviceSynchronize());   /* errors raised while the kernel ran */

        CUDA_CHECK(cudaMemcpy(&h_found, d_found, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        if (h_found) printf("%g found on %u. position!\n", x, h_found);
        else printf("%g not found!\n", x);

        CUDA_CHECK(cudaFree(d_y));
        CUDA_CHECK(cudaFree(d_found));
    } else {
        printf("%g found on the first position!\n", x);
    }

    free(h_y);
    getchar();   /* keep the console window open until a key is pressed */
    return EXIT_SUCCESS;
}
这里每个线程检查 y 中由其全局线程索引给出的元素是否等于 x。如果相等,该线程把自己的索引写入 g_found 数组的第一个位置;否则把 0 写入 g_found 中由其索引给出的位置。对于长度为 16、第 11 个位置的值为 5 的 y,输出如下:
g_found = { 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
在这种情况下,y 不需要排序,但必须只包含互不相同的值。这段代码(设备部分)可以很容易地改成查找 x 应当被插入的位置的索引,如下所示:
/*
 * Insertion-index variant of fast_finder: one thread per element of y.
 *
 * Thread i tests whether x lies in the closed interval [y[i], y[i+1]]. On a
 * hit it stores i + 1 in g_found[0]; otherwise it stores 0 in g_found[i].
 * The interval test is only meaningful when y is in ascending order.
 *
 * Launch as a 1-D grid whose total thread count equals the number of
 * elements in y. The last thread has no right-hand neighbour, so it never
 * reads y[i+1] and never reports a hit.
 *
 * Fixes relative to the previous form:
 *  - the two comparisons are joined with && (with || the test was true for
 *    almost every thread);
 *  - y[i+1] is no longer read past the end of the array by the last thread.
 *
 * NOTE(review): g_found[0] is still written by non-matching thread 0 (value
 * 0) and by the matching thread (value i + 1); no ordering is enforced
 * between those two stores. Likewise, an x equal to an interior element
 * satisfies two neighbouring threads, which store different values.
 * Removing these hazards needs the caller to zero g_found[0] before launch.
 */
__global__ void fast_finder(unsigned int *g_found, float x, float *y)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int total = gridDim.x * blockDim.x;
    /* Short-circuit keeps the last thread from touching y[i + 1]. */
    const bool hasNext = (i + 1u) < total;
    const unsigned int pos = (unsigned int)(hasNext && x >= y[i] && x <= y[i + 1u]);
    g_found[i * (1u - pos)] = (i + 1u) * pos;
}
这个版本的输出与上面的版本类似。当 g_found 的位置 0 为 0 时,表示数组 y 中不存在值 x。y 的第一个元素是否等于 x 由主机代码在调用内核之前检查。把这一部分改成你想要的条件也不成问题。
如您所见,在这样的解决方案中,所有线程一起工作,并且在找到 x 时不需要终止任何线程的执行。还可以采用分组搜索,即让每个线程在 y 的一小部分中搜索,从而允许 y 更大。