I am currently porting a CUDA code that finds the shortest paths from every node to all other nodes in an (undirected) graph. So basically, the CUDA code builds the graph from a text file and then constructs the adjacency arrays h_v and h_e.
For example
A B
A C
B C
gives (with A, B, C mapped to indices 0, 1, 2):
h_v[0] = 0, h_e[0]=1
h_v[1] = 0, h_e[1]=2
h_v[2] = 1, h_e[2]=2
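For reference, this is what the two arrays look like in plain C for that example (a minimal sketch; my real code fills them while reading the edge list from the text file):

/* edge i connects node h_v[i] to node h_e[i] */
int h_v[3] = {0, 0, 1};   /* A, A, B */
int h_e[3] = {1, 2, 2};   /* B, C, C */
int n_count = 3;          /* number of nodes */
int e_count = 3;          /* number of edges */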
It then calls a kernel that computes the shortest paths from each node using BFS.
The CUDA host code is as follows:
int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
    int *d_v, *d_e;
    cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
    cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count));
    cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));
    int *d_d, *d_dist;
    cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
    cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));
    int *h_d;
    h_d=(int *)malloc(sizeof(int)*n_count);
    bool *d_continue;
    cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));
    for(int s=0; s<n_count; s++){ //BIG FOR LOOP
        ////// code to initialize h_d[i]
        for(int i=0; i<n_count; i++)
            h_d[i]=-1;
        h_d[s]=0; //for marking the root
        cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
        //////////////////////////////
        ///////////////////////////////
        int threads_per_block=e_count;
        int blocks=1;
        if(e_count>MAX_THREADS_PER_BLOCK){
            blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK);
            threads_per_block = MAX_THREADS_PER_BLOCK;
        }
        dim3 grid(blocks);
        dim3 threads(threads_per_block);
        /////////////////////////////////
        bool h_continue;
        int h_dist=0;
        cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
        do{
            h_continue=false;
            cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
            cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
            checkCUDAError("Kernel invocation");
            cudaThreadSynchronize();
            h_dist++;
            cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice));//for what?
            cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
        }while(h_continue);
        ///////////////////
        //then code to read back h_d from device
    }
Here is the CUDA kernel:
__global__ void cc_bfs_kernel(int *d_v, int *d_e, int *d_d,
                              bool *d_continue, int *d_dist, int e_count){
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if(tid<e_count){
        /* for each edge (u, w) */
        int u=d_v[tid];
        int w=d_e[tid];
        if(d_d[u]==*d_dist){ //u is on the frontier of the current level
            if(d_d[w]==-1){ //w has not been visited yet
                *d_continue=true; //at least one node was added, so run another level
                d_d[w]=*d_dist+1; //w is one level further from the root
            }
        }
    }
}
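If I read the kernel correctly, each launch expands one BFS level: a thread is assigned to every edge (u, w), and when u sits on the current frontier (d_d[u] == *d_dist) and w is still unvisited, w gets distance *d_dist+1 and d_continue is set so the host launches another level. For the tiny example above with root A (s=0), the distance array evolves like this:

*d_dist = 0:  d_d = {0, -1, -1}  ->  edges (0,1) and (0,2) fire, d_d = {0, 1, 1}, *d_continue = true
*d_dist = 1:  d_d = {0, 1, 1}    ->  edge (1,2) finds d_d[2] != -1, nothing changes, *d_continue stays false

so the do/while loop stops after the second launch and h_d, read back from d_d, holds the distance of every node from the root. This is the behaviour I am trying to reproduce in OpenCL.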
Here is my attempt at porting it to OpenCL. I am only an amateur with OpenCL, so I am doing my best to port the original code line by line :(
The OpenCL host code:
cl_mem d_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*n_count, NULL, NULL);
cl_mem d_dist = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(int), NULL, NULL);
int *h_d;
h_d = (int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(bool), NULL, NULL);
float *h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);
cl_mem d_v = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL, NULL);
cl_mem d_e = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);
size_t global_size = e_count;
for(int s=0; s<n_count; s++){ //BIG LOOP
    //initialize h_d[i]
    for(int i=0; i<n_count; i++)
        h_d[i]=-1;
    h_d[s]=0;
    //copy h_d to d_d
    err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
                               n_count * sizeof(int), h_d, 0, NULL, NULL);
    bool h_continue;
    int h_dist=0;
    int mark = 0;
    int *h_id;
    h_id = (int *)malloc(sizeof(int)*e_count);
    cl_mem id = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                               sizeof(int)*e_count, NULL, NULL);
    do{
        h_continue=false;
        err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
                                   sizeof(bool), &h_continue, 0, NULL, NULL);
        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
        err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
        err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
        err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
        err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);
        /////EXECUTE
        cl_event sync1;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                     &global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)
        err = clWaitForEvents(1, &sync1);
        clReleaseEvent(sync1);
        ///////////////////
        err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
                                  sizeof(int)*e_count, h_id, 0, NULL, NULL);
        printf("e_count = %d error : %d\n", e_count, err); //check error?
        for(int j = 0; j < e_count; j++)
        {
            printf("%d ", h_id[j]);
        }
        h_dist++;
        mark++; //for debug
        err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
                                   sizeof(int), &h_dist, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
                                  sizeof(bool), &h_continue, 0, NULL, NULL);
    }while(h_continue);
    err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
                              n_count*sizeof(int), h_d, 0, NULL, NULL);
And the OpenCL kernel:
__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int *d_d,
                            __global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{
    int tid = get_global_id(0)-get_global_offset(0);
    //barrier(CLK_GLOBAL_MEM_FENCE);
    for (int i = 0; i < e_count; i++)
    {
        id[i]=i;
    }
    if(tid<e_count){
        id[tid]= tid;
        /* for each edge (u, w) */
        int u=d_v[tid];
        int w=d_e[tid];
        if(d_d[u]==*d_dist){ //u is on the frontier of the current level
            if(d_d[w]==-1)
            { //w has not been visited yet
                *d_continue=true; //at least one node was added, so run another level
                d_d[w]=*d_dist+1; //w is one level further from the root
            }
        }
    }
}
The code does not give correct results, so I debugged it by printing some values (tid inside the kernel, and a mark counter to check how many times the code goes through the while loop). Sadly, tid prints garbage values and the loop only runs once. Can you point out what I am missing here?
I also have another question: how can I do something like cudaThreadSynchronize()? In this version of the OpenCL code I attach a command event to clEnqueueNDRangeKernel and wait on it, but apparently that does not seem to work :(
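From what I have read, the closest thing would be to call clFinish on the command queue right after the launch, roughly like this sketch (queue is my command queue):

err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                             &global_size, NULL, 0, NULL, NULL);
clFinish(queue); //block until every command enqueued so far has completed

Is that the right way, or should the clWaitForEvents call on the kernel's event already give me the same guarantee?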
Thanks a lot.