我目前正在使用 GPU 和在 CPU 上使用 C++ 来试验 OpenCL 代码的性能。我编写了计算总和 z = x + y 的程序,其中 z、x 和 y 是 GPU 和 CPU 的二维数组(矩阵)。在测试了这些程序之后,我发现 CPU 在计算这个总和时比 GPU 效率更高,因为 GPU 和 CPU 之间的 PCI 总线中的数据传输速度很慢。现在我想确定需要多少总和才能使 GPU 的使用比 CPU 更高效。我计划通过将总和 z = x + y 增加到 z = x + y + y + y + y + ... 等等来做到这一点。
仅仅通过增加这个特定问题的总和数量,是否有可能使使用 GPU 比使用 CPU 更有效?
仅供参考:我使用的是 nVIDIA GeForce GT 640 显卡和 i5 Intel 核心 CPU。
任何帮助将不胜感激。
编辑:
下面我在 CPU 上附加了我的代码:
int main(int argc, const char * argv[])
{
//This value determines the size of the nxn (square array)
int n = 1000;
//Allocating the memory for the nxn arrays of floats.
float **x = (float**)malloc(sizeof(float*)*n);
float **y = (float**)malloc(sizeof(float*)*n);
float **z = (float**)malloc(sizeof(float*)*n);
//Initializing the arrays.
for(int i = 0; i<n; i++){
x[i] = (float*)malloc(sizeof(float)*n);
y[i] = (float*)malloc(sizeof(float)*n);
z[i] = (float*)malloc(sizeof(float)*n);
for(int j = 0; j<n; j++){
x[i][j] = i+j;
y[i][j] = i+j;
}
}
for(int i = 0; i<n; i++){
for(int j = 0; j<n; j++){
z[i][j] = x[i][j] + y[i][j];
for(int k = 0; k < 100; k++){
z[i][j] += y[i][j];
}
}
}
return 0;
}
这是使用 OpenCL 的 C++:(用于复制数据并在 GPU 上执行内核)
int n = 1000;
for(int i = 0; i<n; i++)
{
//Writing the data from the host to the device
err = clEnqueueWriteBuffer(queue, d_xx, CL_TRUE, 0, sizeof(float)*n, h_xx[i], 0, NULL, NULL);
if(err != CL_SUCCESS){
std::cout << "Error: Could not write to buffer d_xx" << std::endl;
exit(1);
}
err = clEnqueueWriteBuffer(queue, d_yy, CL_TRUE, 0, sizeof(float)*n, h_yy[i], 0, NULL, NULL);
if(err != CL_SUCCESS){
std::cout << "Error: Could not write to buffer d_yy" << std::endl;
exit(1);
}
//Setting the Kernel Arguments
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_xx);
if(err != CL_SUCCESS){
std::cout << "Error: Could not set kernel argument h_xx." << std::endl;
exit(1);
}
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_yy);
if(err != CL_SUCCESS){
std::cout << "Error: Could not set kernel argument h_yy." << std::endl;
exit(1);
}
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_zz);
if(err != CL_SUCCESS){
std::cout << "Error: Could not set kernel argument h_zz." << std::endl;
}
work_units_per_kernel = n;
//Executing the Kernel
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
if(err != CL_SUCCESS){
std::cout << "Error: Could not execute kernel." << std::endl;
exit(1);
}
//Reading the Data from the Kernel
err = clEnqueueReadBuffer(queue, d_zz, CL_TRUE, 0, n*(sizeof(float)), h_zz[i], 0, NULL, NULL);
if(err != CL_SUCCESS){
std::cout << "Error: Could not read data from kernel." << std::endl;
exit(1);
}
}
最后是在 GPU 上执行的内核代码:
__kernel void arraysum(__global const float *d_aa, __global const float *d_bb, __global float *d_cc)
{
int i = get_global_id(0);
d_cc[i] = d_aa[i] + d_bb[i];
for(int j = 0; j < 100; j++){
d_cc[i] += d_bb[i];
}
}