使用 OpenCL 时,我似乎无法以超过 7 MB/秒的速度将数据从 Radeon 7970 传输到 i5 台式机的主内存中。
#include <Windows.h>
#include <CL/cl.h>

#include <iostream>
#include <string>
int main(int argc, char ** argv)
{
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
const char *source =
"__kernel void copytest(__global short* dst) {\n"
" __local short buff[1024];\n"
" for (int i = 0; i < 1024; i++) {\n"
" for (int j = 0; j < 1024; j++)\n"
" buff[j] = j;\n"
" (void)async_work_group_copy(&dst[i*1024], buff, 1024, 0);\n"
" }\n"
"}\n";
cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
clBuildProgram( program, 1, &device, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel( program, "copytest", NULL);
cl_mem buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 1024 * 1024 * 2, NULL, NULL);
const size_t global_work_size = 1;
clSetKernelArg(kernel, 0, sizeof(buf), (void*)&buf);
LARGE_INTEGER pcFreq = {}, pcStart = {}, pcEnd = {};
QueryPerformanceFrequency(&pcFreq);
QueryPerformanceCounter(&pcStart);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
clFinish(queue);
QueryPerformanceCounter(&pcEnd);
std::cout << 2.0 * pcFreq.QuadPart / (pcEnd.QuadPart-pcStart.QuadPart) << "MB/sec";
}
如您所见,该内核只在一个工作项(work-item)上运行。我曾尝试把 async_work_group_copy() 替换为一个分摊到多个(64 个)工作项上的循环,但这并没有帮助。
有什么方法可以让从 Radeon 取出数据的速度超过 7 MB/秒?我希望能达到数百 MB/秒。换用 NVIDIA 显卡会更快吗?