此代码将返回具有最高浮点性能的设备
select_device_with_most_flops(find_devices());
而下面的代码将返回内存最多的设备:
select_device_with_most_memory(find_devices());
首先,find_devices() 返回一个包含系统中所有 OpenCL 设备的向量(vector)。select_device_with_most_memory() 的实现简单明了,它使用 getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() 查询每个设备的全局内存大小。
浮点性能由以下等式给出:
FLOPs/s = cores/CU * CUs * IPC * 时钟频率
select_device_with_most_flops() 更困难,因为 OpenCL 只能通过 getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() 提供计算单元 (CU) 的数量。对于 CPU,这个数量就是线程数;而对于 GPU,则必须再乘以每个 CU 的流处理器 / CUDA 核心数,该数值因 Nvidia、AMD 和 Intel 及其不同的微架构而异,通常在 4 到 128 之间。幸运的是,供应商信息可以通过 getInfo<CL_DEVICE_VENDOR>() 获得。因此,根据供应商和 CU 的数量,可以推算出每个 CU 的核心数。
下一部分是 FP32 IPC,即每时钟周期指令数。对于大多数 GPU,该值为 2;而对于较新的 CPU,该值为 32,请参阅 https://en.wikipedia.org/wiki/FLOPS?oldformat=true#FLOPs_per_cycle_for_various_processors
在 OpenCL 中无法直接查询 IPC,所以 CPU 的 32 只是一个猜测。可以借助设备名称和查找表得到更准确的值。如果设备是 GPU,则 getInfo<CL_DEVICE_TYPE>()==CL_DEVICE_TYPE_GPU 的结果为 true。
最后一部分是时钟频率。OpenCL 通过 getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() 以 MHz 为单位提供基础时钟频率。设备实际运行时可以加速(boost)到更高的频率,因此这同样只是一个近似值。
所有这些一起给出了对浮点性能的估计。完整代码如下所示:
using uint = unsigned int; // shorthand for the unsigned counters and indices used below
// Returns a copy of s without leading and trailing blank characters.
// Blank means space, '\t', '\n', '\v', '\f', '\r' and also the null character '\0'.
// A string consisting only of blank characters (or an empty string) yields "".
string trim(const string s) {
	const auto is_blank = [](const char ch) {
		return ch==' '||ch=='\t'||ch=='\n'||ch=='\v'||ch=='\f'||ch=='\r'||ch=='\0';
	};
	string::size_type first = 0; // index of the first character that is kept
	string::size_type last = s.size(); // one past the last character that is kept
	while(first<last && is_blank(s[first])) first++;
	while(last>first && is_blank(s[last-1])) last--;
	return s.substr(first, last-first);
}
// Tells whether match occurs anywhere inside s (an empty match is always found).
bool contains(const string s, const string match) {
	const string::size_type position = s.find(match);
	return position!=string::npos;
}
// Collects the OpenCL devices of all platforms (drivers) into one list.
// Reports an error through print_error() when no platform or no device was found,
// then passes the list to print_device_list() and returns it.
vector<Device> find_devices() {
	vector<Platform> platforms; // all platforms (drivers)
	Platform::get(&platforms);
	vector<Device> devices; // all devices of all platforms
	vector<Device> devices_available; // devices of the platform currently being queried
	for(uint i=0; i<(uint)platforms.size(); i++) {
		devices_available.clear(); // start empty, so a platform without devices cannot re-add the previous platform's devices
		platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices_available); // CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU
		devices.insert(devices.end(), devices_available.begin(), devices_available.end());
	}
	// A platform can be installed without exposing a single device, so the device list is checked as well:
	// callers index into the returned list and must not receive an empty one silently.
	if(platforms.empty()||devices.empty()) print_error("There are no OpenCL devices available. Make sure that the OpenCL 1.2 Runtime for your device is installed. For GPUs it comes by default with the graphics driver, for CPUs it has to be installed separately.");
	print_device_list(devices);
	return devices;
}
// Returns the device with the highest estimated FP32 performance.
// Estimate: TFLOPs/s = 1E-6 * (cores per CU) * CUs * IPC * clock frequency in MHz.
// Cores per CU are guessed from the vendor string; a device whose vendor is not recognized as Nvidia, AMD or Intel
// gets 0 cores and therefore an estimate of 0. If no device scores above 0, the first device is returned.
// For an empty list an error is reported through print_error().
Device select_device_with_most_flops(const vector<Device> devices) { // return device with best floating-point performance
	if(devices.empty()) { // nothing to choose from; indexing devices[0] below would be undefined behaviour
		print_error("There are no OpenCL devices available to select from.");
		return Device(); // only reached if print_error() returns
	}
	float best_value = 0.0f; // highest estimate found so far
	uint best_i = 0u; // index of fastest device
	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with highest (estimated) floating point performance
		const Device& d = devices[i]; // reference: no need to copy the device handle
		const string device_vendor = trim(d.getInfo<CL_DEVICE_VENDOR>()); // is either Nvidia, AMD or Intel
		const uint device_compute_units = (uint)d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
		const bool device_is_gpu = (d.getInfo<CL_DEVICE_TYPE>()&CL_DEVICE_TYPE_GPU)!=0; // the device type is a bitfield, so test the GPU bit instead of comparing for equality
		const uint device_ipc = device_is_gpu?2u:32u; // IPC (instructions per cycle) is 2 for GPUs and 32 for most modern CPUs
		const bool is_nvidia = contains(device_vendor, "NVIDIA")||contains(device_vendor, "vidia");
		const bool is_amd = contains(device_vendor, "AMD")||contains(device_vendor, "ADVANCED")||contains(device_vendor, "dvanced");
		const bool is_intel = contains(device_vendor, "INTEL")||contains(device_vendor, "ntel");
		uint cores_per_cu = 0u; // stays 0 for unrecognized vendors
		if(is_nvidia) cores_per_cu += device_compute_units<=30u?128u:64u; // Nvidia GPUs usually have 128 cores/CU, except Volta/Turing (>30 CUs) which have 64 cores/CU
		if(is_amd) cores_per_cu += device_is_gpu?64u:1u; // AMD GCN GPUs usually have 64 cores/CU, AMD CPUs have 1 core/CU
		if(is_intel) cores_per_cu += device_is_gpu?8u:1u; // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs have 1 core/CU
		const uint device_cores = device_compute_units*cores_per_cu;
		const uint device_clock_frequency = (uint)d.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
		const float device_tflops = 1E-6f*(float)device_cores*(float)device_ipc*(float)device_clock_frequency; // estimated device floating point performance in TeraFLOPs/s
		if(device_tflops>best_value) { // strict comparison: on a tie the earlier device is kept
			best_value = device_tflops;
			best_i = i;
		}
	}
	return devices[best_i];
}
// Returns the device with the largest global memory capacity (CL_DEVICE_GLOBAL_MEM_SIZE).
// On a tie the earlier device in the list is kept.
// For an empty list an error is reported through print_error().
Device select_device_with_most_memory(const vector<Device> devices) { // return device with largest memory capacity
	if(devices.empty()) { // nothing to choose from; indexing devices[0] below would be undefined behaviour
		print_error("There are no OpenCL devices available to select from.");
		return Device(); // only reached if print_error() returns
	}
	float best_value = 0.0f; // largest capacity found so far, in GB
	uint best_i = 0u; // index of the device with the most memory
	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with the largest memory capacity
		const Device& d = devices[i]; // reference: no need to copy the device handle
		const float device_memory = 1E-3f*(float)(d.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/1048576ull); // bytes -> whole MiB (integer division) -> approximately GB
		if(device_memory>best_value) {
			best_value = device_memory;
			best_i = i;
		}
	}
	return devices[best_i];
}
// Returns the device at position id of the list.
// An id outside [0, devices.size()) is reported through print_error(), the same error reporter find_devices() uses.
// NOTE(review): print_error() is assumed to terminate the program - confirm. The fallback below only matters if it returns.
Device select_device_with_id(const vector<Device> devices, const int id) { // return device
	if(id>=0&&id<(int)devices.size()) return devices[id];
	print_error("Your selected device ID ("+to_string(id)+") is wrong.");
	return devices.empty() ? Device() : devices[0]; // fallback for the compiler; never index an empty list
}