我公司有两台 GTX 295,所以一台服务器总共有 4 个 GPU,我们有几台服务器。与 GPU 0、2 和 3 相比,我们 GPU 1 特别慢,所以我写了一个小速度测试来帮助找出问题的原因。
//#include <stdio.h>
//#include <stdlib.h>
//#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cutil.h>
__global__ void test_kernel(float *d_data) {
int tid = blockDim.x*blockIdx.x + threadIdx.x;
for (int i=0;i<10000;++i) {
d_data[tid] = float(i*2.2);
d_data[tid] += 3.3;
}
}
int main(int argc, char* argv[])
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
int device = 0; //SELECT GPU HERE
cudaSetDevice(device);
cudaEvent_t start, stop;
unsigned int num_vals = 200000000;
float *h_data = new float[num_vals];
for (int i=0;i<num_vals;++i) {
h_data[i] = float(i);
}
float *d_data = NULL;
float malloc_timer;
cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord( start, 0 );
cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_data, sizeof(float)*num_vals);
cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &malloc_timer, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
float mem_timer;
cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord( start, 0 );
cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &mem_timer, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
float kernel_timer;
cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord( start, 0 );
test_kernel<<<1000,256>>>(d_data);
cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &kernel_timer, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
printf("cudaMalloc took %f ms\n",malloc_timer);
printf("Copy to the GPU took %f ms\n",mem_timer);
printf("Test Kernel took %f ms\n",kernel_timer);
cudaMemcpy(h_data,d_data, sizeof(float)*num_vals,cudaMemcpyDeviceToHost);
delete[] h_data;
return 0;
}
结果是
GPU0 cudaMalloc 耗时 0.908640 ms 复制到 GPU 耗时 296.058777 ms 测试内核耗时 326.721283 ms
GPU1 cudaMalloc 耗时 0.913568 ms 复制到 GPU 耗时663.182251 ms 测试内核耗时 326.710785 ms
GPU2 cudaMalloc 耗时 0.925600 ms 复制到 GPU 耗时 296.915039 ms 测试内核耗时 327.127930 ms
GPU3 cudaMalloc 耗时 0.920416 ms 复制到 GPU 耗时 296.968384 ms 测试内核耗时 327.038696 ms
如您所见,GPU 的 cudaMemcpy 是 GPU1 时间量的两倍。这在我们所有的服务器之间都是一致的,总是 GPU1 很慢。任何想法为什么会这样?所有服务器都运行 Windows XP。