我想使用两个 GPU 来执行一个内核，然后使用 cufftXt 执行一个 FFT。数据大小可能有几 GB。我对在两个 GPU 上为内核分配内存的理解是：应该把主机数组分成两半，前半部分拷贝到 GPU0，后半部分拷贝到 GPU1。下面的示例演示了这种做法。
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
// Error-checking wrapper for CUDA runtime calls; reports file/line on failure.
// do { ... } while (0) makes the macro expand to a single statement, so it is
// safe inside unbraced if/else bodies (the original bare { } block was not).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a readable CUDA error message and (optionally) terminates the process.
//   code  - return value of a CUDA runtime API call
//   file  - call-site source file (pass __FILE__)
//   line  - call-site line number (pass __LINE__)
//   abort - when true (the default), exit with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Element-wise cube of a padded real array. The layout matches cuFFT's
// in-place R2C convention: each row of N real samples is followed by 2
// padding floats, so rows are N+2 floats wide and padding slots are zeroed.
//   data      - input slice on this device, real_size floats (padding included)
//   data3     - output slice, real_size floats; padding positions written as 0
//   N         - logical row length (real samples per row)
//   real_size - total floats in this device's slice
// Launch: 1D grid, any block size; the bounds check handles the tail.
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < real_size) {
        // Position within the padded row: indices [N, N+2) are FFT padding.
        int x = i % (N + 2);
        if (x < N) {
            float v = data[i];
            data3[i] = v * v * v;   // exact cube; avoids pow() cost and rounding
        } else {
            data3[i] = 0.0f;        // keep padding slots zeroed
        }
    }
    // The original trailing __syncthreads() was removed: the kernel uses no
    // shared memory and no thread reads another thread's output, so the
    // barrier was pure overhead.
}
// Demo driver: splits a padded real array across two GPUs, cubes each half
// with the Cube kernel, and prints both results.
int main (int argc, char **argv) {
    int N = 8;   // transform size per dimension
    // cuFFT in-place R2C layout: N*(N/2+1) complex values = twice that many
    // reals, i.e. each row of N reals is padded out to N+2 floats.
    // size_t throughout: the data may be several GB, which overflows int.
    size_t cplx_size = (size_t)N * (N/2 + 1);
    size_t real_size = 2 * cplx_size;
    size_t mem_size  = sizeof(cufftReal) * real_size;
    size_t half_real_size = real_size / 2;
    size_t half_mem_size  = mem_size  / 2;

    // Fail early with a clear message instead of erroring on the first
    // cudaSetDevice(1) call.
    int deviceCount = 0;
    gpuErrchk(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < 2) {
        fprintf(stderr, "This program needs at least 2 GPUs (found %d)\n", deviceCount);
        return 1;
    }

    // Host buffers (the original also allocated an h_data3 it never used).
    cufftReal *h_data   = (cufftReal*)malloc(mem_size);
    cufftReal *h0_data  = (cufftReal*)malloc(half_mem_size);
    cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data  = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);
    if (!h_data || !h0_data || !h0_data3 || !h1_data || !h1_data3) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Fill real slots with 2, leave the 2-float-per-row FFT padding at 0.
    for (size_t i = 0; i < real_size; i++) {
        size_t x = i % (size_t)(N + 2);
        h_data[i] = (x < (size_t)N) ? 2.0f : 0.0f;
    }
    // Split the host array: first half -> GPU 0, second half -> GPU 1.
    for (size_t i = 0; i < half_real_size; i++) {
        h0_data[i] = h_data[i];
        h1_data[i] = h_data[i + half_real_size];
    }

    cufftReal *d0_data, *d0_data3, *d1_data, *d1_data3;
    gpuErrchk(cudaSetDevice(0));   // original left cudaSetDevice unchecked
    gpuErrchk(cudaMalloc((void**)&d0_data,  half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMalloc((void**)&d1_data,  half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));
    cout << "device memory allocated" << endl;

    // Fixed block size + ceil-div grid: the kernel's bounds check covers the
    // tail, so this works for any N. The original used N threads per block and
    // exact division, which silently under-launches when the block size does
    // not divide half_real_size.
    int threadsPerBlock = 256;
    int numBlocks = (int)((half_real_size + threadsPerBlock - 1) / threadsPerBlock);

    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));
    cout << "mem copied to devices" << endl;

    gpuErrchk(cudaSetDevice(0));
    Cube<<<numBlocks, threadsPerBlock>>>(d0_data, d0_data3, N, (int)half_real_size);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaSetDevice(1));
    Cube<<<numBlocks, threadsPerBlock>>>(d1_data, d1_data3, N, (int)half_real_size);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));

    cout << endl;
    for (size_t i = 0; i < half_real_size; i++) {
        cout << h0_data3[i] << " ";
    }
    cout << endl;
    for (size_t i = 0; i < half_real_size; i++) {
        cout << h1_data3[i] << " ";
    }
    cout << endl;

    // Clean up: free each device's buffers with that device current, then
    // release the host buffers (the original leaked every host allocation).
    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaFree(d0_data));
    gpuErrchk(cudaFree(d0_data3));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaFree(d1_data));
    gpuErrchk(cudaFree(d1_data3));
    free(h_data);
    free(h0_data);
    free(h0_data3);
    free(h1_data);
    free(h1_data3);
    return 0;
}
但是，我看不出这种方法如何与 cufftXt 兼容。看起来我应该使用辅助函数 cufftXtMemcpy，让库自动把数据拆分到各个设备上。但如果这样做，上面展示的多 GPU 内核方法就无法使用了——除非我为 cufftXt 和内核分别分配两份设备内存。有没有办法在不重复分配设备内存的情况下同时运行 cufftXt 和自定义内核？