卸载(offload)调用把函数的数据(参数)从主机传输到英特尔 MIC(至强融核协处理器 3120 系列)时,是否存在一段固定的(预定义的)时间开销?
具体来说,我通过卸载调用 ("#pragma offload target(mic)") 让一个函数在 MIC 上执行。该函数有 15 个参数(指针和变量),我已经确认这些参数能够正确地传递到 MIC 上。为了只测量参数传递所需的时间,我简化了代码,函数体中只保留了一个简单的“printf()”调用。我使用“sys/time.h”头文件中的“gettimeofday()”来测量时间,如下面的代码所示:
主机的一些硬件信息: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz / CentOS release 6.8 / PCI Express Revision 2.0
主程序
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>
/*
 * Globals and prototype marked with the Intel target(mic) attribute.
 * The three variables are listed in the inout() clause of the offload pragma
 * further down in this file.
 * NOTE(review): ForSolution and sufficientSol are not read or written
 * anywhere else in this excerpt; their purpose cannot be determined from here.
 */
__attribute__ (( target (mic))) unsigned long long ForSolution = 0;
__attribute__ (( target (mic))) unsigned long long sufficientSol = 1;
/* Seconds measured inside function(); the host subtracts it from the total offload time. */
__attribute__ (( target (mic))) float timer = 0.0;
/* Prototype of the offloaded routine defined in function.c (15 parameters). */
__attribute__ (( target (mic))) void function(float *grid, float *displ, unsigned long long *li, unsigned long long *repet, float *solution, unsigned long long dim, unsigned long long numOfa, unsigned long long numLoops, unsigned long long numBlock, unsigned long long thread, unsigned long long blockGrid, unsigned long long station, unsigned long long bytesSol, unsigned long long totalSol, volatile unsigned long long *prog);
/*
 * Host-side driver fragment: declares the buffers and scalar parameters,
 * offloads a single call of function() to the coprocessor and prints how much
 * of the wall-clock time was spent outside the offloaded function body.
 */
float *grid, *displ, *solution;
/*
 * Fix: repet is passed to function() as `unsigned long long *` and carries a
 * length() modifier in the offload clause, so it must be a pointer. The
 * original declared it as a scalar (`*li,repet`).
 */
unsigned long long *li, *repet;
volatile unsigned long long *prog;
unsigned long long dim = 10, grid_a = 3, numLoops = 2, numBlock = 0;
unsigned long long thread = 220, blockGrid = 0, station = 12;
unsigned long long station_at = 8, bytesSol, totalSol;
bytesSol = dim*sizeof(float);
/* Largest multiple of bytesSol that does not exceed 1 GiB (in bytes). */
totalSol = ((1024 * 1024 * 1024) / bytesSol) * bytesSol;
/******** Some memcpy() functions here for the pointers*********/
/* NOTE(review): start and end are not declared in this excerpt; they must be
 * `struct timeval` objects for the gettimeofday() calls below. */
gettimeofday(&start, NULL);
/*
 * Fix: length() counts ELEMENTS of the pointed-to type, not bytes. The
 * original multiplied the in() lengths by sizeof(...), requesting 4x (float)
 * or 8x (unsigned long long) more elements than intended. The out() clause
 * already used an element count and is unchanged.
 * NOTE(review): assumes each buffer holds exactly the element count given
 * here; confirm against the allocations, which are not shown.
 * The out(solution) clause covers totalSol bytes (just under 1 GiB), so the
 * runtime has to provide and copy back a buffer of that size on every offload;
 * this is presumably a large part of the measured latency - verify with
 * OFFLOAD_REPORT.
 */
#pragma offload target(mic) \
    in(grid:length(dim * grid_a)) \
    in(displ:length(station * station_at)) \
    in(li:length(dim)) \
    in(repet:length(dim)) \
    out(solution:length(totalSol/sizeof(float))) \
    in(dim,grid_a,numLoops,numBlock,thread,blockGrid,station,bytesSol,totalSol) \
    in(prog:length(1)) \
    inout(ForSolution,sufficientSol,timer)
{
    function(grid, displ, li, repet, solution, dim, grid_a, numLoops, numBlock, thread, blockGrid, station, bytesSol, totalSol, prog);
}
gettimeofday(&end, NULL);
/*
 * Total wall-clock time of the offload region minus the time measured inside
 * function(). The remainder covers everything the offload did besides running
 * the function body, not only the input transfer.
 */
printf("Time to tranfer data on Intel Xeon Phi: %f sec\n", (((end.tv_sec - start.tv_sec) * 1000000.0 + (end.tv_usec - start.tv_usec)) / 1000000.0) - timer);
printf("Time for calculations: %f sec\n", timer);
function.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>
#include <omp.h>
/*
 * Offloaded routine, reduced to a timing probe.
 *
 * All fifteen parameters are accepted but unused: the body only prints a
 * greeting and stores, in the global `timer`, the wall-clock seconds that
 * elapsed around that printf() call.
 *
 * NOTE(review): `timer` is defined in the main program; a matching
 * declaration has to be visible in this translation unit - confirm.
 */
void function(float *grid, float *displ, unsigned long long *li,
              unsigned long long *repet, float *solution,
              unsigned long long dim, unsigned long long numOfa,
              unsigned long long numLoops, unsigned long long numBlock,
              unsigned long long thread, unsigned long long blockGrid,
              unsigned long long station, unsigned long long bytesSol,
              unsigned long long totalSol, volatile unsigned long long *prog)
{
    struct timeval t_begin;
    struct timeval t_finish;
    double elapsed_usec;

    gettimeofday(&t_begin, NULL);
    printf("Hello World!!!\n");
    gettimeofday(&t_finish, NULL);

    /* Whole seconds scaled to microseconds, plus the microsecond remainder. */
    elapsed_usec = (t_finish.tv_sec - t_begin.tv_sec) * 1000000.0
                 + (t_finish.tv_usec - t_begin.tv_usec);

    /* Report the interval in seconds. */
    timer = elapsed_usec / 1000000.0;
}
终端结果:
Time to tranfer data on Intel Xeon Phi: 3.512706 sec
Time for calculations: 0.000002 sec
Hello World!!!
代码需要 3.5 秒才能完成这次“offload target”卸载调用。以上结果正常吗?有什么方法可以减少卸载调用的显著时间延迟?