为了回答您的问题,我正在考虑四个不同的内核,其中每个线程for
在迭代中执行一个循环n_loop
。四个内核实现了四种不同的可能情况:
- 迭代次数
n_loop
在编译时是已知的;
- 迭代次数
n_loop
在编译时是已知的,总和是有条件的;
- 迭代次数
n_loop
在运行时是已知的;
- 迭代次数
n_loop
在运行时已知,并执行手动循环展开。
完整代码如下:
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 512
#define epsilon 0.5
#define n_loop 8
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/****************************************************/
/* KERNEL #1: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
/****************************************************/
__global__ void testKernel1(float* input, float* output, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < n_loop; i++) {
accum = accum + input[n_loop*tid+i];
}
output[tid] = accum;
}
}
/****************************************************/
/* KERNEL #2: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
/****************************************************/
__global__ void testKernel2(float* input, float* output, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];
output[tid] = accum;
}
}
/************************************************/
/* KERNEL #3: NUMBER OF LOOPS KNOWN AT RUN-TIME */
/************************************************/
__global__ void testKernel3(float* input, float* output, int N_loop, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];
output[tid] = accum;
}
}
/*******************************************************************/
/* KERNEL #4: NUMBER OF LOOPS KNOWN AT RUN-TIME - LOOP UNROLL OF 4 */
/*******************************************************************/
__global__ void testKernel4(float* input, float* output, int N_loop, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum1 = 0.f;
float accum2 = 0.f;
float accum3 = 0.f;
float accum4 = 0.f;
for (int i = 0; i < N_loop/4; i++) {
accum1 = accum1 + input[N_loop*tid+i];
accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
}
output[tid] = accum1 + accum2 + accum3 + accum4;
}
}
int main() {
const int N = 512*512*32;
float* input = (float*) malloc(n_loop*N*sizeof(float));
float* output = (float*) malloc(N*sizeof(float));
float* output2 = (float*) malloc(N*sizeof(float));
float* outputif = (float*) malloc(N*sizeof(float));
float* d_input; gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
float* d_output; gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));
srand(time(NULL));
for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX;
gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));
// --- Host-side computations
for (int k = 0; k < N; k++) {
float accum1 = 0.f;
float accum2 = 0.f;
for (int i = 0; i < n_loop; i++) {
accum1 = accum1 + input[n_loop*k+i];
if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
}
output[k] = accum1;
outputif[k] = accum2;
}
// --- Device-side computation - kernel1
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel1: results match!\n");
// --- Device-side computation - kernel2
cudaEventRecord(start, 0);
testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (outputif[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
return 1;
}
printf("kernel2: results match!\n");
// --- Device-side computation - kernel3
cudaEventRecord(start, 0);
testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel3 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel3: results match!\n");
// --- Device-side computation - kernel4
cudaEventRecord(start, 0);
testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel4 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (abs(output[i] - output2[i]) > 0.0001) {
printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
return 1;
}
printf("kernel4: results match!\n");
return 0;
}
现在让我们分析四种不同情况的反汇编代码(使用 CUDA 6.0 编译)。我正在考虑编译费米架构。
内核 1
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
IMUL R2, R0, c[0x0][0x8];
S2R R3, SR_TID.X;
IADD R0, R2, R3;
ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT;
@P0 BRA.U 0xd8;
@!P0 IADD R2, R3, R2;
@!P0 ISCADD R2, R2, c[0x0][0x20], 0x5;
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
@!P0 LD R9, [R2];
@!P0 LD R8, [R2+0x4];
@!P0 LD R7, [R2+0x8];
@!P0 LD R6, [R2+0xc];
@!P0 LD R5, [R2+0x10];
@!P0 LD R4, [R2+0x14];
@!P0 LD R3, [R2+0x18];
@!P0 LD R2, [R2+0x1c];
@!P0 F2F.F32.F32 R9, R9;
@!P0 FADD R8, R9, R8;
@!P0 FADD R7, R8, R7;
@!P0 FADD R6, R7, R6;
@!P0 FADD R5, R6, R5;
@!P0 FADD R4, R5, R4;
@!P0 FADD R3, R4, R3;
@!P0 FADD R2, R3, R2;
@!P0 ST [R0], R2;
EXIT;
在这种情况下,编译器将完全展开循环。您将看到8
不同的加载 ( LD
) 指令和7
不同的添加 ( FADD
) 指令。
内核 2
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
IMUL R0, R0, c[0x0][0x8];
S2R R2, SR_TID.X;
IADD R3, R0, R2;
ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT;
@P0 EXIT;
IADD R0, R2, R0;
ISCADD R9, R0, c[0x0][0x20], 0x5;
LD R0, [R9];
LD R2, [R9+0x4];
LD R4, [R9+0x8];
LD R5, [R9+0xc];
LD R6, [R9+0x10];
LD R7, [R9+0x14];
LD R8, [R9+0x18];
LD R9, [R9+0x1c];
FSETP.LT.AND P0, PT, R0, 0.5, PT;
FSETP.LT.AND P1, PT, R4, 0.5, PT;
F2F.F32.F32 R0, R0;
SEL R0, R0, RZ, P0;
FSETP.LT.AND P0, PT, R2, 0.5, PT;
@P0 FADD R0, R0, R2;
FSETP.LT.AND P0, PT, R5, 0.5, PT;
@P1 FADD R0, R0, R4;
@P0 FADD R0, R0, R5;
FSETP.LT.AND P1, PT, R8, 0.5, PT;
FSETP.LT.AND P0, PT, R6, 0.5, PT;
FADD R2, R0, R6;
SEL R2, R2, R0, P0;
FSETP.LT.AND P0, PT, R7, 0.5, PT;
ISCADD R0, R3, c[0x0][0x24], 0x2;
@P0 FADD R2, R2, R7;
FSETP.LT.AND P0, PT, R9, 0.5, PT;
@P1 FADD R2, R2, R8;
@P0 FADD R2, R2, R9;
ST [R0], R2;
EXIT;
同样在这种情况下,编译器正在完全展开循环。您将再次看到8
不同的加载 ( LD
) 指令和7
不同的添加 ( FADD
) 指令。
内核 3
c[0x0][0x30] = N
c[0x1][0x100] = BLOCKSIZE
c[0x0][0x8] = blockDim.x
c[0x0][0x30] = N_loop
c[0x0][0x20] = input
/*0000*/ MOV R1, c[0x1][0x100]; R1 = BLOCKSIZE = 256
/*0008*/ S2R R0, SR_CTAID.X; R0 = blockIdx.x
/*0010*/ S2R R2, SR_TID.X; R2 = threadIdx.x
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; R0 = tid = blockIDx.x * blockDim.x + threadIdx.x
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT; P0 = (tid >= N) then EXIT
/*0028*/ @P0 EXIT;
/*0030*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT; P0 = (0 < N_loop)
/*0038*/ @P0 BRA 0x60;
/*0040*/ MOV R4, RZ;
/*0048*/ BRA 0x170;
/*0050*/ NOP;
/*0058*/ NOP;
/*0060*/ MOV R2, c[0x0][0x30]; R2 = N_loop
/*0068*/ IMUL R3, R0, c[0x0][0x30]; R3 = tid * N_loop
/*0070*/ MOV32I R6, 0x4; R6 = sizeof(float) = 4
/*0078*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; P0 = (R2 >= 3)
/*0080*/ IMAD R2.CC, R3, R6, c[0x0][0x20]; R2 = R3 * R6 + input = tid * N_loop * 4 + input
/*0088*/ MOV R4, RZ; R4 = 0
/*0090*/ MOV R5, RZ; R5 = 0
/*0098*/ IMAD.HI.X R3, R3, R6, c[0x0][0x24];
/*00a0*/ @!P0 BRA 0x128;
/*00a8*/ MOV R6, c[0x0][0x30]; R6 = N_loop
/*00b0*/ IADD R10, R6, -0x3; R10 = N_loop - 3
/*00b8*/ NOP;
/*00c0*/ IADD R5, R5, 0x4; R5 = R5 + 4 = 4
/*00c8*/ LD.E R6, [R2]; R6 = input[tid * N_loop]
/*00d0*/ ISETP.LT.AND P0, PT, R5, R10, PT; P0 = (4 < (N_loop - 3))
/*00d8*/ LD.E R7, [R2+0x4]; R7 = input[tid * N_loop + 1]
/*00e0*/ LD.E R8, [R2+0x8]; R8 = input[tid * N_loop + 2]
/*00e8*/ LD.E R9, [R2+0xc]; R9 = input[tid * N_loop + 3]
/*00f0*/ IADD R2.CC, R2, 0x10; R2 = R2 + 16 = R2 + 4 * sizeof(float)
/*00f8*/ IADD.X R3, R3, RZ;
/*0100*/ FADD R6, R4, R6; R6 = 0 + input[tid * N_loop]
/*0108*/ FADD R4, R6, R7; R4 = input[tid * N_loop] + input[tid * N_loop + 1]
/*0110*/ FADD R8, R4, R8; R8 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2]
/*0118*/ FADD R4, R8, R9; R4 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2] + input[tid * N_loop + 3]
/*0120*/ @P0 BRA 0xc0; ...
/*0128*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0130*/ @!P0 BRA 0x170;
/*0138*/ IADD R5, R5, 0x1;
/*0140*/ LD.E R6, [R2];
/*0148*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0150*/ IADD R2.CC, R2, 0x4;
/*0158*/ IADD.X R3, R3, RZ;
/*0160*/ FADD R4, R4, R6;
/*0168*/ @P0 BRA 0x138;
/*0170*/ MOV32I R3, 0x4;
/*0178*/ IMAD R2.CC, R0, R3, c[0x0][0x28];
/*0180*/ IMAD.HI.X R3, R0, R3, c[0x0][0x2c];
/*0188*/ ST.E [R2], R4;
/*0190*/ EXIT;
可以看出,编译器自动执行循环展开4
,因为我看到4
加载操作(LD
)和3
不同的添加(FADD
)
内核 4
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_CTAID.X;
/*0010*/ S2R R2, SR_TID.X;
/*0018*/ IMAD R13, R0, c[0x0][0x8], R2;
/*0020*/ ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT;
/*0028*/ @P0 EXIT;
/*0030*/ MOV R2, c[0x0][0x30];
/*0038*/ SHR R0, R2, 0x1f;
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*0048*/ IMAD.U32.U32.HI R0, R0, 0x4, R2;
/*0050*/ SHR R0, R0, 0x2;
/*0058*/ @P0 BRA 0x98;
/*0060*/ MOV R18, RZ;
/*0068*/ MOV R19, RZ;
/*0070*/ MOV R10, RZ;
/*0078*/ MOV R11, RZ;
/*0080*/ BRA 0x308;
/*0088*/ NOP;
/*0090*/ NOP;
/*0098*/ MOV R3, c[0x0][0x30];
/*00a0*/ IMUL R4, R13, c[0x0][0x30];
/*00a8*/ MOV32I R5, 0x4;
/*00b0*/ IMUL R2, R3, 0x3;
/*00b8*/ SHL R6, R3, 0x1;
/*00c0*/ IADD R10, R0, R4;
/*00c8*/ SHR R3, R2, 0x1f;
/*00d0*/ IMAD R8.CC, R4, R5, c[0x0][0x20];
/*00d8*/ SHR R7, R6, 0x1f;
/*00e0*/ IMAD.U32.U32.HI R2, R3, 0x4, R2;
/*00e8*/ IMAD.HI.X R9, R4, R5, c[0x0][0x24];
/*00f0*/ IMAD.U32.U32.HI R7, R7, 0x4, R6;
/*00f8*/ IMAD.HI R3, R2, c[0x10][0x0], R4;
/*0100*/ IMAD R6.CC, R10, R5, c[0x0][0x20];
/*0108*/ ISETP.GT.AND P0, PT, R0, 0x1, PT;
/*0110*/ IMAD.HI R14, R7, c[0x10][0x0], R4;
/*0118*/ MOV R18, RZ;
/*0120*/ IMAD.HI.X R7, R10, R5, c[0x0][0x24];
/*0128*/ MOV R19, RZ;
/*0130*/ IMAD R2.CC, R3, R5, c[0x0][0x20];
/*0138*/ MOV R10, RZ;
/*0140*/ IMAD.HI.X R3, R3, R5, c[0x0][0x24];
/*0148*/ MOV R11, RZ;
/*0150*/ IMAD R4.CC, R14, R5, c[0x0][0x20];
/*0158*/ MOV R12, RZ;
/*0160*/ IMAD.HI.X R5, R14, R5, c[0x0][0x24];
/*0168*/ @!P0 BRA 0x260;
/*0170*/ IADD R16, R0, -0x1;
/*0178*/ NOP;
/*0180*/ IADD R12, R12, 0x2;
/*0188*/ LD.E R15, [R8];
/*0190*/ ISETP.LT.AND P0, PT, R12, R16, PT;
/*0198*/ LD.E R20, [R6];
/*01a0*/ FADD R17, R18, R15;
/*01a8*/ LD.E R14, [R4];
/*01b0*/ FADD R19, R19, R20;
/*01b8*/ LD.E R15, [R2];
/*01c0*/ LD.E R18, [R8+0x4];
/*01c8*/ LD.E R20, [R6+0x4];
/*01d0*/ IADD R6.CC, R6, 0x8;
/*01d8*/ NOP;
/*01e0*/ FADD R14, R10, R14;
/*01e8*/ FADD R15, R11, R15;
/*01f0*/ IADD.X R7, R7, RZ;
/*01f8*/ LD.E R10, [R4+0x4];
/*0200*/ IADD R4.CC, R4, 0x8;
/*0208*/ LD.E R11, [R2+0x4];
/*0210*/ IADD.X R5, R5, RZ;
/*0218*/ FADD R18, R17, R18;
/*0220*/ IADD R2.CC, R2, 0x8;
/*0228*/ FADD R19, R19, R20;
/*0230*/ IADD.X R3, R3, RZ;
/*0238*/ IADD R8.CC, R8, 0x8;
/*0240*/ IADD.X R9, R9, RZ;
/*0248*/ FADD R10, R14, R10;
/*0250*/ FADD R11, R15, R11;
/*0258*/ @P0 BRA 0x180;
/*0260*/ ISETP.LT.AND P0, PT, R12, R0, PT;
/*0268*/ @!P0 BRA 0x308;
/*0270*/ IADD R12, R12, 0x1;
/*0278*/ LD.E R17, [R8];
/*0280*/ ISETP.LT.AND P0, PT, R12, R0, PT;
/*0288*/ LD.E R16, [R6];
/*0290*/ IADD R6.CC, R6, 0x4;
/*0298*/ LD.E R15, [R4];
/*02a0*/ IADD.X R7, R7, RZ;
/*02a8*/ LD.E R14, [R2];
/*02b0*/ IADD R4.CC, R4, 0x4;
/*02b8*/ IADD.X R5, R5, RZ;
/*02c0*/ IADD R2.CC, R2, 0x4;
/*02c8*/ IADD.X R3, R3, RZ;
/*02d0*/ IADD R8.CC, R8, 0x4;
/*02d8*/ IADD.X R9, R9, RZ;
/*02e0*/ FADD R18, R18, R17;
/*02e8*/ FADD R19, R19, R16;
/*02f0*/ FADD R10, R10, R15;
/*02f8*/ FADD R11, R11, R14;
/*0300*/ @P0 BRA 0x270;
/*0308*/ FADD R0, R18, R19;
/*0310*/ MOV32I R3, 0x4;
/*0318*/ FADD R0, R0, R10;
/*0320*/ IMAD R2.CC, R13, R3, c[0x0][0x28];
/*0328*/ FADD R0, R0, R11;
/*0330*/ IMAD.HI.X R3, R13, R3, c[0x0][0x2c];
/*0338*/ ST.E [R2], R0;
/*0340*/ EXIT;
在这种情况下,编译器会自动执行 的循环展开4
,这会叠加到 的手动循环展开4
,正如我看到的8
加载操作 ( LD
) 和7
不同的添加 ( FADD
)。
尽管反汇编代码与 Fermi 架构的代码不同,但 Kepler 架构的编译器行为也相似。
由于自动循环展开功能,不同内核之间的性能差异不大:
GT 210 (c.c. 1.2)
Kernel 1 = 111ms
Kernel 2 = 108ms
Kernel 3 = 107ms
Kernel 4 = 110ms
Kepler K20c (c.c. 3.5)
Kernel 1 = 1.8ms
Kernel 2 = 1.8ms
Kernel 3 = 1.8ms
Kernel 4 = 1.8ms
我并没有明确提供 Fermi 架构的结果,但是对于四个考虑的内核来说,时间大致相同。