I used the NVIDIA Visual Profiler to analyze my code. The test kernels are:
//////////////////////////////////////////////////////////////// Group 1
static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_1(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_2(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_3(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
//////////////////////////////////////////////////////////////// Group 2
static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_5(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_6(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_7(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
//////////////////////////////////////////////////////////////// Group 3
static __global__ void gpu_test_divergency_8(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_9(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_10(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_11(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
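For context, a minimal host-side harness to launch these kernels (in the same .cu file) would look something like the sketch below; the buffer size, the kernel chosen, and the omitted error checking are illustrative assumptions, not the exact test code.

#include <cuda_runtime.h>

int main()
{
    const int n = 64;                            // enough elements for both <<< 1, 32 >>> and <<< 1, 64 >>>
    float *a = 0, *b = 0;
    cudaMalloc((void **)&a, n * sizeof(float));  // written by the "if" branch
    cudaMalloc((void **)&b, n * sizeof(float));  // written by the "else" branch

    gpu_test_divergency_0<<<1, 32>>>(a, b);      // one block of 32 threads, i.e. a single warp
    cudaDeviceSynchronize();                     // let the profiler see the kernel run to completion

    cudaFree(a);
    cudaFree(b);
    return 0;
}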
When I launched the test kernels with <<< 1, 32 >>>, I got the following results from the profiler:
gpu_test_divergency_0 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_1 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_2 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_3 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_4 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_5 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_6 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_7 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_8 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_9 : Branch Efficiency = 75% branch = 4 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 75% branch = 4 divergent branch = 1
When I launched the test kernels with <<< 1, 64 >>>, I got the following results from the profiler:
gpu_test_divergency_0 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_1 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_2 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_3 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_4 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_5 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_6 : Branch Efficiency = 100% branch = 4 divergent branch = 0
gpu_test_divergency_7 : Branch Efficiency = 100% branch = 5 divergent branch = 0
gpu_test_divergency_8 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_9 : Branch Efficiency = 85.7% branch = 7 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100% branch = 4 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 83.3% branch = 6 divergent branch = 1
I am working on Linux with a "GeForce GTX 570" (CUDA compute capability 2.0) and NVIDIA Visual Profiler v4.2. According to the documentation:
"branch" - "Number of branches taken by threads executing a kernel. This counter will be incremented by one if at least one thread in a warp takes the branch."
"divergent branch" - "Number of divergent branches within a warp. This counter will be incremented by one if at least one thread in a warp diverges (that is, follows a different execution path) via a data-dependent conditional branch."
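By this definition, a condition like tid % 2 == 0 (hypothetical kernel below, not one of my tests) should diverge in every warp, since even and odd lanes of the same warp take different paths, whereas a condition like tid < 0 or tid >= 0 evaluates the same way for every thread of a warp:

static __global__ void gpu_force_divergence(float *a, float *b)  // hypothetical example, not part of the test set
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid % 2 == 0)   // even and odd lanes of the same warp disagree here,
    {                   // so the warp has to execute both paths
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}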
But I am really confused by these results. Why is the "branch" count different for each test group? And why does only the third test group seem to get the "divergent branch" count right?
@JackOLantern: I compiled in release mode and disassembled the kernels the way you did. The result for "gpu_test_divergency_4" is exactly the same as yours, but the result for "gpu_test_divergency_0" is different:
Function : _Z21gpu_test_divergency_0PfS_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x94001c042c000000*/ S2R R0, SR_CTAid_X;
/*0010*/ /*0x84009c042c000000*/ S2R R2, SR_Tid_X;
/*0018*/ /*0x20009ca320044000*/ IMAD R2, R0, c [0x0] [0x8], R2;
/*0020*/ /*0xfc21dc23188e0000*/ ISETP.LT.AND P0, pt, R2, RZ, pt;
/*0028*/ /*0x0920de0418000000*/ I2F.F32.S32 R3, R2;
/*0030*/ /*0x9020204340004000*/ @!P0 ISCADD R0, R2, c [0x0] [0x24], 0x2;
/*0038*/ /*0x8020804340004000*/ @P0 ISCADD R2, R2, c [0x0] [0x20], 0x2;
/*0040*/ /*0x0000e08590000000*/ @!P0 ST [R0], R3;
/*0048*/ /*0x0020c08590000000*/ @P0 ST [R2], R3;
/*0050*/ /*0x00001de780000000*/ EXIT;
I suppose that, as you said, the conversion instruction (I2F in this case) does not add an extra branch.
But I cannot see how this disassembly relates to the profiler results. I learned from another post (https://devtalk.nvidia.com/default/topic/463316/branch-divergent-branches/) that divergent branches are counted from how the threads (warps) actually run on the SM, so I suppose we cannot deduce the branch divergence of each actual run from the disassembled code alone. Am I right?
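As a concrete illustration (again a hypothetical kernel, not one of my tests): the compiled code for the kernel below is fixed, yet whether a given warp diverges at the if depends entirely on the contents of in at run time, assuming the compiler emits a real branch here instead of predicating both paths the way it did for gpu_test_divergency_0 above.

static __global__ void gpu_data_dependent(const float *in, float *out)  // hypothetical example
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (in[tid] > 0.0f)   // data-dependent condition: a warp stays converged if all of its
    {                     // 32 input values agree in sign, and diverges otherwise
        out[tid] = 1.0f;
    }
    else
    {
        out[tid] = -1.0f;
    }
}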