我有一些 NVVM 代码,我正在尝试使用 nvrtc 编译到 PTX(即使用 nvvmCompileProgram、nvvmGetCompiledResult)。
这是 nvvm 代码:
; ModuleID = 'test_warp_reduce'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void @lambda_crit_4197([0 x float]* %_4200_4590, [0 x i64]* %_4201_4591, [0 x float]* %_4202_4592) {
acc_bidx:
%0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%1 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%2 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%3 = mul nsw i32 %2, %1
%4 = add nsw i32 %3, %0
%5 = icmp slt i32 %4, 32
br i1 %5, label %if_then12, label %next
if_then12: ; preds = %acc_bidx
%6 = getelementptr inbounds [0 x float]* %_4202_4592, i64 0, i32 %4
%7 = load float* %6
%8 = tail call i64 @clock()
%9 = tail call float @reduce_step(float %7, i32 1, i32 31)
%10 = tail call float @reduce_step(float %9, i32 2, i32 31)
%11 = tail call float @reduce_step(float %10, i32 4, i32 31)
%12 = tail call float @reduce_step(float %11, i32 8, i32 31)
%13 = tail call float @reduce_step(float %12, i32 16, i32 31)
%14 = tail call i64 @clock()
%15 = getelementptr inbounds [0 x float]* %_4200_4590, i64 0, i32 %4
%16 = getelementptr inbounds [0 x i64]* %_4201_4591, i64 0, i32 %0
%17 = sub nsw i64 %14, %8
store i64 %17, i64* %16
store float %13, float* %15
br label %next
next: ; preds = %acc_bidx, %if_then12
ret void
}
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
define i64 @clock() {
%1 = call i64 asm "mov.u32 $0, %clock;", "=r" ()
ret i64 %1
}
define float @reduce_step(float %a, i32 %b, i32 %c) {
%1 = call float asm
"{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, $1, $2, $3;
@p add.f32 r0, r0, $1;
mov.f32 $0, r0;
}", "=f, f, r, r" (float %a, i32 %b, i32 %c)
ret float %1
}
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x float]*, [0 x i64]*, [0 x float]*)* @lambda_crit_4197, metadata !"kernel", i64 1}
这是生成的ptx代码:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324574
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_52
.address_size 64
// .globl lambda_crit_4197
.visible .entry lambda_crit_4197(
.param .u64 lambda_crit_4197_param_0,
.param .u64 lambda_crit_4197_param_1,
.param .u64 lambda_crit_4197_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<11>;
.reg .s32 %r<15>;
.reg .s64 %rd<13>;
ld.param.u64 %rd1, [lambda_crit_4197_param_0];
ld.param.u64 %rd2, [lambda_crit_4197_param_1];
ld.param.u64 %rd3, [lambda_crit_4197_param_2];
mov.u32 %r1, %tid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mad.lo.s32 %r2, %r3, %r4, %r1;
setp.gt.s32 %p1, %r2, 31;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd4, %rd3;
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f2, [%rd6];
mov.u32 %r5, 1;
mov.u32 %r14, 31;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r5, %r14;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
@p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
// inline asm
mov.u32 %r9, 4;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f3, %r9, %r14;
@p add.f32 r0, r0, %f3;
mov.f32 %f5, r0;
}
// inline asm
mov.u32 %r11, 8;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f5, %r11, %r14;
@p add.f32 r0, r0, %f5;
mov.f32 %f7, r0;
}
// inline asm
mov.u32 %r13, 16;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f7, %r13, %r14;
@p add.f32 r0, r0, %f7;
mov.f32 %f9, r0;
}
// inline asm
cvta.to.global.u64 %rd7, %rd1;
add.s64 %rd8, %rd7, %rd5;
cvta.to.global.u64 %rd9, %rd2;
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 0;
st.global.u64 [%rd11], %rd12;
st.global.f32 [%rd8], %f9;
BB0_2:
ret;
}
// .globl clock
.visible .func (.param .b64 func_retval0) clock(
)
{
.reg .s32 %r<2>;
.reg .s64 %rd<2>;
// inline asm
mov.u32 %r1, %clock;
// inline asm
cvt.u64.u32 %rd1, %r1;
st.param.b64 [func_retval0+0], %rd1;
ret;
}
// .globl reduce_step
.visible .func (.param .b32 func_retval0) reduce_step(
.param .b32 reduce_step_param_0,
.param .b32 reduce_step_param_1,
.param .b32 reduce_step_param_2
)
{
.reg .f32 %f<3>;
.reg .s32 %r<3>;
ld.param.f32 %f2, [reduce_step_param_0];
ld.param.u32 %r1, [reduce_step_param_1];
ld.param.u32 %r2, [reduce_step_param_2];
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r1, %r2;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
st.param.f32 [func_retval0+0], %f1;
ret;
}
似乎 nvvm 编译器只是出于神秘的原因消除了代码。例如,根本没有发出对时钟函数的调用。
我是否使用编译器优化对提供的代码没有影响。
有人告诉我,Cuda 7.5 在 Windows 上有一些类似的问题(未发出程序集)。所以我降级到7.0。但是,问题仍然存在。
任何线索为什么会这样?