visual-studio-2008 - 在内联ptx中加载函数参数

Question

我有以下使用内联汇编的函数，它在 32 位 Visual Studio 2008 的调试模式下可以正常工作：

// Question's original attempt: loads words from the two arrays via inline PTX.
// The snippet is incomplete (printf arguments and the trailing operations are
// elided by the author).
__device__ void add(int* pa, int* pb)
{
  // Scratch PTX registers, declared in their own asm statements.
  asm(".reg .u32   s<3>;"::);
  asm(".reg .u32   r<14>;"::);

  // NOTE(review): &pa / &pb are the addresses of the parameter variables
  // themselves, not the pointer values they hold, and they are bound with the
  // 32-bit "r" constraint. The code then reads through those addresses with
  // ld.global to obtain the array addresses. Presumably this only works when
  // the parameters happen to be spilled to addressable memory (e.g. debug/-G
  // builds) — confirm.
  asm("ld.global.b32    s0, [%0];"::"r"(&pa));      //load addresses of pa, pb
  printf(...);
  asm("ld.global.b32    s1, [%0];"::"r"(&pb));
  printf(...);
  // Load the 32-bit word at byte offset 8 from each loaded address.
  asm("ld.global.b32    r1, [s0+8];"::);
  printf(...);  
  asm("ld.global.b32    r2, [s1+8];"::);
  printf(...);

  ...// perform some operations
}

pa 和 pb 在设备上全局分配，例如

// Three-element device-global arrays that are passed to add().
__device__ int pa[3] = {0, 0x927c0000, 0x20000011};  
__device__ int pb[3] = {0, 0xbb900000, 0x2000000b};

但是，此代码在发布模式下会失败，出错的行是 asm("ld.global.b32 r1, [s0+8];"::);。如何在发布模式下使用内联 PTX 正确加载函数参数？

PS：如果在发布模式下加上 -G 标志（生成 GPU 调试信息）进行构建，代码就能正确运行。谢谢。

score 1 · Accepted Answer

希望这段代码会有所帮助。我仍在猜测您到底要做什么，但是我从您的代码开始，并决定把 pa 和 pb 数组中的一些值相加，然后将结果分别存回 pa[0] 和 pb[0]。

此代码是为 64 位机器编写的，但将其转换为 32 位指针应该不难。我已经用注释标记了需要为 32 位指针更改的行。希望这将回答您有关如何使用作为设备内存指针的函数参数的问题：

#include <stdio.h>

// Device-global test data. mykernel() passes both arrays to add() and prints
// element 0 of each before and after the call.
__device__ int pa[3] = {0, 0x927c0000, 0x20000011};
__device__ int pb[3] = {0, 0xbb900000, 0x2000000b};

// Adds elements 1 and 2 of each array and stores the sum in element 0:
//   mpa[0] = mpa[1] + mpa[2];   mpb[0] = mpb[1] + mpb[2];
//
// Both pointers must refer to global memory (ld.global / st.global are used)
// and to at least three 32-bit elements.
//
// The whole PTX sequence is a single asm statement on purpose:
//  - the enclosing { } gives the scratch registers their own PTX scope, so
//    the .reg declarations are not repeated in one scope when add() is
//    inlined at more than one call site;
//  - register declarations and their uses cannot be separated or reordered
//    by the optimizer, which may happen with independent asm statements;
//  - `volatile` and the "memory" clobber tell the compiler that the statement
//    reads and writes memory, so it is neither removed nor moved across
//    surrounding accesses to the arrays.
//
// Written for 64-bit pointers. For 32-bit pointers change .u64 to .u32 and
// the "l" constraints to "r".
__device__ void add(int* mpa, int* mpb)
{
  asm volatile(
      "{\n\t"
      ".reg .u64   s<2>;\n\t"
      ".reg .u32   r<6>;\n\t"
      // The arguments are generic addresses; convert them to the global
      // state space before using them with ld.global / st.global.
      "cvta.to.global.u64    s0, %0;\n\t"
      "cvta.to.global.u64    s1, %1;\n\t"
      "ld.global.u32    r0, [s0+4];\n\t"   // r0 = mpa[1]
      "ld.global.u32    r1, [s1+4];\n\t"   // r1 = mpb[1]
      "ld.global.u32    r2, [s0+8];\n\t"   // r2 = mpa[2]
      "ld.global.u32    r3, [s1+8];\n\t"   // r3 = mpb[2]
      "add.u32    r4, r0, r2;\n\t"
      "add.u32    r5, r1, r3;\n\t"
      "st.global.u32    [s0], r4;\n\t"     // mpa[0] = r4
      "st.global.u32    [s1], r5;\n\t"     // mpb[0] = r5
      "}"
      :
      : "l"(mpa), "l"(mpb)
      : "memory");
}

// Prints pa[0] and pb[0], calls add(pa, pb), then prints both again so the
// values stored by add() can be compared with the initial ones.
// Uses device-side printf; main() launches it with a single thread.
__global__ void mykernel(){
  printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
  add(pa, pb);
  printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
}

// Launches mykernel with one block of one thread and waits for it to finish.
// Returns 0 on success; prints the CUDA error and returns 1 if the launch or
// the kernel execution fails.
int  main() {
  mykernel<<<1,1>>>();

  // A kernel launch returns no status itself; launch/configuration errors
  // are reported through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Faults raised while the kernel runs surface at the next synchronizing
  // call. The sync also flushes the device-side printf buffer.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

当我运行此代码时，我得到：

$ ./t128
pa[0] = 0, pb[0] = 0
pa[0] = b27c0011, pb[0] = db90000b
$

我相信这是正确的输出。

我编译它：

nvcc -O3 -arch=sm_20 -o t128 t128.cu

visual-studio-2008 - 在内联ptx中加载函数参数

1 回答 1

Related

Reference