cuda - 在 cuda 中预取（通过 C 代码）

Question

我正在研究如何通过 C 代码在 CUDA（Fermi GPU）中进行数据预取。CUDA 参考手册只讨论了 PTX 级别的预取，而没有涉及 C 级别代码的预取。

谁能给我指点一些关于通过 CUDA 代码（.cu 文件）进行预取的文档或资料？任何帮助都将不胜感激。

score 8 · Accepted Answer

根据PTX 手册，这里是预取在 PTX 中的工作方式：

（原文此处为一张图片，推测是 PTX 手册中 prefetch / prefetchu 指令说明的截图；图片未能保留。）

您可以将 PTX 指令嵌入到 CUDA 内核中。这是NVIDIA 文档中的一个小样本：

// Returns x * x * x, computed with inline PTX rather than C arithmetic.
// Operand %0 is bound to y (output, "=r") and %1 to x (input, "r").
// mul.lo.u32 keeps only the low 32 bits of each product, so the result
// wraps for inputs whose cube does not fit in 32 bits.
__device__ int cube (int x)
{
  int y;
  asm("{\n\t"                       // use braces for local scope
      " .reg .u32 t1;\n\t"           // temp reg t1,
      " mul.lo.u32 t1, %1, %1;\n\t" // t1 = x * x
      " mul.lo.u32 %0, t1, %1;\n\t" // y = t1 * x
      "}"
      : "=r"(y) : "r" (x));
  return y;
}

由此，您可以得到如下用 C 编写的预取函数：

// Issues a PTX `prefetch.global.L1` hint for the global-memory address `addr`.
// The function produces no value; it only asks the hardware to bring the
// containing cache line into L1.
//
// NOTE(review): `addr` is a 32-bit value bound with the "r" (32-bit register)
// constraint, so this form only matches builds that use 32-bit device
// addresses. A 64-bit build needs a 64-bit address operand ("l") -- confirm
// against the target before use.
__device__ void prefetch_l1 (unsigned int addr)
{
  // The instruction writes no register, so no output operand is declared
  // (the previous "=r"(addr) output told the compiler that addr was
  // overwritten with a value the asm never produced). `volatile` is required
  // because an asm without outputs is otherwise assumed to be side-effect
  // free and may be deleted or moved by the compiler.
  asm volatile("prefetch.global.L1 [%0];" : : "r"(addr));
}

注意：预取需要 Compute Capability 2.0 或更高版本的 GPU。编译时请相应地传入正确的架构标志，例如 -arch=sm_20。

score 2 · Accepted Answer

根据这个帖子（论坛讨论串），下面是针对不同缓存级别的预取代码：

// Qualifier set shared by the prefetch helpers below: internal linkage,
// device-only, and always inlined.
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS  static __device__ __forceinline__

// Inline-PTX constraint letter used for pointer operands: "l" (64-bit
// register) when building for a 64-bit target (Win64 under MSVC, or any LP64
// platform), otherwise "r" (32-bit register).
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
#define PXL_GLOBAL_PTR   "l"
#else
#define PXL_GLOBAL_PTR   "r"
#endif

// Issues a PTX `prefetch.global.L1` hint for the global-memory location `ptr`.
// Produces no value; `ptr` is passed as the sole input operand using the
// pointer-width constraint selected by PXL_GLOBAL_PTR.
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_l1(const void* const ptr)
{
  // volatile: an asm with no output operands is otherwise treated as having
  // no side effects and may be deleted or moved by the compiler.
  asm volatile("prefetch.global.L1 [%0];" : : PXL_GLOBAL_PTR(ptr));
}

// Issues a PTX `prefetchu.L1` hint for the location `ptr`.
// Produces no value; `ptr` is passed as the sole input operand using the
// pointer-width constraint selected by PXL_GLOBAL_PTR.
// NOTE(review): `prefetchu` takes no state-space qualifier, so `ptr` is
// presumably expected to be a generic address -- confirm against the PTX ISA.
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_uniform(const void* const ptr)
{
  // volatile: an asm with no output operands is otherwise treated as having
  // no side effects and may be deleted or moved by the compiler.
  asm volatile("prefetchu.L1 [%0];" : : PXL_GLOBAL_PTR(ptr));
}

// Issues a PTX `prefetch.global.L2` hint for the global-memory location `ptr`.
// Produces no value; `ptr` is passed as the sole input operand using the
// pointer-width constraint selected by PXL_GLOBAL_PTR.
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_l2(const void* const ptr)
{
  // volatile: an asm with no output operands is otherwise treated as having
  // no side effects and may be deleted or moved by the compiler.
  asm volatile("prefetch.global.L2 [%0];" : : PXL_GLOBAL_PTR(ptr));
}

cuda - 在 cuda 中预取（通过 C 代码）

2 回答 2

Related

Reference