1

我正在尝试在 AMD GPU 中使用 GDS,但我无法使其工作。我的 GPU 是 AMD RX580。

我使用了这个 OpenCL 内核:

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))

void localVarExample(__global int *res)
{
  int i = get_global_id(0);
  __local int x;
  if (i == 0) {
    x = 0;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  atom_inc(&x);
  barrier(CLK_LOCAL_MEM_FENCE);
  if (i == 0) {
    *res = x;
  }
} 

用(CLRX)构建和拆卸。之后,我在 DS 指令中添加了 gds 标志并修改了 M0 寄存器。

所以我得到了这个 ASM 内核:

/* Disassembling 'barrier-Ellesmere.bin' */
.amdcl2
.gpu Iceland
.64bit
.arch_minor 0
.arch_stepping 4
.driver_version 203603
.compile_options "-fno-bin-source -fno-bin-llvmir -fno-bin-amdil -fbin-exe -D__AMD__=1 -D__Ellesmere__=1 -D__Ellesmere=1 -D__IMAGE_SUPPORT__=1 -DFP_FAST_FMA=1 -cl-denorms-are-zero -m64 -Dcl_khr_fp64=1 -Dcl_amd_fp64=1 -Dcl_khr_global_int32_base_atomics=1 -Dcl_khr_global_int32_extended_atomics=1 -Dcl_khr_local_int32_base_atomics=1 -Dcl_khr_local_int32_extended_atomics=1 -Dcl_khr_int64_base_atomics=1 -Dcl_khr_int64_extended_atomics=1 -Dcl_khr_3d_image_writes=1 -Dcl_khr_byte_addressable_store=1 -Dcl_khr_fp16=1 -Dcl_khr_gl_sharing=1 -Dcl_khr_gl_depth_images=1 -Dcl_amd_device_attribute_query=1 -Dcl_amd_vec3=1 -Dcl_amd_printf=1 -Dcl_amd_media_ops=1 -Dcl_amd_media_ops2=1 -Dcl_amd_popcnt=1 -Dcl_khr_d3d10_sharing=1 -Dcl_khr_d3d11_sharing=1 -Dcl_khr_dx9_media_sharing=1 -Dcl_khr_image2d_from_buffer=1 -Dcl_khr_spir=1 -Dcl_khr_subgroups=1 -Dcl_khr_gl_event=1 -Dcl_khr_depth_images=1 -Dcl_khr_mipmap_image=1 -Dcl_khr_mipmap_image_writes=1 -Dcl_amd_liquid_flash=1 -Dcl_amd_planar_yuv=1"
.acl_version "AMD-COMP-LIB-v0.8 (0.0.SC_BUILD_NUMBER)"
.kernel localVarExample
    .config
        .dims x
        .cws 64, 1, 1
        .sgprsnum 13
        .vgprsnum 3
        .localsize 4
        .floatmode 0xc0
        .gdssize 0x1000
        .pgmrsrc1 0x00ac0040
        .pgmrsrc2 0x0000008c
        .dx10clamp
        .ieeemode
        .useargs
        .priority 0
        .arg _.global_offset_0, "size_t", long
        .arg _.global_offset_1, "size_t", long
        .arg _.global_offset_2, "size_t", long
        .arg _.printf_buffer, "size_t", void*, global, , rdonly
        .arg _.vqueue_pointer, "size_t", long
        .arg _.aqlwrap_pointer, "size_t", long
        .arg res, "int*", int*, global, 
    .text
        s_mov_b32       s7, 0x1000
        s_mov_b32       m0, s7                          # Setup M0 register with GDS size == 0x1000.
        s_lshl_b32      s0, s6, 6                       # #s0 = s6 * 64 (== group_id * local_size).
        s_load_dwordx2  s[2:3], s[4:5], 0x0             # Load global_offset_0 into s[2:3].
        s_waitcnt       lgkmcnt(0)
        s_add_u32       s0, s0, s2                      # s0 += s2. Now s0 = group_id * local_size + global_offset_0.
        v_add_u32       v0, vcc, s0, v0                 # v0 += s0. Now v0 = get_global_id(0).
        v_cmp_lg_i32    s[0:1], v0, 0                   # | 
        s_mov_b64       s[2:3], exec                    # |  if (get_global_id(0) == 0)
        s_andn2_b64     exec, s[2:3], s[0:1]            # |
        v_mov_b32       v0, 0x0
        s_cbranch_execz .L64_0
        ds_write_b32    v0, v0 gds                      # write 0 to the first uint32 in GDS (only the first thread do this).
.L64_0:
        s_mov_b64       exec, s[2:3]
        v_mov_b32       v0, 0
        v_mov_b32       v1, 1
        s_waitcnt       lgkmcnt(0)
        ds_add_u32      v0, v1 gds                      # Atomic increment the first uint32 in GDS  (v0 is address, v1 is value).
        s_waitcnt       lgkmcnt(0)                      # wait for ds_add_u32
        s_andn2_b64     exec, s[2:3], s[0:1]            # |  if (get_global_id(0) == 0)  reuse SGPR from previous if.
        v_mov_b32       v0, 0
        s_cbranch_execz .L140_0                         # Finish all threads exept by the first one .
        s_load_dwordx2  s[0:1], s[4:5], 0x30            # Load res argument into s[0:1]. s[0:1] == &res[0] now.
        ds_read_b32     v0, v0 gds                      # Read from the first uint32 in GDS into v0.
        s_waitcnt       lgkmcnt(0)                      # Wait for ds_add_u32.
        v_mov_b32       v1, s0                            
        v_mov_b32       v2, s1
        flat_store_dword v[1:2], v0                     # Save v0 into res[0]
.L140_0:
        s_endpgm

我在主机程序中用 0 填充 res 。预期的结果是res[0] == 64。带有 LDS 的 OpenCL 内核就是这样工作的。但在 GDS 版本res[0] == 0中。

4

0 回答 0