4

我正在尝试使用 PAPI 库来统计缓存未命中数。我的硬件上没有可用的缓存命中性能计数器,因此我想通过"没有发生缓存未命中"来间接判断访问是否命中了缓存。我做了几种尝试,代码的第一个版本如下:

  // Measurement snippet, first variant: counts two PAPI preset events
  // (PAPI_L1_DCM and PAPI_L2_TCM) around two passes over `array`.
  // `array`, `arr_size`, `ret1`, `ret2`, `miss1` and `miss2` are declared
  // outside this snippet.
  int numEvents = 2;

  long long values[2];

  int events[2] = {PAPI_L1_DCM, PAPI_L2_TCM};


 // NOTE: a failed start only prints a message; execution still continues
 // into the loops and counter reads below.
 if (PAPI_start_counters(events, numEvents) != PAPI_OK )  // !=PAPI_OK

    printf("PAPI error: %d\n", 1);

 // Pass 1: store 1 into the `value` field of every element.
 for(int i=0; i < arr_size; i++)
  {
    array[i].value = 1;

  }

_mm_mfence();

// First read after pass 1; any failure is reported via PAPI_strerror and is fatal.
// NOTE(review): the classic PAPI high-level API is documented as resetting the
// counters on each PAPI_read_counters call -- confirm for the installed version.
if ((ret1 = PAPI_read_counters(values, numEvents)) != PAPI_OK) {
   fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret1));
   exit(1);
}
// values[0] is taken as the count for events[0] (PAPI_L1_DCM); values[1] is never used.
miss1 = values[0];

_mm_mfence();

// Pass 2: read-modify-write of the same `value` fields touched by pass 1.
for(int i=0; i < arr_size; i++){
         array[i].value = array[i].value + 9; // (int) sum
}

_mm_mfence();

// Second read after pass 2; same fatal error handling as the first read.
if ((ret2 = PAPI_read_counters(values, numEvents)) != PAPI_OK) {
    fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret2));
    exit(1);
}

miss2 = values[0];

// Both printed numbers come from slot 0 of `values`.
printf("before flush miss_1 %lli, miss_2 %lli \n", miss1, miss2);

问题是这段代码应该给我缓存命中,所以 L1 缓存未命中应该非常低。但是我得到了miss_2的出乎意料的高结果。数组大小为 200 时,miss_2 接近 100。它没有给出任何有效的结果来判断它真的被命中,因为缓存未命中的次数很多。

我也尝试像这样重写它:

// Measurement snippet, second variant: a single start/stop pair around one pass.
// `events`, `numEvents`, `values`, `array` and `arr_size` come from outside this snippet.
// NOTE: a failed start only prints a message; the loop below still runs.
if (PAPI_start_counters(events, numEvents) != PAPI_OK )  // !=PAPI_OK

     printf("PAPI error: %d\n", 1);

// Read-modify-write of the `value` field of every element.
for(int i=0; i < arr_size; i++){
         array[i].value = array[i].value + 9; // (int) sum
}

// NOTE: a failed stop is only reported; `values` is printed regardless.
if ( PAPI_stop_counters(values, numEvents) != PAPI_OK)
   printf("PAPI error: 2\n");

// Only slot 0 of `values` is printed.
printf("before flush miss %lli\n", values[0]);

但这给出了更糟糕的结果,miss_2 超过 200。有什么我做错了吗?它应该给出更精确的结果,但现在它做得很糟糕。或者我错过了一些东西。
我试过不带栅栏,我相信至少它们不会造成任何伤害。我真的很感激任何建议。

PAPI_read_counters 的缺点是开销很大,而且性能不是很好,但现在我不在乎性能,我想正确确定缓存命中。

虽然我也在考虑使用 RDPMC,但我还没有找到不必自己重写 _asm 函数就能使用它的示例。这真的是使用 rdpmc 的唯一方法吗?难道没有现成的、不需要我自己重写的函数吗?

编辑:添加了使用 PAPI_read 的版本经编译后的反汇编代码

    ./prog6:     file format elf64-x86-64


Disassembly of section .init:

00000000000009c0 <_init>:
 9c0:   48 83 ec 08             sub    $0x8,%rsp
 9c4:   48 8b 05 1d 16 20 00    mov    0x20161d(%rip),%rax        # 201fe8 <__gmon_start__>
 9cb:   48 85 c0                test   %rax,%rax
 9ce:   74 02                   je     9d2 <_init+0x12>
 9d0:   ff d0                   callq  *%rax
 9d2:   48 83 c4 08             add    $0x8,%rsp
 9d6:   c3                      retq   

Disassembly of section .plt:

00000000000009e0 <.plt>:
 9e0:   ff 35 6a 15 20 00       pushq  0x20156a(%rip)        # 201f50 <_GLOBAL_OFFSET_TABLE_+0x8>
 9e6:   ff 25 6c 15 20 00       jmpq   *0x20156c(%rip)        # 201f58 <_GLOBAL_OFFSET_TABLE_+0x10>
 9ec:   0f 1f 40 00             nopl   0x0(%rax)

00000000000009f0 <puts@plt>:
 9f0:   ff 25 6a 15 20 00       jmpq   *0x20156a(%rip)        # 201f60 <puts@GLIBC_2.2.5>
 9f6:   68 00 00 00 00          pushq  $0x0
 9fb:   e9 e0 ff ff ff          jmpq   9e0 <.plt>

0000000000000a00 <clock_gettime@plt>:
 a00:   ff 25 62 15 20 00       jmpq   *0x201562(%rip)        # 201f68 <clock_gettime@GLIBC_2.17>
 a06:   68 01 00 00 00          pushq  $0x1
 a0b:   e9 d0 ff ff ff          jmpq   9e0 <.plt>

0000000000000a10 <getpid@plt>:
 a10:   ff 25 5a 15 20 00       jmpq   *0x20155a(%rip)        # 201f70 <getpid@GLIBC_2.2.5>
 a16:   68 02 00 00 00          pushq  $0x2
 a1b:   e9 c0 ff ff ff          jmpq   9e0 <.plt>

0000000000000a20 <__stack_chk_fail@plt>:
 a20:   ff 25 52 15 20 00       jmpq   *0x201552(%rip)        # 201f78 <__stack_chk_fail@GLIBC_2.4>
 a26:   68 03 00 00 00          pushq  $0x3
 a2b:   e9 b0 ff ff ff          jmpq   9e0 <.plt>

0000000000000a30 <PAPI_read_counters@plt>:
 a30:   ff 25 4a 15 20 00       jmpq   *0x20154a(%rip)        # 201f80 <PAPI_read_counters>
 a36:   68 04 00 00 00          pushq  $0x4
 a3b:   e9 a0 ff ff ff          jmpq   9e0 <.plt>

0000000000000a40 <sched_setaffinity@plt>:
 a40:   ff 25 42 15 20 00       jmpq   *0x201542(%rip)        # 201f88 <sched_setaffinity@GLIBC_2.3.4>
 a46:   68 05 00 00 00          pushq  $0x5
 a4b:   e9 90 ff ff ff          jmpq   9e0 <.plt>

0000000000000a50 <PAPI_start_counters@plt>:
 a50:   ff 25 3a 15 20 00       jmpq   *0x20153a(%rip)        # 201f90 <PAPI_start_counters>
 a56:   68 06 00 00 00          pushq  $0x6
 a5b:   e9 80 ff ff ff          jmpq   9e0 <.plt>

0000000000000a60 <PAPI_stop_counters@plt>:
 a60:   ff 25 32 15 20 00       jmpq   *0x201532(%rip)        # 201f98 <PAPI_stop_counters>
 a66:   68 07 00 00 00          pushq  $0x7
 a6b:   e9 70 ff ff ff          jmpq   9e0 <.plt>

0000000000000a70 <malloc@plt>:
 a70:   ff 25 2a 15 20 00       jmpq   *0x20152a(%rip)        # 201fa0 <malloc@GLIBC_2.2.5>
 a76:   68 08 00 00 00          pushq  $0x8
 a7b:   e9 60 ff ff ff          jmpq   9e0 <.plt>

0000000000000a80 <PAPI_strerror@plt>:
 a80:   ff 25 22 15 20 00       jmpq   *0x201522(%rip)        # 201fa8 <PAPI_strerror>
 a86:   68 09 00 00 00          pushq  $0x9
 a8b:   e9 50 ff ff ff          jmpq   9e0 <.plt>

0000000000000a90 <__printf_chk@plt>:
 a90:   ff 25 1a 15 20 00       jmpq   *0x20151a(%rip)        # 201fb0 <__printf_chk@GLIBC_2.3.4>
 a96:   68 0a 00 00 00          pushq  $0xa
 a9b:   e9 40 ff ff ff          jmpq   9e0 <.plt>

0000000000000aa0 <getrusage@plt>:
 aa0:   ff 25 12 15 20 00       jmpq   *0x201512(%rip)        # 201fb8 <getrusage@GLIBC_2.2.5>
 aa6:   68 0b 00 00 00          pushq  $0xb
 aab:   e9 30 ff ff ff          jmpq   9e0 <.plt>

0000000000000ab0 <exit@plt>:
 ab0:   ff 25 0a 15 20 00       jmpq   *0x20150a(%rip)        # 201fc0 <exit@GLIBC_2.2.5>
 ab6:   68 0c 00 00 00          pushq  $0xc
 abb:   e9 20 ff ff ff          jmpq   9e0 <.plt>

0000000000000ac0 <fwrite@plt>:
 ac0:   ff 25 02 15 20 00       jmpq   *0x201502(%rip)        # 201fc8 <fwrite@GLIBC_2.2.5>
 ac6:   68 0d 00 00 00          pushq  $0xd
 acb:   e9 10 ff ff ff          jmpq   9e0 <.plt>

0000000000000ad0 <__fprintf_chk@plt>:
 ad0:   ff 25 fa 14 20 00       jmpq   *0x2014fa(%rip)        # 201fd0 <__fprintf_chk@GLIBC_2.3.4>
 ad6:   68 0e 00 00 00          pushq  $0xe
 adb:   e9 00 ff ff ff          jmpq   9e0 <.plt>

Disassembly of section .plt.got:

0000000000000ae0 <__cxa_finalize@plt>:
 ae0:   ff 25 12 15 20 00       jmpq   *0x201512(%rip)        # 201ff8 <__cxa_finalize@GLIBC_2.2.5>
 ae6:   66 90                   xchg   %ax,%ax

Disassembly of section .text:

0000000000000af0 <main>:
     af0:   41 57                   push   %r15
     af2:   b9 0f 00 00 00          mov    $0xf,%ecx
     af7:   41 56                   push   %r14
     af9:   41 55                   push   %r13
     afb:   41 54                   push   %r12
     afd:   55                      push   %rbp
     afe:   53                      push   %rbx
     aff:   48 81 ec 78 01 00 00    sub    $0x178,%rsp
     b06:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
     b0d:   00 00 
     b0f:   48 89 84 24 68 01 00    mov    %rax,0x168(%rsp)
     b16:   00 
     b17:   31 c0                   xor    %eax,%eax
     b19:   48 8d 9c 24 e0 00 00    lea    0xe0(%rsp),%rbx
     b20:   00 
     b21:   48 b8 00 00 00 80 07    movabs $0x8000000780000000,%rax
     b28:   00 00 80 
     b2b:   48 c7 84 24 e0 00 00    movq   $0x1,0xe0(%rsp)
     b32:   00 01 00 00 00 
     b37:   48 8d 53 08             lea    0x8(%rbx),%rdx
     b3b:   48 89 84 24 c8 00 00    mov    %rax,0xc8(%rsp)
     b42:   00 
     b43:   31 c0                   xor    %eax,%eax
     b45:   48 89 d7                mov    %rdx,%rdi
     b48:   f3 48 ab                rep stos %rax,%es:(%rdi)
     b4b:   e8 c0 fe ff ff          callq  a10 <getpid@plt>
     b50:   48 89 da                mov    %rbx,%rdx
     b53:   be 80 00 00 00          mov    $0x80,%esi
     b58:   89 c7                   mov    %eax,%edi
     b5a:   e8 e1 fe ff ff          callq  a40 <sched_setaffinity@plt>
     b5f:   85 c0                   test   %eax,%eax
     b61:   0f 85 17 03 00 00       jne    e7e <main+0x38e>
     b67:   0f ae f0                mfence 
     b6a:   48 8d 74 24 10          lea    0x10(%rsp),%rsi
     b6f:   bf 02 00 00 00          mov    $0x2,%edi
     b74:   0f ae f0                mfence 
     b77:   e8 84 fe ff ff          callq  a00 <clock_gettime@plt>
     b7c:   0f 31                   rdtsc  
     b7e:   bf 00 fa 00 00          mov    $0xfa00,%edi
     b83:   0f ae f0                mfence 
     b86:   48 c1 e2 20             shl    $0x20,%rdx
     b8a:   49 89 c6                mov    %rax,%r14
     b8d:   49 09 d6                or     %rdx,%r14
     b90:   e8 db fe ff ff          callq  a70 <malloc@plt>
     b95:   48 8d bc 24 c8 00 00    lea    0xc8(%rsp),%rdi
     b9c:   00 
     b9d:   be 02 00 00 00          mov    $0x2,%esi
     ba2:   49 89 c4                mov    %rax,%r12
     ba5:   e8 a6 fe ff ff          callq  a50 <PAPI_start_counters@plt>
     baa:   85 c0                   test   %eax,%eax
     bac:   0f 85 88 02 00 00       jne    e3a <main+0x34a>
     bb2:   4d 89 e7                mov    %r12,%r15
     bb5:   49 8d 84 24 00 fa 00    lea    0xfa00(%r12),%rax
     bbc:   00 
     bbd:   4c 89 e5                mov    %r12,%rbp
     bc0:   c7 45 00 01 00 00 00    movl   $0x1,0x0(%rbp)
     bc7:   48 83 c5 40             add    $0x40,%rbp
     bcb:   48 39 e8                cmp    %rbp,%rax
     bce:   75 f0                   jne    bc0 <main+0xd0>
     bd0:   4c 8d ac 24 d0 00 00    lea    0xd0(%rsp),%r13
     bd7:   00 
     bd8:   be 02 00 00 00          mov    $0x2,%esi
     bdd:   4c 89 ef                mov    %r13,%rdi
     be0:   e8 4b fe ff ff          callq  a30 <PAPI_read_counters@plt>
     be5:   85 c0                   test   %eax,%eax
     be7:   0f 85 b8 02 00 00       jne    ea5 <main+0x3b5>
     bed:   48 8b 84 24 d0 00 00    mov    0xd0(%rsp),%rax
     bf4:   00 
     bf5:   4c 89 e3                mov    %r12,%rbx
     bf8:   48 89 44 24 08          mov    %rax,0x8(%rsp)
     bfd:   0f 1f 00                nopl   (%rax)
     c00:   83 03 09                addl   $0x9,(%rbx)
     c03:   48 83 c3 40             add    $0x40,%rbx
     c07:   48 39 dd                cmp    %rbx,%rbp
     c0a:   75 f4                   jne    c00 <main+0x110>
     c0c:   31 d2                   xor    %edx,%edx
     c0e:   48 8d 35 88 04 00 00    lea    0x488(%rip),%rsi        # 109d <_IO_stdin_used+0x2d>
     c15:   bf 01 00 00 00          mov    $0x1,%edi
     c1a:   31 c0                   xor    %eax,%eax
     c1c:   e8 6f fe ff ff          callq  a90 <__printf_chk@plt>
     c21:   be 02 00 00 00          mov    $0x2,%esi
     c26:   4c 89 ef                mov    %r13,%rdi
     c29:   e8 02 fe ff ff          callq  a30 <PAPI_read_counters@plt>
     c2e:   85 c0                   test   %eax,%eax
     c30:   0f 85 6f 02 00 00       jne    ea5 <main+0x3b5>
     c36:   48 8b 8c 24 d0 00 00    mov    0xd0(%rsp),%rcx
     c3d:   00 
     c3e:   48 8b 54 24 08          mov    0x8(%rsp),%rdx
     c43:   48 8d 35 e6 04 00 00    lea    0x4e6(%rip),%rsi        # 1130 <_IO_stdin_used+0xc0>
     c4a:   31 c0                   xor    %eax,%eax
     c4c:   bf 01 00 00 00          mov    $0x1,%edi
     c51:   e8 3a fe ff ff          callq  a90 <__printf_chk@plt>
     c56:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
     c5d:   00 00 00 
     c60:   41 0f ae 3c 24          clflush (%r12)
     c65:   49 83 c4 40             add    $0x40,%r12
     c69:   49 39 dc                cmp    %rbx,%r12
     c6c:   75 f2                   jne    c60 <main+0x170>
     c6e:   be 02 00 00 00          mov    $0x2,%esi
     c73:   4c 89 ef                mov    %r13,%rdi
     c76:   e8 b5 fd ff ff          callq  a30 <PAPI_read_counters@plt>
     c7b:   85 c0                   test   %eax,%eax
     c7d:   0f 85 22 02 00 00       jne    ea5 <main+0x3b5>
     c83:   48 8b ac 24 d0 00 00    mov    0xd0(%rsp),%rbp
     c8a:   00 
     c8b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
     c90:   41 83 07 09             addl   $0x9,(%r15)
     c94:   49 83 c7 40             add    $0x40,%r15
     c98:   49 39 df                cmp    %rbx,%r15
     c9b:   75 f3                   jne    c90 <main+0x1a0>
     c9d:   be 02 00 00 00          mov    $0x2,%esi
     ca2:   4c 89 ef                mov    %r13,%rdi
     ca5:   e8 86 fd ff ff          callq  a30 <PAPI_read_counters@plt>
     caa:   85 c0                   test   %eax,%eax
     cac:   0f 85 f3 01 00 00       jne    ea5 <main+0x3b5>
     cb2:   48 8b 8c 24 d0 00 00    mov    0xd0(%rsp),%rcx
     cb9:   00 
     cba:   48 8d 35 97 04 00 00    lea    0x497(%rip),%rsi        # 1158 <_IO_stdin_used+0xe8>
     cc1:   bf 01 00 00 00          mov    $0x1,%edi
     cc6:   31 c0                   xor    %eax,%eax
     cc8:   48 89 ea                mov    %rbp,%rdx
     ccb:   e8 c0 fd ff ff          callq  a90 <__printf_chk@plt>
     cd0:   be 02 00 00 00          mov    $0x2,%esi
     cd5:   4c 89 ef                mov    %r13,%rdi
     cd8:   e8 83 fd ff ff          callq  a60 <PAPI_stop_counters@plt>
     cdd:   85 c0                   test   %eax,%eax
     cdf:   0f 85 72 01 00 00       jne    e57 <main+0x367>
     ce5:   0f ae f0                mfence 
     ce8:   0f 31                   rdtsc  
     cea:   bf 02 00 00 00          mov    $0x2,%edi
     cef:   48 c1 e2 20             shl    $0x20,%rdx
     cf3:   48 89 c3                mov    %rax,%rbx
     cf6:   48 8d 74 24 20          lea    0x20(%rsp),%rsi
     cfb:   48 09 d3                or     %rdx,%rbx
     cfe:   e8 fd fc ff ff          callq  a00 <clock_gettime@plt>
     d03:   bf 01 00 00 00          mov    $0x1,%edi
     d08:   48 be db 34 b6 d7 82    movabs $0x431bde82d7b634db,%rsi
     d0f:   de 1b 43 
     d12:   0f ae f0                mfence 
     d15:   48 8b 4c 24 20          mov    0x20(%rsp),%rcx
     d1a:   48 2b 4c 24 10          sub    0x10(%rsp),%rcx
     d1f:   48 69 c9 00 ca 9a 3b    imul   $0x3b9aca00,%rcx,%rcx
     d26:   48 03 4c 24 28          add    0x28(%rsp),%rcx
     d2b:   48 2b 4c 24 18          sub    0x18(%rsp),%rcx
     d30:   48 89 c8                mov    %rcx,%rax
     d33:   48 c1 f9 3f             sar    $0x3f,%rcx
     d37:   48 f7 ee                imul   %rsi
     d3a:   48 8d 35 3f 04 00 00    lea    0x43f(%rip),%rsi        # 1180 <_IO_stdin_used+0x110>
     d41:   31 c0                   xor    %eax,%eax
     d43:   48 c1 fa 12             sar    $0x12,%rdx
     d47:   48 29 ca                sub    %rcx,%rdx
     d4a:   e8 41 fd ff ff          callq  a90 <__printf_chk@plt>
     d4f:   48 89 da                mov    %rbx,%rdx
     d52:   bf 01 00 00 00          mov    $0x1,%edi
     d57:   31 c0                   xor    %eax,%eax
     d59:   4c 29 f2                sub    %r14,%rdx
     d5c:   48 8d 35 53 03 00 00    lea    0x353(%rip),%rsi        # 10b6 <_IO_stdin_used+0x46>
     d63:   e8 28 fd ff ff          callq  a90 <__printf_chk@plt>
     d68:   31 d2                   xor    %edx,%edx
     d6a:   48 8d 35 56 03 00 00    lea    0x356(%rip),%rsi        # 10c7 <_IO_stdin_used+0x57>
     d71:   31 c0                   xor    %eax,%eax
     d73:   bf 01 00 00 00          mov    $0x1,%edi
     d78:   e8 13 fd ff ff          callq  a90 <__printf_chk@plt>
     d7d:   31 ff                   xor    %edi,%edi
     d7f:   48 8d 74 24 30          lea    0x30(%rsp),%rsi
     d84:   e8 17 fd ff ff          callq  aa0 <getrusage@plt>
     d89:   83 f8 ff                cmp    $0xffffffff,%eax
     d8c:   0f 84 d6 00 00 00       je     e68 <main+0x378>
     d92:   48 8b 8c 24 b8 00 00    mov    0xb8(%rsp),%rcx
     d99:   00 
     d9a:   48 8b 94 24 b0 00 00    mov    0xb0(%rsp),%rdx
     da1:   00 
     da2:   48 8d 35 3e 03 00 00    lea    0x33e(%rip),%rsi        # 10e7 <_IO_stdin_used+0x77>
     da9:   31 c0                   xor    %eax,%eax
     dab:   bf 01 00 00 00          mov    $0x1,%edi
     db0:   e8 db fc ff ff          callq  a90 <__printf_chk@plt>
     db5:   c5 f9 57 c0             vxorpd %xmm0,%xmm0,%xmm0
     db9:   bf 01 00 00 00          mov    $0x1,%edi
     dbe:   c5 fb 10 0d 12 04 00    vmovsd 0x412(%rip),%xmm1        # 11d8 <_IO_stdin_used+0x168>
     dc5:   00 
     dc6:   48 69 44 24 30 40 42    imul   $0xf4240,0x30(%rsp),%rax
     dcd:   0f 00 
     dcf:   48 03 44 24 38          add    0x38(%rsp),%rax
     dd4:   48 8d 35 d5 03 00 00    lea    0x3d5(%rip),%rsi        # 11b0 <_IO_stdin_used+0x140>
     ddb:   c4 e1 fb 2a c0          vcvtsi2sd %rax,%xmm0,%xmm0
     de0:   48 69 54 24 40 40 42    imul   $0xf4240,0x40(%rsp),%rdx
     de7:   0f 00 
     de9:   48 03 54 24 48          add    0x48(%rsp),%rdx
     dee:   c5 fb 59 c1             vmulsd %xmm1,%xmm0,%xmm0
     df2:   c4 e1 fb 2c c0          vcvttsd2si %xmm0,%rax
     df7:   c5 f9 57 c0             vxorpd %xmm0,%xmm0,%xmm0
     dfb:   c4 e1 fb 2a c2          vcvtsi2sd %rdx,%xmm0,%xmm0
     e00:   c5 fb 59 c1             vmulsd %xmm1,%xmm0,%xmm0
     e04:   c4 e1 fb 2c d0          vcvttsd2si %xmm0,%rdx
     e09:   48 01 c2                add    %rax,%rdx
     e0c:   31 c0                   xor    %eax,%eax
     e0e:   e8 7d fc ff ff          callq  a90 <__printf_chk@plt>
     e13:   31 c0                   xor    %eax,%eax
     e15:   48 8b 8c 24 68 01 00    mov    0x168(%rsp),%rcx
     e1c:   00 
     e1d:   64 48 33 0c 25 28 00    xor    %fs:0x28,%rcx
     e24:   00 00 
     e26:   75 51                   jne    e79 <main+0x389>
     e28:   48 81 c4 78 01 00 00    add    $0x178,%rsp
     e2f:   5b                      pop    %rbx
     e30:   5d                      pop    %rbp
     e31:   41 5c                   pop    %r12
     e33:   41 5d                   pop    %r13
     e35:   41 5e                   pop    %r14
     e37:   41 5f                   pop    %r15
     e39:   c3                      retq   
     e3a:   ba 01 00 00 00          mov    $0x1,%edx
     e3f:   48 8d 35 47 02 00 00    lea    0x247(%rip),%rsi        # 108d <_IO_stdin_used+0x1d>
     e46:   bf 01 00 00 00          mov    $0x1,%edi
     e4b:   31 c0                   xor    %eax,%eax
     e4d:   e8 3e fc ff ff          callq  a90 <__printf_chk@plt>
     e52:   e9 5b fd ff ff          jmpq   bb2 <main+0xc2>
     e57:   48 8d 3d 4a 02 00 00    lea    0x24a(%rip),%rdi        # 10a8 <_IO_stdin_used+0x38>
     e5e:   e8 8d fb ff ff          callq  9f0 <puts@plt>
     e63:   e9 7d fe ff ff          jmpq   ce5 <main+0x1f5>
     e68:   48 8d 3d 62 02 00 00    lea    0x262(%rip),%rdi        # 10d1 <_IO_stdin_used+0x61>
     e6f:   e8 7c fb ff ff          callq  9f0 <puts@plt>
     e74:   e9 19 ff ff ff          jmpq   d92 <main+0x2a2>
     e79:   e8 a2 fb ff ff          callq  a20 <__stack_chk_fail@plt>
     e7e:   48 8b 0d 9b 11 20 00    mov    0x20119b(%rip),%rcx        # 202020 <stderr@@GLIBC_2.2.5>
     e85:   ba 18 00 00 00          mov    $0x18,%edx
     e8a:   be 01 00 00 00          mov    $0x1,%esi
     e8f:   48 8d 3d de 01 00 00    lea    0x1de(%rip),%rdi        # 1074 <_IO_stdin_used+0x4>
     e96:   e8 25 fc ff ff          callq  ac0 <fwrite@plt>
     e9b:   bf 01 00 00 00          mov    $0x1,%edi
     ea0:   e8 0b fc ff ff          callq  ab0 <exit@plt>
     ea5:   89 c7                   mov    %eax,%edi
     ea7:   e8 d4 fb ff ff          callq  a80 <PAPI_strerror@plt>
     eac:   48 8b 3d 6d 11 20 00    mov    0x20116d(%rip),%rdi        # 202020 <stderr@@GLIBC_2.2.5>
     eb3:   be 01 00 00 00          mov    $0x1,%esi
     eb8:   48 8d 15 49 02 00 00    lea    0x249(%rip),%rdx        # 1108 <_IO_stdin_used+0x98>
     ebf:   48 89 c1                mov    %rax,%rcx
     ec2:   31 c0                   xor    %eax,%eax
     ec4:   e8 07 fc ff ff          callq  ad0 <__fprintf_chk@plt>
     ec9:   bf 01 00 00 00          mov    $0x1,%edi
     ece:   e8 dd fb ff ff          callq  ab0 <exit@plt>
     ed3:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
     eda:   00 00 00 
     edd:   0f 1f 00                nopl   (%rax)

0000000000000ee0 <_start>:
     ee0:   31 ed                   xor    %ebp,%ebp
     ee2:   49 89 d1                mov    %rdx,%r9
     ee5:   5e                      pop    %rsi
     ee6:   48 89 e2                mov    %rsp,%rdx
     ee9:   48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
     eed:   50                      push   %rax
     eee:   54                      push   %rsp
     eef:   4c 8d 05 6a 01 00 00    lea    0x16a(%rip),%r8        # 1060 <__libc_csu_fini>
     ef6:   48 8d 0d f3 00 00 00    lea    0xf3(%rip),%rcx        # ff0 <__libc_csu_init>
     efd:   48 8d 3d ec fb ff ff    lea    -0x414(%rip),%rdi        # af0 <main>
     f04:   ff 15 d6 10 20 00       callq  *0x2010d6(%rip)        # 201fe0 <__libc_start_main@GLIBC_2.2.5>
     f0a:   f4                      hlt    
     f0b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)

0000000000000f10 <deregister_tm_clones>:
     f10:   48 8d 3d f9 10 20 00    lea    0x2010f9(%rip),%rdi        # 202010 <__TMC_END__>
     f17:   55                      push   %rbp
     f18:   48 8d 05 f1 10 20 00    lea    0x2010f1(%rip),%rax        # 202010 <__TMC_END__>
     f1f:   48 39 f8                cmp    %rdi,%rax
     f22:   48 89 e5                mov    %rsp,%rbp
     f25:   74 19                   je     f40 <deregister_tm_clones+0x30>
     f27:   48 8b 05 aa 10 20 00    mov    0x2010aa(%rip),%rax        # 201fd8 <_ITM_deregisterTMCloneTable>
     f2e:   48 85 c0                test   %rax,%rax
     f31:   74 0d                   je     f40 <deregister_tm_clones+0x30>
     f33:   5d                      pop    %rbp
     f34:   ff e0                   jmpq   *%rax
     f36:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
     f3d:   00 00 00 
     f40:   5d                      pop    %rbp
     f41:   c3                      retq   
     f42:   0f 1f 40 00             nopl   0x0(%rax)
     f46:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
     f4d:   00 00 00 

0000000000000f50 <register_tm_clones>:
     f50:   48 8d 3d b9 10 20 00    lea    0x2010b9(%rip),%rdi        # 202010 <__TMC_END__>
     f57:   48 8d 35 b2 10 20 00    lea    0x2010b2(%rip),%rsi        # 202010 <__TMC_END__>
     f5e:   55                      push   %rbp
     f5f:   48 29 fe                sub    %rdi,%rsi
     f62:   48 89 e5                mov    %rsp,%rbp
     f65:   48 c1 fe 03             sar    $0x3,%rsi
     f69:   48 89 f0                mov    %rsi,%rax
     f6c:   48 c1 e8 3f             shr    $0x3f,%rax
     f70:   48 01 c6                add    %rax,%rsi
     f73:   48 d1 fe                sar    %rsi
     f76:   74 18                   je     f90 <register_tm_clones+0x40>
     f78:   48 8b 05 71 10 20 00    mov    0x201071(%rip),%rax        # 201ff0 <_ITM_registerTMCloneTable>
     f7f:   48 85 c0                test   %rax,%rax
     f82:   74 0c                   je     f90 <register_tm_clones+0x40>
     f84:   5d                      pop    %rbp
     f85:   ff e0                   jmpq   *%rax
     f87:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
     f8e:   00 00 
     f90:   5d                      pop    %rbp
     f91:   c3                      retq   
     f92:   0f 1f 40 00             nopl   0x0(%rax)
     f96:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
     f9d:   00 00 00 

0000000000000fa0 <__do_global_dtors_aux>:
     fa0:   80 3d 81 10 20 00 00    cmpb   $0x0,0x201081(%rip)        # 202028 <completed.7696>
     fa7:   75 2f                   jne    fd8 <__do_global_dtors_aux+0x38>
     fa9:   48 83 3d 47 10 20 00    cmpq   $0x0,0x201047(%rip)        # 201ff8 <__cxa_finalize@GLIBC_2.2.5>
     fb0:   00 
     fb1:   55                      push   %rbp
     fb2:   48 89 e5                mov    %rsp,%rbp
     fb5:   74 0c                   je     fc3 <__do_global_dtors_aux+0x23>
     fb7:   48 8b 3d 4a 10 20 00    mov    0x20104a(%rip),%rdi        # 202008 <__dso_handle>
     fbe:   e8 1d fb ff ff          callq  ae0 <__cxa_finalize@plt>
     fc3:   e8 48 ff ff ff          callq  f10 <deregister_tm_clones>
     fc8:   c6 05 59 10 20 00 01    movb   $0x1,0x201059(%rip)        # 202028 <completed.7696>
     fcf:   5d                      pop    %rbp
     fd0:   c3                      retq   
     fd1:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
     fd8:   f3 c3                   repz retq 
     fda:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000fe0 <frame_dummy>:
     fe0:   55                      push   %rbp
     fe1:   48 89 e5                mov    %rsp,%rbp
     fe4:   5d                      pop    %rbp
     fe5:   e9 66 ff ff ff          jmpq   f50 <register_tm_clones>
     fea:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000ff0 <__libc_csu_init>:
     ff0:   41 57                   push   %r15
     ff2:   41 56                   push   %r14
     ff4:   49 89 d7                mov    %rdx,%r15
     ff7:   41 55                   push   %r13
     ff9:   41 54                   push   %r12
     ffb:   4c 8d 25 36 0d 20 00    lea    0x200d36(%rip),%r12        # 201d38 <__frame_dummy_init_array_entry>
    1002:   55                      push   %rbp
    1003:   48 8d 2d 36 0d 20 00    lea    0x200d36(%rip),%rbp        # 201d40 <__init_array_end>
    100a:   53                      push   %rbx
    100b:   41 89 fd                mov    %edi,%r13d
    100e:   49 89 f6                mov    %rsi,%r14
    1011:   4c 29 e5                sub    %r12,%rbp
    1014:   48 83 ec 08             sub    $0x8,%rsp
    1018:   48 c1 fd 03             sar    $0x3,%rbp
    101c:   e8 9f f9 ff ff          callq  9c0 <_init>
    1021:   48 85 ed                test   %rbp,%rbp
    1024:   74 20                   je     1046 <__libc_csu_init+0x56>
    1026:   31 db                   xor    %ebx,%ebx
    1028:   0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
    102f:   00 
    1030:   4c 89 fa                mov    %r15,%rdx
    1033:   4c 89 f6                mov    %r14,%rsi
    1036:   44 89 ef                mov    %r13d,%edi
    1039:   41 ff 14 dc             callq  *(%r12,%rbx,8)
    103d:   48 83 c3 01             add    $0x1,%rbx
    1041:   48 39 dd                cmp    %rbx,%rbp
    1044:   75 ea                   jne    1030 <__libc_csu_init+0x40>
    1046:   48 83 c4 08             add    $0x8,%rsp
    104a:   5b                      pop    %rbx
    104b:   5d                      pop    %rbp
    104c:   41 5c                   pop    %r12
    104e:   41 5d                   pop    %r13
    1050:   41 5e                   pop    %r14
    1052:   41 5f                   pop    %r15
    1054:   c3                      retq   
    1055:   90                      nop
    1056:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
    105d:   00 00 00 

0000000000001060 <__libc_csu_fini>:
    1060:   f3 c3                   repz retq 

Disassembly of section .fini:

0000000000001064 <_fini>:
    1064:   48 83 ec 08             sub    $0x8,%rsp
    1068:   48 83 c4 08             add    $0x8,%rsp
    106c:   c3                      retq 

我的对象大小为 64 字节,另外我还补充了初始化代码:

// Element type of the measured array. Only `value` is accessed by the snippets
// shown; the pad_* members are never referenced there.
// Assuming 4-byte int and 8-byte pointers, the members add up to
// 2 * 4 + 7 * 8 = 64 bytes -- confirm with sizeof(object) on the target platform.
// Note there is no member named pad_1.
typedef struct _object{
  int value;
  int pad_0;
  int * pad_2;
  int * pad_3;
  int * pad_4;
  int * pad_5;
  int * pad_6;
  int * pad_7;
  int * pad_8;
} object;  

// Allocates arr_size objects on the heap and sets each element's `value` to 1.
// NOTE: the malloc result is not checked for NULL before the loop writes through it.
// The pad_* members are left uninitialised by this snippet.
object * array;
int arr_size = 1000;
array = (object *) malloc(arr_size * sizeof(object));
for(int i=0; i < arr_size; i++){
      array[i].value = 1;
    }
4

1 回答 1

3

我在 Haswell 上使用与 PAPI 类似的 LIKWID 做了一些实验。我发现,调用用于初始化和读取性能计数器的函数会在 L1 缓存中引起 600 多次替换。由于 L1 缓存只有 512 行,这意味着这些函数可能会驱逐许多您原本预期留在 L1 中的行。通过查看 PAPI_start_counters 和 _internal_hl_read_cnts 相对较大的源代码,在我看来,这些函数可能会把许多行逐出 L1,因此数组元素无法在这些调用前后一直保留在 L1 中。我已经通过把存储改为加载、并使用 MEM_LOAD_RETIRED.* 事件进行计数,验证了这一点。我认为解决方案是使用 RDPMC 指令。我以前没有直接使用过这条指令。这里的代码片段看起来很有用。

或者,您可以在 PAPI_start_counters/PAPI_read_counters 之后放置两份循环副本,然后从结果中减去一份循环副本的计数。这种方法效果很好。

顺便说一句,当访问的缓存行数大约大于 10 时,Haswell 上的 L1D.REPLACEMENT 计数器似乎相当准确。也许使用 RDPMC 后,该计数器会变得更精确。


从您之前的问题来看,您使用的似乎是 Skylake。根据 PAPI 事件映射,PAPI_L1_DCM 和 PAPI_L2_TCM 在 Intel 处理器上分别映射到 L1D.REPLACEMENT 和 LONGEST_LAT_CACHE.REFERENCE 性能监控事件。这些事件在英特尔手册中的定义如下:

L1D.REPLACEMENT:计算 L1D 数据线替换,包括机会替换,以及需要停止替换或替换块的替换。

LONGEST_LAT_CACHE.REFERENCE:此事件对引用最后一级缓存 (LLC) 的核心发起的可缓存需求请求进行计数。需求请求包括来自 L1D 的加载、RFO 和硬件预取,以及来自 IFU 的指令取指。

在不详细了解这些事件何时发生的情况下,这里有三个与您的问题相关的要点:

  • 这两个事件都以缓存行粒度计算,而不是 x86 指令或加载 uop 粒度。
  • 这些事件可能由于 L1D 硬件预取器而发生。这会影响miss2.
  • 无法使用这些事件(或基于 SnB 的微架构上的任何其他事件集)以缓存行粒度计算特定物理或逻辑内核的 L1D 命中。

在 Skylake 上,您可以使用其他原生事件,按加载指令统计 L1D 的未命中和命中。您可以使用 MEM_LOAD_RETIRED.L1_HIT 来统计在 L1D 中命中的已退役(retired)加载指令的数量,并使用 MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.L1_HIT 来计算在 L1D 中未命中的已退役加载指令的数量。这些事件似乎没有对应的 PAPI 事件。根据文档,您可以将原生事件代码传递给 PAPIF_start_counters。

另一个问题是,我不清楚 PAPIF_start_counters 默认情况下是只统计用户态事件,还是同时统计内核态和用户态事件。看来您可以使用 PAPI_create_eventset 来更好地控制统计范围。

对 PAPI API 的调用也会影响事件计数。您可以尝试使用一个空块来测量它,如下所示:

// Calibration snippet: two back-to-back counter reads with no work in between,
// so the counts returned by the second read reflect the PAPI calls themselves.
// `ret1`, `ret2`, `values` and `numEvents` come from the surrounding program.
if ((ret1 = PAPI_read_counters(values, numEvents)) != PAPI_OK) {
   fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret1));
   exit(1);
}

// Nothing.

// After this read, `values` holds the counts for the empty region above.
if ((ret2 = PAPI_read_counters(values, numEvents)) != PAPI_OK) {
    fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret2));
    exit(1);
}

这一测量可以帮助您估计由 PAPI 本身引入的误差。

另外,我认为您不需要使用_mm_mfence.

于 2019-02-12T07:52:50.117 回答