caching - 为什么非连续加载更快，即使缓存未命中惩罚保证为零？

Question

背景：

我用 C 编写了一个函数，并用 arm-none-eabi-gcc (7-2018-q2-update) 编译它。从为循环体生成的汇编代码来看，每次迭代需要 20 个周期，其中包括 2 个等待状态，它们来自从非易失性程序存储器（NVM）读取常量数据的加载操作。

但是，数据手册称我的 MCU 的 NVM 控制器缓存保证缓存未命中惩罚为零，所以我不明白它为什么不能为这两次 NVM 加载操作预取数据。因此，我认为循环每次迭代应该只需要 18 个周期。

不幸的是，实测性能与预期性能有很大的不同。如果我把 int8_t increment 和 int16_t patch_data_i 都改为 int32_t，那么 GCC 实际上会生成相同的指令，只是顺序略有不同。我们称这个版本为 (b)。

有趣的是，版本 (a) 每次迭代需要 21 个周期，而版本 (b) 每次迭代需要 20 个周期！

这种性能差异是高度可重复的。我分别对版本 (a) 和版本 (b) 让迭代次数在 (5, 6, 7, 8) 之间变化，从而非常精确地测量了它。以下是泰克 465 示波器在固定 B 扫描设置下的定时测量结果：

T(a)[min, max, avg] = (21.0, 22.0, 21.3) c @ 48 MHz.
T(b)[min, max, avg] = (20.0, 21.0, 20.3) c @ 48 MHz.

（这个循环体的性能至关重要，因为它执行 8 次迭代，并且每 2000 个时钟周期调用一次这个函数。对于我的应用程序，即使这个单周期差异也大约占总 CPU 时间的 0.5%。）

我的问题有 4 个部分：

这里发生了什么？
为什么版本 (a) 需要 21 个周期而版本 (b) 需要 20 个周期？
为什么两个版本都不占用 18 个周期？
除了尝试汇编指令的随机排列并在示波器上测量每一种结果之外，是否有任何可能的方法来准确预测 Atmel SAMD21 微控制器上 RAM 和 NVM 的访问延迟？

（对这 4 部分中的任何 1 部分的答案将不胜感激。）

源代码（版本 a）

/*
 * Version (a): update the eight entries of enc->patch_data from a packed
 * 32-bit transition word, using narrow (int8_t / int16_t) temporaries.
 *
 * enc_interleave_states() combines old_state and new_state into a 32-bit
 * value; iteration i consumes the 4-bit field at bit offset 4*i of it.
 * Each field selects an increment, which is added to patch_data[i]; the
 * sum (biased by +1) is then mapped through enc_constrain_8x258 and the
 * result is written back to patch_data[i].
 */
/* GCC attribute: emit the function even if it appears unreferenced. */
__attribute__((used))
void enc_calc_transition(struct enc *enc, uint16_t old_state, uint16_t
                         new_state)
{
    uint32_t transitions = enc_interleave_states(old_state, new_state);
    /* j is the bit offset of the current 4-bit field; it tracks 4 * i. */
    size_t j = 0;
    for (size_t i = 0; i < 8; i++, j += 4) {
        /* Extract the 4-bit transition code for entry i (0..15). */
        const size_t transition = (transitions >> j) & 0xf;
        /* Table lookup #1: increment selected by the transition code. */
        const int8_t increment = enc_increment_for_transition[transition];
        int16_t patch_data_i = enc->patch_data[i];
        patch_data_i += increment;
        /*
         * The index is biased by +1 before the second lookup.
         * NOTE(review): presumably the table carries a guard entry at each
         * end so that sums of -1 and 256 stay in range -- confirm against
         * the definition of enc_constrain_8x258. A sum below -1 would wrap
         * to a very large size_t here.
         */
        size_t patch_data_i_plus_one = patch_data_i + 1;
        /* Table lookup #2: map the biased sum through the constrain table. */
        patch_data_i = enc_constrain_8x258[patch_data_i_plus_one];
        enc->patch_data[i] = patch_data_i;
    }
}

源代码（版本 b）

/*
 * Version (b): update the eight entries of enc->patch_data from a packed
 * 32-bit transition word, using int32_t temporaries throughout.
 *
 * enc_interleave_states() combines old_state and new_state into a 32-bit
 * value; iteration i consumes the 4-bit field at bit offset 4*i of it.
 * Each field selects an increment, which is added to patch_data[i]; the
 * sum (biased by +1) is then mapped through enc_constrain_8x258 and the
 * result is written back to patch_data[i].
 */
/* GCC attribute: emit the function even if it appears unreferenced. */
__attribute__((used))
void enc_calc_transition(struct enc *enc, uint16_t old_state, uint16_t
                         new_state)
{
    uint32_t transitions = enc_interleave_states(old_state, new_state);
    /* j is the bit offset of the current 4-bit field; it tracks 4 * i. */
    size_t j = 0;
    for (size_t i = 0; i < 8; i++, j += 4) {
        /* Extract the 4-bit transition code for entry i (0..15). */
        const size_t transition = (transitions >> j) & 0xf;
        /* Table lookup #1: increment selected by the transition code. */
        const int32_t increment = enc_increment_for_transition[transition];
        int32_t patch_data_i    = enc->patch_data[i];
        patch_data_i += increment;
        /*
         * The index is biased by +1 before the second lookup.
         * NOTE(review): presumably the table carries a guard entry at each
         * end so that sums of -1 and 256 stay in range -- confirm against
         * the definition of enc_constrain_8x258. A sum below -1 would wrap
         * to a very large size_t here.
         */
        size_t patch_data_i_plus_one = patch_data_i + 1;
        /* Table lookup #2: map the biased sum through the constrain table. */
        patch_data_i       = enc_constrain_8x258[patch_data_i_plus_one];
        enc->patch_data[i] = patch_data_i;
    }
}

生成的汇编代码（版本 a）

cyc addr    code        instr   fields

x   894e:   2200        movs    r2, #0
x   8950:   250f        movs    r5, #15
x   8952:   4f09        ldr     r7, [pc, #36] ; (8978)
x   8954:   4e09        ldr     r6, [pc, #36] ; (897c)
x   8956:   4460        add     r0, ip

1   8958:   000b        movs    r3, r1
1   895a:   40d3        lsrs    r3, r2
1   895c:   402b        ands    r3, r5
2   895e:   7804        ldrb    r4, [r0, #0]
2   8960:   56f3        ldrsb   r3, [r6, r3]
1   8962:   3204        adds    r2, #4
1   8964:   191b        adds    r3, r3, r4
1   8966:   18fb        adds    r3, r7, r3
2   8968:   785b        ldrb    r3, [r3, #1]
2   896a:   7003        strb    r3, [r0, #0]
1   896c:   3001        adds    r0, #1
1   896e:   2a20        cmp     r2, #32
2   8970:   d1f2        bne.n   8958 <enc_calc_transition+0x38>
18

x   8972:   bdf0        pop     {r4, r5, r6, r7, pc}
x   8974:   000090a8 ; <enc_expand_16x256>
x   8978:   00008fa4 ; <enc_constrain_8x258>
x   897c:   00008f94 ; <enc_increment_for_transition> [signed, 8x16]

instruction cycles:

movs lsrs ands ldrb ldrsb adds adds adds ldrb strb adds cmp bne
= 1 + 1 + 1 + 2 + 2 + 1 + 1 + 1 + 2 + 2 + 1 + 1 + 2
= 18

生成的汇编代码（版本 b）

cyc addr    code        instr   fields

x   894e:   2200        movs    r2, #0
x   8950:   250f        movs    r5, #15
x   8952:   4f09        ldr     r7, [pc, #36] ; (8978)
x   8954:   4e09        ldr     r6, [pc, #36] ; (897c)
x   8956:   4460        add     r0, ip

1   8958:   0021        movs    r1, r4
1   895a:   40d1        lsrs    r1, r2
2   895c:   7803        ldrb    r3, [r0, #0]
1   895e:   4029        ands    r1, r5
2   8960:   5671        ldrsb   r1, [r6, r1]
1   8962:   18fb        adds    r3, r7, r3
1   8964:   185b        adds    r3, r3, r1
2   8966:   785b        ldrb    r3, [r3, #1]
1   8968:   3204        adds    r2, #4
2   896a:   7003        strb    r3, [r0, #0]
1   896c:   3001        adds    r0, #1
1   896e:   2a20        cmp     r2, #32
2   8970:   d1f2        bne.n   8958
18

x   8972:   bdf0        pop     {r4, r5, r6, r7, pc}
x   8974:   000090a8 ; <enc_expand_16x256>
x   8978:   00008fa4 ; <enc_constrain_8x258>
x   897c:   00008f94 ; <enc_increment_for_transition> [signed, 8x16]

instruction cycles:

movs lsrs ldrb ands ldrsb adds adds ldrb adds strb adds cmp bne
= 1 + 1 + 2 + 1 + 2 + 1 + 1 + 2 + 1 + 2 + 1 + 1 + 2
= 18

我对生成的汇编代码的解释（版本 a）

我已经为每个版本写出了我对生成的汇编代码的“解释”。这部分可能并非必要，但我还是把它包括进来，因为它帮助我理解了 (a) 和 (b) 之间的区别。如上所述，循环前后的部分是相同的。我能看到的唯一显著区别是两个版本以略微不同的顺序执行相同的指令。特别是，版本 (b)（每次迭代需要 20 个周期）中有零个连续的加载/存储操作、零个连续的加载/加载操作和零个连续的存储/存储操作。（文档记载的每个加载操作的等待状态数标注在方括号中：例如，1 个等待状态表示为 // ^ ldrb [1]。）

r2 size_t j = 0;
r5 uint32_t mask_0xf = 0xf;
r7 uint8_t *constrain = &enc_constrain_8x258[0]; // 0x8fa4
r6 uint8_t *increment_for_transition =
    &enc_increment_for_transition[0]; // 0x8f94
r0 uint8_t *patch_data = &enc->patch_data[0]

do {
    r3 uint32_t _transitions = transitions;
    r3 uint32_t transitions_shifted = _transitions >> j;

    r3 size_t transition = transitions_shifted & mask_0xf;
    r4 int16_t patch_data_i = *(patch_data + 0); //
        // ^ ldrb [0]
    r3 int8_t _increment = *(increment_for_transition + transition);
        // ^ ldrsb [1]

    j += 4;
    r3 int16_t inc_plus_pdata = _increment + patch_data_i;
    r3 uint8_t *constrain_plus_inc_plus_pdata =
        constrain + inc_plus_pdata;
    r3 uint8_t constrained_pdata = *(constrain_plus_inc_plus_pdata + 1);
        // ^ ldrb [1]

    *(patch_data + 0) = constrained_pdata;
        // ^ strb [0]
    patch_data++;
} while (j < 32);

我对生成的汇编代码的解释（版本 b）

r2 size_t j = 0;
r5 uint32_t mask_0xf = 0xf;
r7 uint8_t *constrain = &enc_constrain_8x258[0]; // 0x8fa4
r6 uint8_t *increment_for_transition =
    &enc_increment_for_transition[0]; // 0x8f94
r0 uint8_t *patch_data = &enc->patch_data[0]

do {
    r1 uint32_t _transitions = transitions;
    r1 uint32_t transitions_shifted = _transitions >> j;

    r3 int32_t patch_data_i = *(patch_data + 0);
        // ^ ldrb [0]
    r1 size_t transition = transitions_shifted & mask_0xf;
    r1 int32_t _increment = *(increment_for_transition + transition);
        // ^ ldrsb [1]

    r3 uint8_t *constrain_plus_pdata = constrain + patch_data_i;
    r3 uint8_t *constrain_plus_pdata_plus_inc =
        constrain_plus_pdata + _increment;
    r3 uint8_t constrained_pdata = *(constrain_plus_pdata_plus_inc + 1);
        // ^ ldrb [1]
    j += 4;

    *(patch_data + 0) = constrained_pdata;
        // ^ strb [0]
    patch_data++;
} while (j < 32);

平台信息

微控制器是 Atmel/Microchip AT91SAMD21G18A。
架构是 ARMv6-M。
微架构是 ARM Cortex-M0+。
我的 MCU 内核的主时钟频率是 48 MHz。
在 48 MHz 时，如果高速缓存被禁用，SAMD21 [非易失性] 程序存储器需要 1 个等待状态。
在 48 MHz 时，SAMD21 SRAM 需要零等待状态。

但是，我看不出从 RAM 执行代码会更快的理由。我相信 NVM 数据路径与 RAM 数据路径是分开的，因此从 NVM 获取的指令永远不应与从 RAM 获取的数据竞争（我不是 100% 确定这一事实，但我认为这是真的）。因此，如果 NVM 控制器缓存按文档说明工作，那么从 NVM 运行此循环似乎几乎肯定会比从 RAM 运行此循环更快。

SAMD21 有一个 64 字节的高速缓存，用于访问非易失性存储器。
NVM 控制器缓存“是一种直接映射的缓存，实现了 8 行 64 位（即 64 字节）”。
NVM 控制器缓存在NO_MISS_PENALTY模式下启用。
这是NO_MISS_PENALTY模式的数据表描述：“NVM 控制器（缓存系统）不会在缓存未命中时插入等待状态。提供最佳系统性能。”
数据表没有提供有关NO_MISS_PENALTY模式的更多信息。

score 0 · Accepted Answer

Cortex-M0+ 使用冯诺依曼架构。无论是在零等待状态的 SRAM 中还是在闪存中，指令取指总是与数据访问相冲突。