assembly - 如何反转一个字节

Question

我正在处理一个项目，碰巧我必须颠倒一个字节的顺序。我目前正在使用 AVR Studio Mega32 微控制器。

例如：

0000 0001 becomes 1000 0000
0001 0110 becomes 0110 1000
1101 1001 becomes 1001 1011

首先，我有这个：

ldi r20,0b00010110

反转字节以使 r20 变为 01101000 的最简单方法是什么？

score 3 · Accepted Answer

这是一个片段——它是为 GNU 工具链（avr-gcc、binutils、avr-libc 等）编写的——但它应该很容易适应：

static inline __attribute__ ((always_inline))
uint8_t avr_reverse_byte (uint8_t x)
{
    x = ((x & 0x55) << 1) | ((x & 0xaa) >> 1);
    x = ((x & 0x33) << 2) | ((x & 0xcc) >> 2);

    /* x = ((x & 0x0f) << 4) | ((x & 0xf0) >> 4); */

    __asm__ ("swap %0" : "=r" (x) : "0" (x)); /* swap nibbles. */

    return x;
}

swap因此，除了用指令实现的最终高低半字节交换外，对“C”代码没有太大的改进。

score 2 · Accepted Answer

Another easy way is to use the carry flag:

Repeat 8x:

lsl r20 ; shift one bit into the carry flag
ror r0  ; rotate carry flag into result

(Input in r20, output in r0, content of r20 destroyed; registers may be changed freely.)

This uses 16 instructions @ 2 bytes, 1 cycle each = 32 bytes of program memory and 16 cycles to reverse one byte when completely 'unrolled'. Wrapped in a loop, the code size can be reduced but execution time will increase.

score 2 · Accepted Answer

我现在无法提供 AVR 代码。但一般位反转技术如下：

abcd efgh   p
badc fehg   p = ((p and 0AAh) shr 1) or ((p shl 1) and 0AAh)
dcba hgfe   p = ((p and 033h) shr 2) or ((p shl 2) and 033h)
hgfe dcba   p = ((p and 00Fh) shr 4) or ((p shl 4) and 0F0h)

score 1 · Accepted Answer

字节的两半的 4 位（16 个条目）查找表看起来是一个很好的折衷方案（正如@Aki 在评论中指出的那样）。

AVR 指令每条是 2 个字节，因此 16 字节的表占用的空间与 8 条指令相同。（事实证明，速度或大小不值得，除非您可以将数组对齐 256 个字节以使索引比 gcc 便宜得多。）

可以使用每个字节的高半部分和低半部分来打包 LUT。但这将花费更多的索引（在屏蔽之前使用索引的第 4 位上的分支以有条件地交换）而不是节省表大小（8 字节 = 4 条指令）。

让我们比较一下 AVR GCC 的作用。在编译 Brett 的代码时，gcc4.6 有令人惊讶的可怕的错过优化（实际上是提升到int而不是充分利用将结果截断为 uint8_t）。具有讽刺意味的是，它确实使用 SWAP 指令优化x<<4 | x>>4为旋转。（AVR 没有多计数旋转指令，并且正常旋转是通过进位旋转。移位仅适用于每条指令一个计数。）

#include <stdint.h>

uint8_t reverse_byte_alu (uint8_t x)
{
    uint8_t xeven = x & 0x55,  xodd = x & 0xaa;
    x = (xeven << 1) | (xodd >> 1);  // swap adjacent bit pairs

    xeven = x & 0x33,  xodd = x & 0xcc;
    x = (xeven << 2) | (xodd >> 2);  // swap adjacent 2-bit chunks

    x = ((x << 4) | (x >> 4));  // 4-bit rotate is recognized as SWAP
    return x;
}

gcc4.6 -O3使用(Godbolt compiler explorer)编译成这个 asm。我没有看到任何错过的优化。

reverse_byte_alu:
    mov r25,r24
    andi r25,lo8(85)
    lsl r25
    andi r24,lo8(-86)
    lsr r24
    or r25,r24              # swap of adjacent bits done

    mov r24,r25
    andi r24,lo8(51)
    lsl r24
    lsl r24                 # << 2
    andi r25,lo8(-52)
    lsr r25
    lsr r25                 # >> 2
    or r24,r25              # swap pairs of bits done

    swap r24                # swap nibbles
    ret

16 条指令，每条 2 字节，全部为 1 周期。（除外ret） https://www.microchip.com/webdoc/avrassembler/avrassembler.wb_instruction_list.html

所以这比@JimmyB 的答案略好，后者需要 16 条单周期指令，不包括ret. （但它可以卷成一个小循环）。

数组索引在 AVR 中并不便宜。寻址模式的唯一选择是后递增或前递减，没有立即位移。所以必须将 16 位数组地址添加到 4 位半字节值。如果您的数组位于低 256 字节的地址空间中，这可能会更便宜。或者，如果您的数组是 256 字节对齐的，您可以只设置指针寄存器的高字节并将查找值放在低字节中。（gcc 错过了这个优化）。

告诉 gcc 将数组对齐在 16 字节边界上会使地址计算更便宜，但总指令数为 18，其中一些是多周期指令。

__attribute__((aligned(16)))  // makes indexing cheaper
static const uint8_t reverse_nibble[16] = {
        0,      0b1000, 0b0100, 0b1100,
        0b0010, 0b1010, 0b0110, 0b1110,
        0b0001, 0b1001, 0b0101, 0b1101,
        0b0011, 0b1011, 0b0111, 0b1111
        };

uint8_t reverse_byte_nibble_LUT (uint8_t x) {
    uint8_t hi = reverse_nibble[x>>4];
    hi = ((hi << 4) | (hi >> 4));  // SWAP instead of SWAP+AND for just a shift
    uint8_t lo = reverse_nibble[x & 0x0f];
    return hi | lo;
}

编译成 17 条指令，LD 是 2 周期指令，在非递增/递减寻址模式下访问 FLASH 时。（当不访问闪存或内部 SRAM 时，在某些 CPU 上为 1 个周期）。

  # gcc4.6 output, not optimal
    mov r26,r24
    swap r26
    andi r26,lo8(15)
    ldi r27,lo8(0)
    subi r26,lo8(-(reverse_nibble))   # AVR doesn't have add-immediate byte, only sub
    sbci r27,hi8(-(reverse_nibble))   # X register = (x>>4) - (-LUT_base)
    ld r25,X
    swap r25
    mov r30,r24
    ldi r31,lo8(0)
    andi r30,lo8(15)
    andi r31,hi8(15)                 # missed opt, this is a double-redundant 0 & 0
    subi r30,lo8(-(reverse_nibble))
    sbci r31,hi8(-(reverse_nibble))
    ld r24,Z
    or r24,r25
    ret

ldi r27, 0/sbci r27可能是一个错过的优化。使用 16 字节对齐的表，我们无法进入高字节。我认为我们可以这样做：

    # generate r30 = x&0x0f
    subi r30,lo8(-(reverse_nibble))   # ORI would work, too.  no-op with 256-byte aligned table
    ldi  r31,hi8(reverse_nibble)      # reuse this for hi and lo

因此，这可能会以更好的索引在速度上领先，但总大小（代码 + 表）肯定看起来更糟。

score 1 · Accepted Answer

通过一些调整，可以从评论中的代码片段（我的）获得额外的性能。

当我们确保 LUT 与 16 字节边界对齐时，我们可以通过异或生成地址。我们也可以通过索引对表进行异或，允许x就地修改参数。我已经注释掉了 GCC 生成的不必要的指令。

__attribute__((aligned(16)))  // makes indexing cheaper
static const uint8_t reverse_nibble_xor[16] = {
        0 ^ 0,  1 ^ 0b1000, 2 ^ 0b0100, 3 ^ 0b1100,
        4 ^ 0b0010, 5 ^ 0b1010, 6 ^ 0b0110, 7 ^ 0b1110,
        8 ^ 0b0001, 9 ^ 0b1001, 10 ^ 0b0101, 11 ^ 0b1101,
        12 ^ 0b0011, 13 ^ 0b1011, 14 ^ 0b0111, 15 ^ 0b1111
        };


uint8_t reverse_ams(uint8_t x)
{
    uint8_t *p = (uint8_t *)((uint16_t)reverse_nibble_xor ^ (x & 15));
    x ^= p[0];
    x = ((x << 4) | (x >> 4));
    uint8_t *q = (uint8_t *)((uint16_t)reverse_nibble_xor ^ (x & 15));
    x ^= q[0];
    return x;
}

reverse_ams:
        ldi r18,lo8(reverse_nibble)
//      ldi r19,hi8(reverse_nibble)
        ldi r31,hi8(reverse_nibble)  // use r31 directly instead of r19
        mov r30,r24
//        ldi r31,lo8(0)
        andi r30,lo8(15)
//        andi r31,hi8(15)
        eor r30,r18
//        eor r31,r19
        ld r25,Z
        eor r25,r24
        swap r25
        mov r30,r25
//        ldi r31,lo8(0)
        andi r30,lo8(15)
//        andi r31,hi8(15)
        eor r30,r18
//        eor r31,r19
        ld r24,Z
        eor r24,r25
        ret

assembly - 如何反转一个字节

5 回答 5

Related

Reference