c++ - 为什么此代码在实际 BCM2837（pi 3）上运行时会挂起，但在 qemu 上运行良好

Question

考虑以下函数：

//-----------------------------------------------------------------------
// void _atomic_raw_lock(uint64_t *lock)
// ABI:   AAPCS64 (AArch64, GNU as syntax)
// In:    x0 = address of a 64-bit lock word (0 = unlocked, 1 = locked)
// Out:   none; returns only once this core has stored 1 into *lock
// Clobb: x9, w10 (both caller-saved under AAPCS64)
//
// NOTE(review): exclusive loads/stores are only guaranteed to make
// progress on Normal (cacheable) memory.  If the lock word lives in
// memory that is not mapped as Normal (e.g. MMU/caches not yet enabled on
// real hardware), the exclusive store may never succeed and this loop
// spins forever, while an emulator that does not model memory attributes
// will appear to work.  Confirm the boot code's MMU/cache setup.
//-----------------------------------------------------------------------
    .global _atomic_raw_lock
    .type   _atomic_raw_lock, %function
_atomic_raw_lock:
1:  ldaxr       x9, [x0]                // load-acquire exclusive: read lock word, arm the monitor
    cbz         x9, 2f                  // 0 => unlocked, go try to claim it
    wfe                                 // locked: sleep until an event (e.g. monitor cleared by the owner's store)
    b           1b                      // re-read the lock word
2:  mov         x9, #0x01               // x9 = LOCKED
    stxr        w10, x9, [x0]           // w10 = 0 on success, 1 if exclusivity was lost
    cbnz        w10, 1b                 // lost the race: start over
    dsb         sy                      // success path only: conservative full barrier kept from the
                                        // original, since the unlock side is not visible here
    ret

该函数由 C 代码调用，其第一个参数（通过 x0 传入）是一个 64 位整数的地址。该程序在 qemu 中按预期运行。我通过在调用此函数的代码行之前和之后分别设置硬件引脚，确定了问题就出在这里：这个函数永远不会返回。x9 和 w10 是调用者保存的寄存器（我假设调用方的 C 代码会自动保存它们）。

详细信息：在 Pi 3 B+ 上运行，使用来自 toolchains.bootlin.com 的 aarch64-buildroot-linux-gnu 交叉工具链（glibc，静态链接）编译。在调用此函数之前，一切都在真实硬件上运行良好。构建命令：

aarch64-linux-g++ -g -std=gnu++17 -O0 -Wall -fno-exceptions -Wextra -Wno-attributes -fno-asynchronous-unwind-tables -Wno-sign-compare -c start.S -o 
aarch64-linux-ld start.o main.o -T link.ld -o kernel8.elf -l:libc.a -l:libstdc++.a -l:libc.so -l:libgcc_s.so

ld文件：

/* Target architecture and the symbol execution starts at. */
OUTPUT_ARCH(aarch64)
ENTRY(_start)

/* Single flat region: readable, writable and executable, starting at
   0x80000 and spanning 1024 MiB (0x40000000 bytes).
   NOTE(review): confirm this span matches the RAM actually usable by the
   ARM cores on the target board. */
MEMORY
{
    RAM (rwx) : ORIGIN = 0x80000, LENGTH = 1024M
}
SECTIONS
{
    /* The location counter is 0 at the start of SECTIONS; move it to the
       start of RAM so the marker below records where the image begins. */
    . = ORIGIN(RAM);
    PROVIDE(__start_prog_mem = .); /* start-of-image marker for use from C */

    /* Executable code. */
    .text :
    {
        KEEP(*(.text.boot)) /* boot code goes first; keep it even if unreferenced */
        *(.text .text.* .gnu.linkonce.t*) /* all other code, including
        GNU link-once text sections */
    } > RAM

    /* Zero-initialised data. */
    .bss :
    {
        /* Both bounds are 8-byte aligned because __bss_size (defined after
           SECTIONS) is expressed in 8-byte units; with 4-byte alignment a
           trailing word could fall outside the counted range. */
        . = ALIGN(8);
        __bss_start = .;
        *(.bss .bss.* .gnu.linkonce.b.*)
        *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
        *(COMMON)
        . = ALIGN(8);
        __bss_end = .;
    } > RAM

    /* Initialised global and static data. */
    .data :
    {
        . = ALIGN(4);
        __data_begin = .;
        *(.data .data.* .gnu.linkonce.d*)
        *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*)
        . = ALIGN(4);
        __data_end = .;
    } > RAM

    /* Constant data. */
    .rodata :
    {
        . = ALIGN(4);
        __rodata_begin = .;
        *(.rodata .rodata.* .gnu.linkonce.r*) /* includes GNU link-once
        read-only sections */
        *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*)
        . = ALIGN(4);
        __rodata_end = .;
    } > RAM

    /* Per-core stack areas.  Each area reserves, in ascending address
       order, 1024 bytes for EL0, 1048576 bytes for EL1 and 16384 bytes for
       EL2.  Every __elN_stack_coreM symbol is placed at the END (highest
       address) of its reservation; presumably the boot code loads it as
       the initial stack pointer - confirm there. */
    .stack_core0 :
    {
        . = ALIGN(16); /* stack must be 16-byte aligned */
        __stack_start_core0 = .;
        . = . + 1024;    /* EL0 size */
        __el0_stack_core0 = .;
        . = . + 1048576; /* EL1 size */
        __el1_stack_core0 = .;
        . = . + 16384;   /* EL2 size */
        __el2_stack_core0 = .;
        . = ALIGN(16);
        __stack_end_core0 = .;
    } > RAM
    .stack_core1 :
    {
        . = ALIGN(16); /* stack must be 16-byte aligned */
        __stack_start_core1 = .;
        . = . + 1024;    /* EL0 size */
        __el0_stack_core1 = .;
        . = . + 1048576; /* EL1 size */
        __el1_stack_core1 = .;
        . = . + 16384;   /* EL2 size */
        __el2_stack_core1 = .;
        . = ALIGN(16);
        __stack_end_core1 = .;
    } > RAM
    .stack_core2 :
    {
        . = ALIGN(16); /* stack must be 16-byte aligned */
        __stack_start_core2 = .;
        . = . + 1024;    /* EL0 size */
        __el0_stack_core2 = .;
        . = . + 1048576; /* EL1 size */
        __el1_stack_core2 = .;
        . = . + 16384;   /* EL2 size */
        __el2_stack_core2 = .;
        . = ALIGN(16);
        __stack_end_core2 = .;
    } > RAM
    .stack_core3 :
    {
        . = ALIGN(16); /* stack must be 16-byte aligned */
        __stack_start_core3 = .;
        . = . + 1024;    /* EL0 size */
        __el0_stack_core3 = .;
        . = . + 1048576; /* EL1 size */
        __el1_stack_core3 = .;
        . = . + 16384;   /* EL2 size */
        __el2_stack_core3 = .;
        . = ALIGN(16);
        __stack_end_core3 = .;
    } > RAM

    _end = .; /* end of everything placed so far */
    PROVIDE(__end_prog_mem = .); /* end-of-image marker for use from C */

    /* Only records where free memory after the image begins. */
    .heap :
    {
        . = ALIGN(4);
        __heap_start = .;
    } > RAM

    /DISCARD/ : /* input sections listed here are dropped from the output */
    {
        /* Nothing is discarded at present.  Patterns that were tried and
           then disabled: (.comment*), (.gnu*), (.note*), (.eh_frame*). */
    }
}

/* Size of the .bss range counted in 8-byte units (byte length divided by
   8); a remainder smaller than 8 bytes is not counted. */
__bss_size = (__bss_end - __bss_start) / 8;
/* Distance in bytes between the program-memory start and end markers. */
__prog_mem_size = __end_prog_mem - __start_prog_mem;

删除原子锁定机制（最上方的代码清单）后程序可以运行，但由于代码不是线程安全的，输出是混乱的。即使是简单的 bool 标志也不起作用，因为普通的读-改-写（read-modify-write）操作不是多核安全的。

问题重述：为什么它能在模拟器（qemu）上运行，却不能在实际硬件上运行？已经确认上面的 _atomic_raw_lock 函数在真实硬件上被调用时会挂起，而在 qemu 上不会。

编辑：代码在 QEMU 中运行良好，这让我相信这不是死锁。我已经通过运行测试确认了这一点，其中所有程序所做的只是锁定一次并永远旋转。在这种情况下问题仍然存在。

score 1 · Accepted Answer

如果所提供的内存不是 Normal 类型，它可能会（不定期地）失败。

Qemu 不太可能对内存属性进行建模并模仿此类故障。

（对 stxr 感到抱歉，我的心思在 cswap 中）。

您可以考虑在循环中加入迭代计数，这样当程序挂在循环里时，调试器或崩溃转储能帮您把问题找出来。

c++ - 为什么此代码在实际 BCM2837（pi 3）上运行时会挂起，但在 qemu 上运行良好

1 回答 1

Related

Reference