-3

我想将此 C 程序转换为 ARM NEON 汇编代码:

/* Element-wise sum of two 10-element int arrays: str3[k] = str1[k] + str2[k]. */
int main()
{
    int str1[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int str2[] = { 11, 12, 3, 4, 8, 1, 4, 5, 8, 3 };
    int str3[10];
    int i = 0;

    /* Walk all ten positions and store each pairwise sum. */
    while (i < 10) {
        str3[i] = str2[i] + str1[i];
        ++i;
    }
}

谢谢。

4

3 回答 3

2

让我们取个巧,在 gcc 中使用 <arm_neon.h> 提供的内在函数,看看 gcc 会生成什么代码(neon-add.c):

#include <assert.h>
#include <stdio.h>
#include <arm_neon.h>

#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))

/*
 * Prints the first 10 elements of r to stdout, one "index: value" pair per
 * line. The attributes keep GCC from inlining or cloning the function.
 */
__attribute__((noinline, noclone))
static void print(int r[])
{
    int idx = 0;

    while (idx < 10) {
        printf("%d: %d\n", idx, r[idx]);
        idx++;
    }
}

/*
 * Adds str1 and str2 element-wise into result using 64-bit NEON vectors
 * (two 32-bit lanes per step), then prints the ten sums.
 */
int main()
{
    /* Number of int lanes held by one int32x2_t. */
    enum {
        WIDTH = 2,
    };
    int str1[]={1,2,3,4,5,6,7,8,9,10};
    int str2[]={11,12,3,4,8,1,4,5,8,3};
    int result[10];
    size_t i;   /* unsigned, to match the size_t value of ARRAY_LEN() */

    /*
     * The array length (10) is a multiple of WIDTH, so every iteration
     * handles a full vector and no scalar tail loop is needed.
     */
    for (i = 0; i < ARRAY_LEN(str1); i += WIDTH) {
        int32x2_t t1, t2;

        t1 = vld1_s32(&str1[i]);
        t2 = vld1_s32(&str2[i]);
        t2 = vadd_s32(t2, t1);
        vst1_s32(&result[i], t2);
    }

    print(result);

    return 0;
}

让 gcc 输出汇编代码:

$ arm-linux-gnueabihf-gcc -Os -mfpu=neon -S neon-add.c 

你会得到统一语法(unified syntax)的 ARM 汇编代码:

    .syntax unified
    .arch armv7-a
    .eabi_attribute 27, 3
    .eabi_attribute 28, 1
    .fpu neon
    .eabi_attribute 20, 1
    .eabi_attribute 21, 1
    .eabi_attribute 23, 3
    .eabi_attribute 24, 1
    .eabi_attribute 25, 1
    .eabi_attribute 26, 2
    .eabi_attribute 30, 4
    .eabi_attribute 34, 1
    .eabi_attribute 18, 4
    .thumb
    .file   "neon-add.c"
    .text
    .align  1
    .thumb
    .thumb_func
    .type   print, %function
@ print: calls printf once for each of 10 consecutive words starting at the
@ address passed in r0.
print:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ Save r4/r5 and the return address (r3 is pushed as well, presumably
    @ only to keep the stack 8-byte aligned -- TODO confirm).
    push    {r3, r4, r5, lr}
    @ r5 = array base address, r4 = loop index starting at 0.
    mov r5, r0
    movs    r4, #0
.L2:
    @ Set up the printf arguments: r0 = format string address,
    @ r1 = index, r2 = word loaded from r5 + index * 4.
    mov r1, r4
    ldr r2, [r5, r4, lsl #2]
    ldr r0, .L4
    adds    r4, r4, #1
    bl  printf
    @ Loop until the index reaches 10.
    cmp r4, #10
    bne .L2
    @ Restore registers; popping the saved lr into pc returns to the caller.
    pop {r3, r4, r5, pc}
.L5:
    .align  2
.L4:
    @ Literal pool entry holding the address of the format string .LC2.
    .word   .LC2
    .size   print, .-print
    .section    .text.startup,"ax",%progbits
    .align  1
    .global main
    .thumb
    .thumb_func
    .type   main, %function
@ main: copies two 10-word tables from .rodata onto the stack, adds them two
@ words at a time with NEON, passes the sums to print, and returns 0.
@ Stack layout used below: sp+0 first input, sp+40 second input, sp+80 sums.
main:
    @ args = 0, pretend = 0, frame = 120
    @ frame_needed = 0, uses_anonymous_args = 0
    push    {r4, r5, lr}
    sub sp, sp, #124
    @ r4 = address of .LANCHOR0 (start of the constant tables), r5 = sp+0.
    ldr r4, .L9
    mov r5, sp
    @ Copy the first 10 words (4 + 4 + 2) from the tables to sp+0.
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4, {r0, r1}
    @ The last load had no writeback, so step r4 past those two words;
    @ it now points at the second table.
    adds    r4, r4, #8
    stmia   r5, {r0, r1}
    @ Copy the next 10 words (4 + 4 + 2) to sp+40.
    add r5, sp, #40
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    @ r3 = byte offset into the arrays, starting at 0.
    movs    r3, #0
    ldmia   r4, {r0, r1}
    stmia   r5, {r0, r1}
    @ Enter the loop at its address-computation/test part.
    b   .L7
.L8:
    @ Loop body: load two 32-bit lanes from each input, add them, and
    @ store the two sums to the output.
    vld1.32 {d16}, [r2]
    vld1.32 {d17}, [r1]
    vadd.i32    d16, d17, d16
    vst1.32 {d16}, [r0]
.L7:
    @ Compute this iteration's pointers from the current offset r3:
    @ r1 = sp+40+r3 (second input), r0 = sp+80+r3 (output),
    @ r2 = sp+0+r3 (first input).
    add r4, sp, #40
    add r2, sp, #80
    adds    r1, r3, r4
    add r4, sp, #0
    adds    r0, r3, r2
    adds    r2, r3, r4
    @ Advance the offset by 8 bytes (two words). The pointers above still
    @ use the old offset, so the loop exits when the incremented offset
    @ reaches 48, i.e. after the offsets 0..32 have been processed.
    adds    r3, r3, #8
    cmp r3, #48
    bne .L8
    @ Call print with the address of the sums (sp+80).
    add r0, sp, #80
    bl  print
    @ Return 0: release the frame and pop the saved lr into pc.
    movs    r0, #0
    add sp, sp, #124
    pop {r4, r5, pc}
.L10:
    .align  2
.L9:
    @ Literal pool entry holding the address of the constant tables.
    .word   .LANCHOR0
    .size   main, .-main
    .section    .rodata
    .align  2
.LANCHOR0 = . + 0
@ First 10-word table: the initial values main copies to sp+0.
.LC0:
    .word   1
    .word   2
    .word   3
    .word   4
    .word   5
    .word   6
    .word   7
    .word   8
    .word   9
    .word   10
@ Second 10-word table: the initial values main copies to sp+40.
.LC1:
    .word   11
    .word   12
    .word   3
    .word   4
    .word   8
    .word   1
    .word   4
    .word   5
    .word   8
    .word   3
    .section    .rodata.str1.1,"aMS",%progbits,1
.LC2:
    .ascii  "%d: %d\012\000"
    .ident  "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
    .section    .note.GNU-stack,"",%progbits
于 2013-05-15T12:44:41.710 回答
1

10 个元素对于 SIMD 优化来说并不是一个很合适的规模,不过仅作为示例,您可以使用 64 位 NEON 内在函数这样实现:

#include <arm_neon.h>

/*
 * Adds str1 and str2 element-wise into str3, two ints per iteration, using
 * the 64-bit NEON intrinsics from <arm_neon.h>.
 */
int main()
{
    int str1[]={1,2,3,4,5,6,7,8,9,10};
    int str2[]={11,12,3,4,8,1,4,5,8,3};
    int str3[10];
    int i;

    /* Step by 2: each int32x2_t holds two ints, and 10 is a multiple of 2. */
    for (i = 0; i < 10; i += 2)
    {
        int32x2_t v1, v2, v3;

        v1 = vld1_s32(&str1[i]);      /* load two ints from str1 */
        v2 = vld1_s32(&str2[i]);      /* load two ints from str2 */
        v3 = vadd_s32(v1, v2);        /* lane-wise add */
        vst1_s32(&str3[i], v3);       /* store the two sums */
    }

    return 0;
}
于 2013-05-15T12:43:43.253 回答
0

你可以简单地这样做

在函数中声明数组

@ Ten-word array; the label .STR1 marks its first element.
.align
.STR1: .word 1,2,3,4,5,6,7,8,9,10

稍后像这样使用数组:

LDR R0,=.STR1            @ load the start address of the .STR1 array into R0
于 2014-05-20T15:01:44.760 回答