我想将下面这个 C 程序转换为 ARM NEON 汇编代码:
/* Adds two ten-element int arrays element by element into a third array. */
int main()
{
    int str1[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int str2[] = { 11, 12, 3, 4, 8, 1, 4, 5, 8, 3 };
    int str3[10];
    int i = 0;

    /* str3[k] = str1[k] + str2[k] for k = 0..9 */
    while (i < 10) {
        str3[i] = str2[i] + str1[i];
        i++;
    }
}
谢谢。
让我们“作弊”一下:在 gcc 中使用 <arm_neon.h>,
看看 gcc 会生成什么代码(neon-add.c):
#include <assert.h>
#include <stdio.h>
#include <arm_neon.h>
#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
/*
 * Prints the first ten elements of r, one "index: value" pair per line.
 * Marked noinline/noclone so the compiler keeps it as a separate function.
 */
__attribute__((noinline, noclone))
static void print(int r[])
{
    int idx = 0;

    while (idx < 10) {
        printf("%d: %d\n", idx, r[idx]);
        ++idx;
    }
}
/*
 * Adds str1 and str2 element-wise with 64-bit NEON intrinsics, handling
 * WIDTH ints per iteration, then prints the sums with print().
 *
 * Returns 0.
 */
int main()
{
    int str1[]={1,2,3,4,5,6,7,8,9,10};
    int str2[]={11,12,3,4,8,1,4,5,8,3};
    int result[10];
    enum {
        WIDTH = 2,  /* number of int lanes in one int32x2_t */
    };
    size_t i;       /* size_t so the comparison with ARRAY_LEN is unsigned vs unsigned */
    int *s1, *s2, *res;

    /* The loop has no scalar tail and walks all three arrays in lockstep,
       so the lengths must match and be a multiple of WIDTH. */
    assert(ARRAY_LEN(str1) % WIDTH == 0);
    assert(ARRAY_LEN(str1) == ARRAY_LEN(str2));
    assert(ARRAY_LEN(str1) == ARRAY_LEN(result));

    for (i = 0, s1 = str1, s2 = str2, res = result; i < ARRAY_LEN(str1);
         i += WIDTH, s1 += WIDTH, s2 += WIDTH, res += WIDTH) {
        int32x2_t t1, t2;

        t1 = vld1_s32(s1);      /* load WIDTH ints from str1 */
        t2 = vld1_s32(s2);      /* load WIDTH ints from str2 */
        t2 = vadd_s32(t2, t1);  /* lane-wise add */
        vst1_s32(res, t2);      /* store WIDTH sums into result */
    }
    print(result);
    return 0;
}
让 gcc 输出汇编代码:
$ arm-linux-gnueabihf-gcc -Os -mfpu=neon -S neon-add.c
你会得到统一语法(unified syntax)的 ARM 汇编代码:
.syntax unified
.arch armv7-a
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 4
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.thumb
.file "neon-add.c"
.text
.align 1
.thumb
.thumb_func
.type print, %function
@ print: compiled form of the C print() helper. r0 holds the array pointer
@ on entry; the loop calls printf once for each index 0..9.
print:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r3, r4, r5, lr}
@ r5 = array pointer (copied from r0), r4 = loop index starting at 0
mov r5, r0
movs r4, #0
.L2:
@ printf arguments: r1 = index, r2 = word at r5 + index*4,
@ r0 = address of .LC2 loaded from the literal at .L4
mov r1, r4
ldr r2, [r5, r4, lsl #2]
ldr r0, .L4
@ the index is incremented before the call; r4 is restored by the pop below
adds r4, r4, #1
bl printf
@ loop until the index reaches 10
cmp r4, #10
bne .L2
@ restore registers and return by popping the saved lr into pc
pop {r3, r4, r5, pc}
.L5:
.align 2
@ literal pool: address of .LC2
.L4:
.word .LC2
.size print, .-print
.section .text.startup,"ax",%progbits
.align 1
.global main
.thumb
.thumb_func
.type main, %function
@ main: copies the two 40-byte initialiser tables found at .LANCHOR0 onto
@ the stack (sp+0 and sp+40), adds them two 32-bit words at a time with NEON
@ into sp+80, then calls print on sp+80 and returns 0.
main:
@ args = 0, pretend = 0, frame = 120
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, lr}
sub sp, sp, #124
@ r4 = address of .LANCHOR0 (from the literal at .L9), r5 = sp
ldr r4, .L9
mov r5, sp
@ first copy: 16 + 16 + 8 = 40 bytes from .LANCHOR0 to sp+0
ldmia r4!, {r0, r1, r2, r3}
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3}
stmia r5!, {r0, r1, r2, r3}
ldmia r4, {r0, r1}
@ this ldmia had no writeback, so step r4 past the 8 bytes just read
adds r4, r4, #8
stmia r5, {r0, r1}
@ second copy: the next 40 source bytes go to sp+40
add r5, sp, #40
ldmia r4!, {r0, r1, r2, r3}
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3}
stmia r5!, {r0, r1, r2, r3}
@ r3 = byte offset used by the add loop, starting at 0
movs r3, #0
ldmia r4, {r0, r1}
stmia r5, {r0, r1}
@ enter the loop at the address computation
b .L7
.L8:
@ loop body: load 64 bits from [r2] and from [r1], add them as two
@ 32-bit lanes, and store the 64-bit result to [r0]
vld1.32 {d16}, [r2]
vld1.32 {d17}, [r1]
vadd.i32 d16, d17, d16
vst1.32 {d16}, [r0]
.L7:
@ addresses for the current offset: r1 = sp+40+r3, r0 = sp+80+r3, r2 = sp+r3
add r4, sp, #40
add r2, sp, #80
adds r1, r3, r4
add r4, sp, #0
adds r0, r3, r2
adds r2, r3, r4
@ advance the offset by 8 bytes; the body runs for offsets 0, 8, ..., 32
@ (five times) and the loop exits once r3 reaches 48
adds r3, r3, #8
cmp r3, #48
bne .L8
@ print(sp+80), then return 0
add r0, sp, #80
bl print
movs r0, #0
add sp, sp, #124
pop {r4, r5, pc}
.L10:
.align 2
@ literal pool: address of .LANCHOR0
.L9:
.word .LANCHOR0
.section .rodata
.align 2
.LANCHOR0 = . + 0
.LC0:
.word 1
.word 2
.word 3
.word 4
.word 5
.word 6
.word 7
.word 8
.word 9
.word 10
.LC1:
.word 11
.word 12
.word 3
.word 4
.word 8
.word 1
.word 4
.word 5
.word 8
.word 3
.section .rodata.str1.1,"aMS",%progbits,1
.LC2:
.ascii "%d: %d\012\000"
.ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
.section .note.GNU-stack,"",%progbits
长度为 10 的数组对于 SIMD 优化来说并不是一个很合适的大小,但为了说明问题,你可以使用 64 位 NEON 内建函数(intrinsics)这样做:
#include <arm_neon.h>
int main()
{
int str1[]={1,2,3,4,5,6,7,8,9,10};
int str2[]={11,12,3,4,8,1,4,5,8,3};
int str3[10];
int i;
for (i = 0; i < 10; i += 2)
{
int32x2t v1, v2, v3;
v1 = vld1_s32(&str1[i]);
v2 = vld1_s32(&str2[i]);
v3 = vadd_s32(v1, v2);
vst1_s32(&str3[i], v3);
}
return 0;
}
你可以简单地这样做:
在函数中声明数组:
.align
.STR1:
.word 1,2,3,4,5,6,7,8,9,10
稍后像这样使用数组:
LDR R0,=.STR1 @loading the start address of the array into R0