下面的函数几乎与以下函数一样快memcpy
:
/*
* Created on: 2018. 5. 5.
* Author: Jake 'Alquimista' Lee
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix
// void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
pDst .req x0
pRed .req x1
pGrn .req x2
pBlu .req x3
count .req w4
.balign 64
.func
alquimista_ndarray_to_matrix:
mov x16, #(640*360) & 0xffff
str q8, [sp, #-16]!
movk x16, #((640*360)>>16), lsl #16
mov count, #(640*360)/128
add pGrn, pRed, x16
add pBlu, pRed, x16, lsl #1
b 1f
.balign 64
1:
ldp q0, q3, [pRed], #32
ldp q1, q4, [pGrn], #32
ldp q2, q5, [pBlu], #32
ldp q6, q16, [pRed], #32
ldp q7, q17, [pGrn], #32
ldp q8, q18, [pBlu], #32
ldp q19, q22, [pRed], #32
ldp q20, q23, [pGrn], #32
ldp q21, q24, [pBlu], #32
ldp q25, q28, [pRed], #32
ldp q26, q29, [pGrn], #32
ldp q27, q30, [pBlu], #32
subs count, count, #1
st3 {v0.16b, v1.16b, v2.16b}, [pDst], #48
st3 {v3.16b, v4.16b, v5.16b}, [pDst], #48
st3 {v6.16b, v7.16b, v8.16b}, [pDst], #48
st3 {v16.16b, v17.16b, v18.16b}, [pDst], #48
st3 {v19.16b, v20.16b, v21.16b}, [pDst], #48
st3 {v22.16b, v23.16b, v24.16b}, [pDst], #48
st3 {v25.16b, v26.16b, v27.16b}, [pDst], #48
st3 {v28.16b, v29.16b, v30.16b}, [pDst], #48
b.gt 1b
.balign 8
ldr q8, [sp], #16
ret
.endfunc
.end
为了获得最高性能和最低功耗,您可能希望将源指针对齐为 32 字节,将目标指针对齐为 16 字节。
函数原型为:
void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
下面是进行动态转换的函数float
。
我添加了批号作为参数,这样您就不必为每个图像进行函数调用。
/*
* Created on: 2018. 5. 5.
* Copyright: Jake 'Alquimista' Lee. All rights reserved
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix_float
// void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
pDst .req x0
pRed .req x1
batch .req w2
pGrn .req x3
pBlu .req x4
stride .req x5
count .req w7
.balign 64
.func
alquimista_ndarray_to_matrix_float:
mov stride, #((640*360)<<1) & 0xffff
stp q8, q15, [sp, #-32]!
movk stride, #((640*360)>>15), lsl #16
mov count, #(640*360)/32
add pGrn, pRed, stride, lsr #1
add pBlu, pRed, stride
b 1f
.balign 64
1:
ldp q0, q1, [pRed], #32
ldp q2, q3, [pGrn], #32
ldp q4, q5, [pBlu], #32
subs count, count, #1
ushll v20.8h, v0.8b, #7
ushll2 v23.8h, v0.16b, #7
ushll v26.8h, v1.8b, #7
ushll2 v29.8h, v1.16b, #7
ushll v21.8h, v2.8b, #7
ushll2 v24.8h, v2.16b, #7
ushll v27.8h, v3.8b, #7
ushll2 v30.8h, v3.16b, #7
ushll v22.8h, v4.8b, #7
ushll2 v25.8h, v4.16b, #7
ushll v28.8h, v5.8b, #7
ushll2 v31.8h, v5.16b, #7
ursra v20.8h, v20.8h, #8
ursra v21.8h, v21.8h, #8
ursra v22.8h, v22.8h, #8
ursra v23.8h, v23.8h, #8
ursra v24.8h, v24.8h, #8
ursra v25.8h, v25.8h, #8
ursra v26.8h, v26.8h, #8
ursra v27.8h, v27.8h, #8
ursra v28.8h, v28.8h, #8
ursra v29.8h, v29.8h, #8
ursra v30.8h, v30.8h, #8
ursra v31.8h, v31.8h, #8
uxtl v0.4s, v20.4h
uxtl v1.4s, v21.4h
uxtl v2.4s, v22.4h
uxtl2 v3.4s, v20.8h
uxtl2 v4.4s, v21.8h
uxtl2 v5.4s, v22.8h
uxtl v6.4s, v23.4h
uxtl v7.4s, v24.4h
uxtl v8.4s, v25.4h
uxtl2 v15.4s, v23.8h
uxtl2 v16.4s, v24.8h
uxtl2 v17.4s, v25.8h
uxtl v18.4s, v26.4h
uxtl v19.4s, v27.4h
uxtl v20.4s, v28.4h
uxtl2 v21.4s, v26.8h
uxtl2 v22.4s, v27.8h
uxtl2 v23.4s, v28.8h
uxtl v24.4s, v29.4h
uxtl v25.4s, v30.4h
uxtl v26.4s, v31.4h
uxtl2 v27.4s, v29.8h
uxtl2 v28.4s, v30.8h
uxtl2 v29.4s, v31.8h
ucvtf v0.4s, v0.4s, #15
ucvtf v1.4s, v1.4s, #15
ucvtf v2.4s, v2.4s, #15
ucvtf v3.4s, v3.4s, #15
ucvtf v4.4s, v4.4s, #15
ucvtf v5.4s, v5.4s, #15
ucvtf v6.4s, v6.4s, #15
ucvtf v7.4s, v7.4s, #15
ucvtf v8.4s, v8.4s, #15
ucvtf v15.4s, v15.4s, #15
ucvtf v16.4s, v16.4s, #15
ucvtf v17.4s, v17.4s, #15
ucvtf v18.4s, v18.4s, #15
ucvtf v19.4s, v19.4s, #15
ucvtf v20.4s, v20.4s, #15
ucvtf v21.4s, v21.4s, #15
ucvtf v22.4s, v22.4s, #15
ucvtf v23.4s, v23.4s, #15
ucvtf v24.4s, v24.4s, #15
ucvtf v25.4s, v25.4s, #15
ucvtf v26.4s, v26.4s, #15
ucvtf v27.4s, v27.4s, #15
ucvtf v28.4s, v28.4s, #15
ucvtf v29.4s, v29.4s, #15
st3 {v0.4s - v2.4s}, [pDst], #48
st3 {v3.4s - v5.4s}, [pDst], #48
st3 {v6.4s - v8.4s}, [pDst], #48
st3 {v15.4s - v17.4s}, [pDst], #48
st3 {v18.4s - v20.4s}, [pDst], #48
st3 {v21.4s - v23.4s}, [pDst], #48
st3 {v24.4s - v26.4s}, [pDst], #48
st3 {v27.4s - v29.4s}, [pDst], #48
b.gt 1b
add pRed, pRed, stride
add pGrn, pGrn, stride
add pGrn, pGrn, stride
subs batch, batch, #1
mov count, #(640*360)/32
b.gt 1b
.balign 8
ldp q8, q15, [sp], #32
ret
.endfunc
.end
这是一个相当长的一个,它会比上面的一个要长得多uint8
。
请注意,它将非常好地扩展到多核执行。
函数原型为:
void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);