您好,我在 Raspberry Pi 3 上完成了我的第一个汇编程序。我想请教如何改进这个算法。它的功能是:对一个 320x240 的 uint8_t 数组中的每个点进行分析,并为每个点生成两个位掩码。位掩码是通过把中心点与半径为 3 的圆周上的 16 个像素逐一比较得到的。如果圆周上的像素小于中心值减去阈值,则 regLO 掩码的当前位为 1,否则为 0;如果圆周上的像素大于中心值加上阈值,则 regHI 掩码的当前位为 1,否则为 0。每次比较后,regHI 和 regLO 都左移一位。这样最终得到两个位掩码,分别标记圆周上比中心更暗和更亮的像素。该算法是 FAST-9 算法的基础。
[编辑]:我知道 C/C++ 代码的性能与我的汇编代码相近(实际上 C++ 版本耗时 19 毫秒,汇编版本耗时 17 毫秒),但我正在学习汇编。我也知道 SIMD 更快,但我想先学好基本的汇编。
[EDIT2]:添加了 c++ 和 SIMD 实现
#include <iostream>
#include <stdint.h>
#include <chrono>
#include <ctime>
using namespace std;
#define HT 240
#define WT 320
#define WTHT 76800
#define WT3 960
typedef std::chrono::high_resolution_clock clock2;
typedef std::chrono::microseconds res;
// Tests one pixel of the radius-3 circle against both thresholds.
// Register contract inside the asm block: r0 = lower bound, r2 = upper bound,
// r1 = scratch, %[in] = address of the top-left corner of the 7x7 window.
// Adds 1 to %[out] when the pixel is below r0 and 1 to %[outhi] when it is
// above r2. The bounds are set up so that r0 <= r2, therefore at most one of
// the two conditional adds can fire and no branch (and no label) is needed.
// `offset` must be a plain integer literal because it is stringified.
#define FAST_TEST_PIXEL(offset)              \
    "ldrb  r1, [%[in], #" #offset "]\n\t"    \
    "cmp   r1, r0\n\t"                       \
    "it    lo\n\t"                           \
    "addlo %[out], %[out], #1\n\t"           \
    "cmp   r1, r2\n\t"                       \
    "it    hi\n\t"                           \
    "addhi %[outhi], %[outhi], #1\n\t"

// Makes room for the next pixel's bit in both masks.
#define FAST_SHIFT_MASKS                     \
    "lsl   %[out], %[out], #1\n\t"           \
    "lsl   %[outhi], %[outhi], #1\n\t"

// Benchmarks the inline-assembly circle test over every interior pixel of a
// WT x HT image and prints the elapsed time in microseconds.
//
// For each centre pixel two 16-bit masks are built (first circle pixel in
// bit 15, last in bit 0): resultlo marks circle pixels below centre-8,
// resulthi marks circle pixels above centre+8.
int main() {
    // The byte offsets inside the asm template are literals (row * 320 + col).
    static_assert(WT == 320 && HT == 240 && WTHT == WT * HT && WT3 == 3 * WT,
                  "asm byte offsets assume a 320x240 image");

    clock2::time_point t1, t2;
    volatile uint8_t arr[WTHT];
    for (int i = 0; i < WT * HT; i++) {
        arr[i] = 9;
    }
    // Seed the window whose top-left corner is arr[0]: two circle pixels and
    // the centre at row 3, column 3.
    arr[3] = 7;
    arr[4] = 10;
    arr[3 + 3 * WT] = 17;

    t1 = clock2::now();
    for (int iy = WT3; iy < (WTHT)-WT3; iy += WT) {
        // iy is the offset of the centre row. The asm addresses the window
        // relative to its top-left corner, which lies 3 rows above and 3
        // columns left of the centre. Starting at arr[iy - WT3] for ix == 3
        // keeps every access (up to +1924) inside arr.
        const volatile uint8_t *pnt = &arr[iy - WT3];
        for (int ix = 3; ix < WT - 3; ++ix, ++pnt) {
            uint32_t resultlo = 0;
            uint32_t resulthi = 0;
            asm volatile(
                // r0 = centre value
                "ldrb  r0, [%[in], #963]\n\t"
                // r2 = upper bound (centre + 8); cannot wrap in 32 bits
                "add   r2, r0, #8\n\t"
                // r0 = lower bound (centre - 8), clamped to 0 on borrow so a
                // dark centre does not turn into a huge unsigned threshold
                "subs  r0, r0, #8\n\t"
                "it    lo\n\t"
                "movlo r0, #0\n\t"
                // 16 circle pixels, clockwise starting above the centre
                FAST_TEST_PIXEL(3)    FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(4)    FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(325)  FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(646)  FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(966)  FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1286) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1605) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1924) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1923) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1922) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1601) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(1280) FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(960)  FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(640)  FAST_SHIFT_MASKS
                FAST_TEST_PIXEL(321)  FAST_SHIFT_MASKS
                // last pixel: no shift afterwards
                FAST_TEST_PIXEL(2)
                // "+r": both masks are read and modified by the template.
                : [out] "+r"(resultlo), [outhi] "+r"(resulthi)
                : [in] "r"(pnt)
                // r0-r2 are scratch, cmp/subs change the flags, and the
                // template reads image memory through %[in].
                : "r0", "r1", "r2", "cc", "memory");
        }
    }
    t2 = clock2::now();
    std::cout << "Elapsed time is "
              << std::chrono::duration_cast<res>(t2 - t1).count() << " microseconds.\n";
    return 0;
}

#undef FAST_SHIFT_MASKS
#undef FAST_TEST_PIXEL
[c++]
// Plain C++ version of the circle test.
// result2 collects the "brighter than centre + 8" bits, result3 the "darker
// than centre - 8" bits; both end up as 16-bit masks with the first circle
// pixel in bit 15 and the last in bit 0.
// r1 / r2 sum all masks so the optimizer cannot discard the computation.
uint64_t r1=0;
uint64_t r2=0;
uint32_t result2=0;
uint32_t result3=0;
{
    for (int iy = WT3; iy < ((WTHT) - WT3); iy += WT) {
        // iy is the offset of the centre row; the offsets below are relative
        // to the top-left corner of the 7x7 window, three rows higher.
        // Starting at arr[iy - WT3] keeps the largest offset (6*WT+4) in range.
        pnt = &arr[iy - WT3];
        for (int ix = 3; ix < (WT - 3); ix++) {
            // Centre value and thresholds are held as int so that c-8 and c+8
            // cannot wrap around the way uint8_t arithmetic would.
            const int c = *(pnt + (3 * WT + 3));
            const int l = c - 8;
            const int h = c + 8;
            result2 = 0;
            result3 = 0;

            // Shift first, then OR in the new bit: branch-free, and no stray
            // shift is left over after the 16th pixel.
            auto classify = [&](int offset) {
                const int p = *(pnt + offset);
                result2 = (result2 << 1) | static_cast<uint32_t>(p > h);
                result3 = (result3 << 1) | static_cast<uint32_t>(p < l);
            };

            // Circle of radius 3, clockwise, starting above the centre.
            classify(0 * WT + 3);
            classify(0 * WT + 4);
            classify(1 * WT + 5);
            classify(2 * WT + 6);
            classify(3 * WT + 6);
            classify(4 * WT + 6);
            classify(5 * WT + 5);
            classify(6 * WT + 4);
            classify(6 * WT + 3);
            classify(6 * WT + 2);
            classify(5 * WT + 1);
            classify(4 * WT + 0);
            classify(3 * WT + 0);
            classify(2 * WT + 0);
            classify(1 * WT + 1);
            classify(0 * WT + 2);

            // Move the window one pixel to the right.
            ++pnt;
            // Keep the results observable.
            r1 += result2;
            r2 += result3;
        }
    }
}
[SIMD / NEON 内部函数(intrinsics)]
// NEON version: gathers the 16 circle pixels of one centre into a vector and
// compares all of them against the two thresholds at once.
// NOTE(review): rl / rh are not consumed in this fragment; the caller still
// has to turn the 0x00/0xFF lanes into whatever mask format it needs.
// NOTE(review): gathering through ps1 costs 16 scalar loads plus a store and
// reload per centre pixel; loading 16 neighbouring centres per circle offset
// would avoid that, but is a different data layout and is not attempted here.
for (int iy = WT3; iy < ((WTHT) - WT3); iy += WT) {
    // Top-left corner of the 7x7 window whose centre sits on row iy / WT.
    pnt = &arr[iy - WT3];
    for (int ix = 3; ix < (WT - 3); ++ix) {
        // Centre value.
        const uint8_t c1 = *(pnt + 963);
        // Lower bound, saturated at 0 instead of wrapping for dark centres.
        const uint8_t l1 = c1 < 8 ? static_cast<uint8_t>(0)
                                  : static_cast<uint8_t>(c1 - 8);
        // Upper bound, saturated at 255 instead of wrapping for bright centres.
        const uint8_t h1 = c1 > 247 ? static_cast<uint8_t>(255)
                                    : static_cast<uint8_t>(c1 + 8);
        // Collect the circle pixels, clockwise starting above the centre.
        uint8_t ps1[16] = {*(pnt + 3),    *(pnt + 4),    *(pnt + 325),  *(pnt + 646),
                           *(pnt + 966),  *(pnt + 1286), *(pnt + 1605), *(pnt + 1924),
                           *(pnt + 1923), *(pnt + 1922), *(pnt + 1601), *(pnt + 1280),
                           *(pnt + 960),  *(pnt + 640),  *(pnt + 321),  *(pnt + 2)};
        // Move the gathered pixels into a NEON register.
        const uint8x16_t ring = vld1q_u8(ps1);
        // Broadcast the bounds straight from the scalar values.
        const uint8x16_t hl1 = vdupq_n_u8(h1);
        const uint8x16_t ll1 = vdupq_n_u8(l1);
        // Lane is all ones where the pixel is below the lower bound.
        uint8x16_t rl = vcltq_u8(ring, ll1);
        // Lane is all ones where the pixel is above the upper bound.
        uint8x16_t rh = vcgtq_u8(ring, hl1);
        ++pnt;
    }
}
如果您能指出可以对这段代码做哪些优化以使其运行得更快,那就太好了。
执行时间:汇编 17 毫秒;C/C++(使用 -O2 标志)19 毫秒;SIMD 44 毫秒。