我正在使用以下程序来检查 PLD(预取)指令对性能的影响。
但是,在我编写的这段 C 代码中,我没有发现使用 PLD 与不使用 PLD 之间有任何性能差异。
是我遗漏了什么,还是需要添加某些编译器选项?
/*
 * Element counts for the globally defined, zero-initialised test buffers.
 * Byte sizes quoted below assume sizeof(int) == 4:
 *   6144 elements -> 24576 bytes (24 KiB)
 *   2048 elements ->  8192 bytes ( 8 KiB)
 * (Earlier annotations of "15kb" and "5kb" did not match these lengths.)
 */
enum {
    LARGE_ARRAY_LEN = 6144,
    SMALL_ARRAY_LEN = 2048
};

/* Large buffers: 24 KiB each with 4-byte int. */
int arra[LARGE_ARRAY_LEN] = {0};
int arrb[LARGE_ARRAY_LEN] = {0};
int arrc[LARGE_ARRAY_LEN] = {0};

/*
 * Small buffers: 8 KiB each with 4-byte int.
 * They keep external linkage because they are referenced by symbol name.
 */
int arrd[SMALL_ARRAY_LEN] = {0};
int arre[SMALL_ARRAY_LEN] = {0};
int arrf[SMALL_ARRAY_LEN] = {0};
int arrg[SMALL_ARRAY_LEN] = {0};
int arrh[SMALL_ARRAY_LEN] = {0};
int arri[SMALL_ARRAY_LEN] = {0};
int arrj[SMALL_ARRAY_LEN] = {0};
int arrk[SMALL_ARRAY_LEN] = {0};
int arrl[SMALL_ARRAY_LEN] = {0};
int main()
{
int csize;
int i,z = 3;
int loop_i;
int32x4_t viarrd,viarre,viarrf;
int32x4_t viarrg,viarrh,viarri;
int32x4_t viarrj,viarrk,viarrl;
asm("LDR r1, =arrd");
asm("LDR r2, =arre");
asm("LDR r3, =arrf");
asm("LDR r4, =arrg");
asm("LDR r5, =arrh");
asm ("PLD [r1]");
asm ("PLD [r2]");
asm ("PLD [r3]");
asm ("PLD [r4]");
asm ("PLD [r5]");
for(loop_i=0;loop_i<100;loop_i++)
{
for(i=0;i<2048;i++)
{
arrd[i] = 5;
arre[i] = 5;
arrf[i] = 5;
arrg[i] = 5;
arrh[i] = 5;
arri[i] = 5;
arrj[i] = 5;
arrk[i] = 5;
arrl[i] = 5;
}
for(i=0;i<2048;i+=4)
{
viarrf = vld1q_s32(&arrf[i]);
viarre = vld1q_s32(&arre[i]);
viarrd = vmulq_s32(viarrf,viarre);
vst1q_s32(&arrd[i],viarrd);
}
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarrh = vld1q_s32(&arrh[i]);
viarri = vmulq_s32(viarrg,viarrh);
vst1q_s32(&arri[i],viarri);
}
for(i=0;i<2048;i+=4)
{
viarrj = vld1q_s32(&arrj[i]);
viarrk = vld1q_s32(&arrk[i]);
viarrl = vmulq_s32(viarrj,viarrk);
vst1q_s32(&arrl[i],viarrl);
}
for(i=0;i<2048;i+=4)
{
viarrd = vld1q_s32(&arrd[i]);
viarrf = vld1q_s32(&arrf[i]);
viarre = vmulq_s32(viarrd,viarrf);
vst1q_s32(&arre[i],viarre);
}
for(i=0;i<2048;i+=4)
{
viarrg = vld1q_s32(&arrg[i]);
viarri = vld1q_s32(&arri[i]);
viarrh = vmulq_s32(viarrg,viarri);
vst1q_s32(&arrh[i],viarrh);
}
}