1

我编写了一个计算密集型程序,并用 cachegrind 对其进行了分析,结果表明数据缓存未命中可能是主要瓶颈。我调整了程序的缓冲区大小(把每个计算单元的大小减小到 CPU 缓存大小以内),缓存未命中率减少了一半,但 D1 未命中率仍有约 5%。还有什么可以优化的吗?

调整前(下面第一段输出,进程号 2729;其后第二段输出,进程号 6996,为调整后的结果)

==2729== 
==2729== I   refs:      20,104,725,617
==2729== I1  misses:             1,899
==2729== LLi misses:             1,835
==2729== I1  miss rate:           0.00%
==2729== LLi miss rate:           0.00%
==2729== 
==2729== D   refs:       6,754,924,837  (3,968,410,253 rd   + 2,786,514,584 wr)
==2729== D1  misses:       697,140,469  (  549,560,569 rd   +   147,579,900 wr)
==2729== LLd misses:        74,594,136  (   74,241,144 rd   +       352,992 wr)
==2729== D1  miss rate:           10.3% (         13.8%     +           5.2%  )
==2729== LLd miss rate:            1.1% (          1.8%     +           0.0%  )
==2729== 
==2729== LL refs:          697,142,368  (  549,562,468 rd   +   147,579,900 wr)
==2729== LL misses:         74,595,971  (   74,242,979 rd   +       352,992 wr)
==2729== LL miss rate:             0.2% (          0.3%     +           0.0%  )

==6996== I   refs:      645,316,413
==6996== I1  misses:          1,884
==6996== LLi misses:          1,628
==6996== I1  miss rate:        0.00%
==6996== LLi miss rate:        0.00%
==6996== 
==6996== D   refs:      215,556,739  (127,281,049 rd   + 88,275,690 wr)
==6996== D1  misses:      9,460,159  (  6,718,647 rd   +  2,741,512 wr)
==6996== LLd misses:         20,887  (      6,607 rd   +     14,280 wr)
==6996== D1  miss rate:         4.3% (        5.2%     +        3.1%  )
==6996== LLd miss rate:         0.0% (        0.0%     +        0.0%  )
==6996== 
==6996== LL refs:         9,462,043  (  6,720,531 rd   +  2,741,512 wr)
==6996== LL misses:          22,515  (      8,235 rd   +     14,280 wr)
==6996== LL miss rate:          0.0% (        0.0%     +        0.0%  )

按 Mysticial 的建议,附上最耗时的函数。PS:galois_region_xor 对两个内存区域做异或(XOR),galois_w08_region_multby_2_64 做类似的区域运算;它们都很耗时,但确实已经过优化。

/**
 * Encode `in_buff` into `out_buff` using the coefficient matrix `mat`.
 *
 * The input is treated as k = mat.cc consecutive stripes of
 * size_in_buff / k bytes each; the output as m = mat.rr stripes of the same
 * size. For every (row i, column j) the input stripe j is scaled by the
 * coefficient mat.ele[i*k + j] and XOR-accumulated into output stripe i.
 * Scaling is done bit-by-bit (Horner style): walking the coefficient from
 * bit (w-1) down to bit 0, the scratch stripe is XORed with the source for
 * each set bit and passed through galois_w08_region_multby_2_64 between bits.
 *
 * NOTE(review): galois_region_xor(a, b, dst, n) is presumed to compute
 * dst = a ^ b over n bytes, and galois_w08_region_multby_2_64(buf, n) to
 * multiply every element of buf by 2 in place -- confirm against their
 * definitions.
 *
 * @param mat           coefficient matrix; reads mat.rr, mat.cc and mat.ele.
 * @param w             number of coefficient bits to scan; must satisfy
 *                      1 <= w <= bit width of unsigned int.
 * @param out_buff      destination; its first size_in_buff*m/k bytes are
 *                      zeroed and then accumulated into.
 * @param in_buff       source data, size_in_buff bytes.
 * @param size_in_buff  size of in_buff in bytes.
 *
 * Does nothing when the matrix has no rows or no columns. Aborts the process
 * if the scratch stripe cannot be allocated, because the function has no
 * error channel and continuing would produce silently wrong output.
 */
void shift_coding(  const GMatrixU8& mat,
             const int w,
             unsigned char * const out_buff,
             unsigned char * const in_buff,
             const unsigned long& size_in_buff ){

    const int k = mat.cc;
    const int m = mat.rr;

    // An empty matrix means nothing to encode; this also avoids dividing by
    // zero below and a wrapped-around memset length for a negative m.
    if (k <= 0 || m <= 0) {
        return;
    }

    const unsigned long size_comp_buff = size_in_buff / k;

    memset(out_buff, 0, size_in_buff*m/k);

    // With zero-length stripes every region operation below would be empty,
    // and malloc(0) may legitimately return NULL, so stop here.
    if (0 == size_comp_buff) {
        return;
    }

    // Scratch stripe holding (coefficient * source stripe) while it is built.
    unsigned char * const pcom = (unsigned char *)malloc(size_comp_buff);
    if (NULL == pcom) {
        abort();
    }

    // Highest coefficient bit to examine (unsigned shift: no signed overflow).
    const unsigned int top_mask = 1u << (w - 1);

    for (int j = 0; j < k; ++j) {
        unsigned char * const psrc = in_buff + j*size_comp_buff;

        for (int i = 0; i < m; ++i) {
            unsigned char * const pdes = out_buff + i*size_comp_buff;
            const unsigned int val = mat.ele[i*k + j];

            // Coefficient 0 contributes nothing.
            if (0 == val) {
                continue;
            }
            // Coefficient 1 is a plain XOR of the source into the output.
            if (1 == val) {
                galois_region_xor(psrc, pdes, pdes, size_comp_buff);
                continue;
            }

            // Only the general case uses the scratch stripe, so it is cleared
            // here rather than before the shortcuts above; this avoids
            // streaming a whole stripe through the cache for 0/1 entries.
            memset(pcom, 0, size_comp_buff);
            bool start_2 = false;   // true once the first set bit has been seen

            for (unsigned int mask = top_mask; 0 < mask; mask >>= 1) {
                if (mask & 1) {
                    // Lowest bit: add the source once more, no doubling after.
                    if (val & 1) {
                        galois_region_xor(psrc, pcom, pcom, size_comp_buff);
                    }
                    continue;
                }
                if (0 != (val & mask)) {
                    start_2 = true;
                    galois_region_xor(psrc, pcom, pcom, size_comp_buff);
                    galois_w08_region_multby_2_64(pcom, size_comp_buff);
                } else if (start_2) {
                    // Leading zero bits are skipped: the scratch stripe is
                    // still all zero, so doubling it would be wasted work.
                    galois_w08_region_multby_2_64(pcom, size_comp_buff);
                }
            }

            galois_region_xor(pcom, pdes, pdes, size_comp_buff);
        }
    }

    free(pcom);
}

4

0 回答 0