
I am performing complex double-precision matrix-matrix multiplication (ZGEMM) and thought using SSE would help, but it actually slows the code down. I'm wondering whether that is because it is memory bound?

Below is the pseudocode:

To multiply two complex numbers, I use the following method, as suggested by Intel (assuming the real and imaginary parts are stored contiguously):

If M = (a+ib) and IN = (c+id):

M1 = _mm_loaddup_pd(&M[0]);//M1->|a|a|
I1 = _mm_load_pd(&IN[0]);//I1->|d|c|
T1 = _mm_mul_pd(M1,I1);//T1->|a*d|a*c|

M1 = _mm_loaddup_pd(&M[1]);//M1->|b|b|
I1 = _mm_shuffle_pd(I1,I1,1);//I1->|c|d|
I1 = _mm_mul_pd(M1,I1);//I1->|b*c|b*d|

T1 = _mm_addsub_pd(T1,I1); //T1-> |ad+bc|ac-bd|

So T1 stores the result of the complex multiplication.
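
For reference, here is the same sequence wrapped up as a self-contained helper so it compiles on its own (the name complex_mul_pd and its arguments are just placeholders I made up; it needs pmmintrin.h for the SSE3 intrinsics, and IN must be 16-byte aligned for _mm_load_pd):

#include <pmmintrin.h> //SSE3: _mm_loaddup_pd, _mm_addsub_pd (pulls in SSE2 too)

//Multiply M=(a+ib) by IN=(c+id), both stored as {real,imag}.
//Returns |ad+bc|ac-bd|, i.e. the product in the same interleaved layout.
static inline __m128d complex_mul_pd(const double *M, const double *IN)
{
    __m128d M1 = _mm_loaddup_pd(&M[0]); //M1->|a|a|
    __m128d I1 = _mm_load_pd(&IN[0]);   //I1->|d|c|
    __m128d T1 = _mm_mul_pd(M1,I1);     //T1->|a*d|a*c|

    M1 = _mm_loaddup_pd(&M[1]);         //M1->|b|b|
    I1 = _mm_shuffle_pd(I1,I1,1);       //I1->|c|d|
    I1 = _mm_mul_pd(M1,I1);             //I1->|b*c|b*d|

    return _mm_addsub_pd(T1,I1);        //|ad+bc|ac-bd|
}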

Here is the matrix multiplication (C[i][j] += A[i][k]*B[k][j]):

/*Assumes real and imaginary elements are stored contiguously*/
/*Used loop order: ikj for better cache locality and used 2*2 block matrix mult*/
for(i=0;i<N;i+=2){
  for(k=0;k<N;k+=2){
    /*Perform the _mm_loaddup() part here for A[i][k],A[i][k+1],A[i+1][k],A[i+1][k+1] since I'm blocking for a 2*2 matrix mult, i.e. load duplicates of 8 double values into 8 SIMD registers here*/
     A00r = _mm_loaddup_pd(&A[(i*N+k)*2+0]);
     A00i = _mm_loaddup_pd(&A[(i*N+k)*2+1]);
     A01r = _mm_loaddup_pd(&A[(i*N+k)*2+2]);
     A01i = _mm_loaddup_pd(&A[(i*N+k)*2+3]);
     A10r = _mm_loaddup_pd(&A[((i+1)*N+k)*2+0]);
     A10i = _mm_loaddup_pd(&A[((i+1)*N+k)*2+1]);
     A11r = _mm_loaddup_pd(&A[((i+1)*N+k)*2+2]);
     A11i = _mm_loaddup_pd(&A[((i+1)*N+k)*2+3]);
     for(j=0;j<N;j+=2){
       double out[8] = {0,0,0,0,0,0,0,0};
       op00=op01=op10=op11=_mm_setzero_pd();

       B00 = _mm_loadu_pd(&B[(k*N+j)*2]);
       B00_r = _mm_shuffle_pd(B00,B00,1);
       B01 = _mm_loadu_pd(&B[(k*N+j+1)*2]);
       B01_r = _mm_shuffle_pd(B01,B01,1);

       /*Perform A[i][k]*B[k][j], A[i][k]*B[k][j+1], A[i+1][k]*B[k][j], A[i+1][k]*B[k][j+1]  and assign it to op00,op01,op10,op11 respectively -> takes 8 _mm_mul_pd() and 4 _mm_addsub_pd()*/

       T1 = _mm_mul_pd(A00r,B00);
       T2 = _mm_mul_pd(A00i,B00_r);
       op00 = _mm_addsub_pd(T1,T2);

       T1 = _mm_mul_pd(A00r,B01);
       T2 = _mm_mul_pd(A00i,B01_r);
       op01 = _mm_addsub_pd(T1,T2);

       T1 = _mm_mul_pd(A10r,B00);
       T2 = _mm_mul_pd(A10i,B00_r);
       op10 = _mm_addsub_pd(T1,T2);

       T1 = _mm_mul_pd(A10r,B01);
       T2 = _mm_mul_pd(A10i,B01_r);
       op11 = _mm_addsub_pd(T1,T2);

       B10 = _mm_loadu_pd(&B[((k+1)*N+j)*2]);
       B10_r = _mm_shuffle_pd(B10,B10,1);
       B11 = _mm_loadu_pd(&B[((k+1)*N+j+1)*2]);
       B11_r = _mm_shuffle_pd(B11,B11,1);

      /*Perform A[i][k+1]*B[k+1][j],A[i][k+1]*B[k+1][j+1],A[i+1][k+1]*B[k+1][j],A[i+1][k+1]*B[k+1][j+1] and add it to op00,op01,op10,op11 respectively-> takes 8 _mm_mul_pd(), 4 _mm_add_pd(), 4 _mm_addsub_pd()*/

       T1 = _mm_mul_pd(A01r,B10);
       T2 = _mm_mul_pd(A01i,B10_r);
       op00 = _mm_add_pd(op00,_mm_addsub_pd(T1,T2));

       T1 = _mm_mul_pd(A01r,B11);
       T2 = _mm_mul_pd(A01i,B11_r);
       op01 = _mm_add_pd(op01,_mm_addsub_pd(T1,T2));

       T1 = _mm_mul_pd(A11r,B10);
       T2 = _mm_mul_pd(A11i,B10_r);
       op10 = _mm_add_pd(op10,_mm_addsub_pd(T1,T2));

       T1 = _mm_mul_pd(A11r,B11);
       T2 = _mm_mul_pd(A11i,B11_r);
       op11 = _mm_add_pd(op11,_mm_addsub_pd(T1,T2));

      /*Store op00,op01,op10,op11 into out[0],out[2],out[4] and out[6] -> 4 stores*/

       _mm_storeu_pd(&out[0],op00);
       _mm_storeu_pd(&out[2],op01);
       _mm_storeu_pd(&out[4],op10);
       _mm_storeu_pd(&out[6],op11);
      /*Perform the following 8 operations*/
      C[(i*N+j)*2+0] += out[0];
      C[(i*N+j)*2+1] += out[1];
           .
           .
           .
      C[((i+1)*N+j)*2+3] += out[7];
     }
  }
}
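
For completeness, the elided C[...] += out[...] lines just add the four accumulators into the corresponding 2*2 block of C; that step could equally stay in SSE registers, roughly like the sketch below (update_C_2x2 is a name I made up for illustration, and it assumes C uses the same interleaved, row-major layout as A and B):

#include <emmintrin.h> //SSE2

/*Sketch: C[i..i+1][j..j+1] += op00,op01,op10,op11 without the out[] buffer*/
static inline void update_C_2x2(double *C, int N, int i, int j,
                                __m128d op00, __m128d op01,
                                __m128d op10, __m128d op11)
{
    double *c0 = &C[(i*N+j)*2];     //row i  : C[i][j] at c0[0..1], C[i][j+1] at c0[2..3]
    double *c1 = &C[((i+1)*N+j)*2]; //row i+1: C[i+1][j] at c1[0..1], C[i+1][j+1] at c1[2..3]

    _mm_storeu_pd(&c0[0], _mm_add_pd(_mm_loadu_pd(&c0[0]), op00));
    _mm_storeu_pd(&c0[2], _mm_add_pd(_mm_loadu_pd(&c0[2]), op01));
    _mm_storeu_pd(&c1[0], _mm_add_pd(_mm_loadu_pd(&c1[0]), op10));
    _mm_storeu_pd(&c1[2], _mm_add_pd(_mm_loadu_pd(&c1[2]), op11));
}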

The L1 cache is 32KB, so I also used cache blocking (a tile size of 16*16, which makes the working-set size 12KB (3*2^4*2^4*2^3*2)), but it didn't help much. I only get about 50% of the theoretical peak performance. Any pointers on how to improve this?
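
(For concreteness, the blocked version I mean has roughly the loop structure below. This is only a sketch: TILE, zgemm_tiled, and zgemm_kernel_2x2 are placeholder names, the kernel stands for the 2*2 SSE block above, and N is assumed to be a multiple of TILE.)

#define TILE 16 //16*16 complex tile: 3 * 16*16 * 16 bytes = 12KB working set

/*Placeholder for the 2*2 SSE micro-kernel shown above:
  C[i..i+1][j..j+1] += A[i..i+1][k..k+1] * B[k..k+1][j..j+1]*/
void zgemm_kernel_2x2(int N, const double *A, const double *B, double *C,
                      int i, int k, int j);

void zgemm_tiled(int N, const double *A, const double *B, double *C)
{
    for (int ii = 0; ii < N; ii += TILE)
        for (int kk = 0; kk < N; kk += TILE)
            for (int jj = 0; jj < N; jj += TILE) //one 16*16 tile of A, B, C at a time
                for (int i = ii; i < ii + TILE; i += 2)
                    for (int k = kk; k < kk + TILE; k += 2)
                        for (int j = jj; j < jj + TILE; j += 2)
                            zgemm_kernel_2x2(N, A, B, C, i, k, j);
}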

