2

我正在努力确定为什么我的 Strassen 实施如此缓慢。它为每次迭代分配内存,但我会酌情释放它。

int** multiply(int** a, int** b, int size)
{
int row,col,i,j;

if(size == 1)
{
    int** c = allocate(size);
    c[0][0] = (a[0][0] * b[0][0])%2;
    return c;
}

if(size <= 2)
{
    int a11,a12,a21,a22,b11,b12,b21,b22;    
    int** c = allocate(size);
    a11 = a[0][0];
    a12 = a[0][1];
    a21 = a[1][0];
    a22 = a[1][1];
    b11 = b[0][0];
    b12 = b[0][1];
    b21 = b[1][0];
    b22 = b[1][1];

    c[0][0] = (a11*b11 + a12*b21)%2;
        c[0][1] = (a11*b12 + a12*b22)%2;
        c[1][0] = (a21*b11 + a22*b21)%2;
    c[1][1] = (a21*b12 + a22*b22)%2;
        return c;
}

int** c = allocate(size);
size = size/2;

int** A11 = allocate(size);
int** A12 = allocate(size);
int** A21 = allocate(size);
int** A22 = allocate(size);
int** B11 = allocate(size);
int** B12 = allocate(size);
int** B21 = allocate(size);
int** B22 = allocate(size);

for(i=0;i<size;i++)
{
    for(j=0;j<size;j++)
    {
        A11[i][j] = a[i][j];    
        A12[i][j] = a[i][j+size];
        A21[i][j] = a[i+size][j];
        A22[i][j] = a[i + size][j + size];
        B11[i][j] = b[i][j];
        B12[i][j] = b[i][j + size];
        B21[i][j] = b[i + size][j];
        B22[i][j] = b[i + size][j + size];
    }
}

int** S1 = subtract(B12,B22,size);
int** S2 = add(A11,A12, size);
int** S3 = add(A21,A22, size);
int** S4 = subtract(B21,B11, size);
int** S5 = add(A11,A22, size);
int** S6 = add(B11,B22, size);
int** S7 = subtract(A12,A22, size);
int** S8 = add(B21,B22, size);
int** S9 = subtract(A11,A21, size);
int** S10 = add(B11,B12, size);

int** P1 = multiply(A11, S1, size);
int** P2 = multiply(S2, B22, size);
int** P3 = multiply(S3, B11, size);
int** P4 = multiply(A22, S4, size);
int** P5 = multiply(S5, S6, size);
int** P6 = multiply(S7, S8, size);
int** P7 = multiply(S9, S10,size);

int** c11 = subtract(add(P5,P4,size), add(P2,P6,size), size);
int** c12 = add(P1,P2,size);
int** c21 = add(P3,P4,size);
int** c22 = subtract(add(P5,P1,size), subtract(P3,P7,size), size);

int** temp = add(P5,P4,size);
int** temp2 = add(P2,P6, size);

for(i=0; i< size; i++)
{
    for(j=0; j< size; j++)
    {
        c[i][j] = abs(c11[i][j] % 2);           
        c[i][j+size] = abs(c12[i][j] % 2);
        c[i+size][j] = abs(c21[i][j] % 2);
        c[i+size][j+size] = abs(c22[i][j] % 2);
    }
}

deallocate(A11, size);
deallocate(A12, size);
deallocate(A21, size);
deallocate(A22, size);
deallocate(B11, size);
deallocate(B12, size);
deallocate(B21, size);
deallocate(B22, size);
deallocate(c11, size);
deallocate(c12, size);
deallocate(c21, size);
deallocate(c22, size);
deallocate(P1, size);
deallocate(P2, size);
deallocate(P3, size);
deallocate(P4, size);
deallocate(P5, size);
deallocate(P6, size);
deallocate(P7, size);
deallocate(S1, size);
deallocate(S2, size);
deallocate(S3, size);
deallocate(S4, size);
deallocate(S5, size);
deallocate(S6, size);
deallocate(S7, size);
deallocate(S8, size);
deallocate(S9, size);
deallocate(S10, size);
deallocate(temp, size);
deallocate(temp2, size);
return c;
}
4

1 回答 1

6

由于您的所有分配都是固定大小的,因此请为所有缓冲区分配一个足够大的内存块,然后为每个变量在该缓冲区中使用适当的偏移量。最后,您只释放一个缓冲区。

我不确定所有内存分配/释放是导致此功能运行缓慢的原因,但这是一个不错的选择。您可能希望对其进行概要分析以确保确定。

于 2013-04-29T17:42:22.133 回答