c - scatterv 后的分段错误

Question

     /**
     * BLOCK_LOW
     * Returns the offset of a local array
     * with regards to block decomposition
     * of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) offset of local array in global array
     */
    #define BLOCK_LOW(id, p, n) ((id)*(n)/(p))

    /**
     * BLOCK_HIGH
     * Returns the index immediately after the
     * end of a local array with regards to
     * block decomposition of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) offset after end of local array
     */
    #define BLOCK_HIGH(id, p, n) (BLOCK_LOW((id)+1, (p), (n)))

    /**
     * BLOCK_SIZE
     * Returns the size of a local array
     * with regards to block decomposition
     * of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) size of local array
     */
    #define BLOCK_SIZE(id, p, n) ((BLOCK_HIGH((id), (p), (n))) - (BLOCK_LOW((id), (p), (n))))

    /**
     * BLOCK_OWNER
     * Returns the rank of the process that
     * handles a certain local array with
     * regards to block decomposition of a
     * global array.
     *
     * @param  (int) index in global array
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) rank of process that handles index
     */
    #define BLOCK_OWNER(i, p, n) (((p)*((i)+1)-1)/(n))



    /*Matricefilenames:
      small matrix A.bin of dimension 100 × 50
      small matrix B.bin of dimension 50 × 100
      large matrix A.bin of dimension 1000 × 500
      large matrix B.bin of dimension 500 × 1000

    An MPI program should be implemented such that it can
    • accept two file names at run-time,
    • let process 0 read the A and B matrices from the two data files,
    • let process 0 distribute the pieces of A and B to all the other processes,
    • involve all the processes to carry out the the chosen parallel algorithm
    for matrix multiplication C = A * B ,
    • let process 0 gather, from all the other processes, the different pieces
    of C ,
    • let process 0 write out the entire C matrix to a data file.
    */


    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    #include "mpi-utils.c"
    void read_matrix_binaryformat (char*, double***, int*, int*);
    void write_matrix_binaryformat (char*, double**, int, int);
    void create_matrix (double***,int,int);
    void matrix_multiplication (double ***, double ***, double ***,int,int, int);

    int main(int argc, char *argv[]) {
        int id,p; // Process rank and total amount of processes
        int rowsA, colsA, rowsB, colsB; // Matrix dimensions
        double **A; // Matrix A
        double **B; // Matrix B
        double **C; // Result matrix C : AB
        int local_rows; // Local row dimension of the matrix A
        double **local_A; // The local A matrix
        double **local_C;  // The local C matrix

        MPI_Init (&argc, &argv);
        MPI_Comm_rank (MPI_COMM_WORLD, &id);
        MPI_Comm_size (MPI_COMM_WORLD, &p);

        if(argc != 3) {
            if(id == 0) {
                printf("Usage:\n>> %s matrix_A matrix_B\n",argv[0]);
            }       
            MPI_Finalize();
            exit(1);
        }

        if (id == 0) {
            read_matrix_binaryformat (argv[1], &A, &rowsA, &colsA);
            read_matrix_binaryformat (argv[2], &B, &rowsB, &colsB);
        }

        if (p == 1) {
            create_matrix(&C,rowsA,colsB);
            matrix_multiplication (&A,&B,&C,rowsA,colsB,colsA);

            char* filename = "matrix_C.bin";
            write_matrix_binaryformat (filename, C, rowsA, colsB);
            free(A);
            free(B);
            free(C);
            MPI_Finalize();
            return 0;
        }


        // For this assignment we have chosen to bcast the whole matrix B:
        MPI_Bcast (&B, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 
        MPI_Bcast (&colsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&colsB, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&rowsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&rowsB, 1, MPI_INT, 0, MPI_COMM_WORLD);

        local_rows = BLOCK_SIZE(id, p, rowsA);


        /*    SCATTER VALUES    */

        int *proc_elements = (int*)malloc(p*sizeof(int)); // amount of elements for each processor
        int *displace = (int*)malloc(p*sizeof(int)); // displacement of elements for each processor
        int i;
        for (i = 0; i<p; i++) {
            proc_elements[i] = BLOCK_SIZE(i, p, rowsA)*colsA;
            displace[i] = BLOCK_LOW(i, p, rowsA)*colsA;
        }

        create_matrix(&local_A,local_rows,colsA);

        MPI_Scatterv(&A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,&local_A[0],
                     local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

        /*    END  SCATTER  VALUES  */  

        create_matrix (&local_C,local_rows,colsB);
        matrix_multiplication (&local_A,&B,&local_C,local_rows,colsB,colsA);

        /*    GATHER VALUES    */

        MPI_Gatherv(&local_C[0], rowsA*colsB, MPI_DOUBLE,&C[0],
              &proc_elements[0],&displace[0],MPI_DOUBLE,0, MPI_COMM_WORLD);

        /*    END  GATHER VALUES  */

        char* filename = "matrix_C.bin";
        write_matrix_binaryformat (filename, C, rowsA, colsB);  

        free (proc_elements);
        free (displace);    
        free (local_A);
        free (local_C);
        free (A);
        free (B);
        free (C);   
        MPI_Finalize ();
        return 0;
    }

    void create_matrix (double ***C,int rows,int cols) {
        *C = (double**)malloc(rows*sizeof(double*));
        (*C)[0] = (double*)malloc(rows*cols*sizeof(double));
        int i;
        for (i=1; i<rows; i++)
            (*C)[i] = (*C)[i-1] + cols;
    }

    void matrix_multiplication (double ***A, double ***B, double ***C, int rowsC,int colsC,int colsA) {
        double sum;
        int i,j,k;
        for (i = 0; i < rowsC; i++) {
            for (j = 0; j < colsC; j++) {
                sum = 0.0;
                for (k = 0; k < colsA; k++) {
                    sum = sum + (*A)[i][k]*(*B)[k][j];
                }
                (*C)[i][j] = sum;
            }
        }
    }

    /* Reads a 2D array from a binary file*/ 
    void read_matrix_binaryformat (char* filename, double*** matrix, int* num_rows, int* num_cols) {
        int i;
        FILE* fp = fopen (filename,"rb");
        fread (num_rows, sizeof(int), 1, fp);
        fread (num_cols, sizeof(int), 1, fp);
        /* storage allocation of the matrix */
        *matrix = (double**)malloc((*num_rows)*sizeof(double*));
        (*matrix)[0] = (double*)malloc((*num_rows)*(*num_cols)*sizeof(double));
        for (i=1; i<(*num_rows); i++)
            (*matrix)[i] = (*matrix)[i-1]+(*num_cols);
        /* read in the entire matrix */
        fread ((*matrix)[0], sizeof(double), (*num_rows)*(*num_cols), fp);
        fclose (fp);
    }

    /* Writes a 2D array in a binary file */
    void write_matrix_binaryformat (char* filename, double** matrix, int num_rows, int num_cols) {
      FILE *fp = fopen (filename,"wb");
      fwrite (&num_rows, sizeof(int), 1, fp);
      fwrite (&num_cols, sizeof(int), 1, fp);
      fwrite (matrix[0], sizeof(double), num_rows*num_cols, fp);
      fclose (fp);
    }

我的任务是对矩阵 A 和 B 进行并行矩阵乘法，并将结果收集到矩阵 C 中。

我通过将矩阵 A 划分为行段来做到这一点，每个进程将使用它的部分来乘以矩阵 B，并从乘法中取回它的部分。然后我将收集过程中的所有部分并将它们放在一起到矩阵 C 中。

我已经发布了一个类似的问题，但是这段代码得到了改进，我已经取得了进步，但是在 scatterv 调用之后我仍然遇到分段错误。

score 3 · Accepted Answer

所以我马上看到了一些问题：

    MPI_Bcast (&B, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

在这里，您传递的不是指向双精度的指针，而是指向指向双精度的指针的指针（B 定义为double **B），并且您告诉 MPI 跟随该指针并从那里发送 1 个双精度。那是行不通的。

你可能认为你在这里完成的是发送指向矩阵的指针，所有任务都可以从中读取数组——这是行不通的。这些进程不共享公共内存空间（这就是 MPI 被称为分布式内存编程的原因），并且指针不会去任何地方。你实际上将不得不发送矩阵的内容，

    MPI_Bcast (&(B[0][0]), rowsB*colsB, MPI_DOUBLE, 0, MPI_COMM_WORLD);

并且您将必须确保其他进程提前为 B 矩阵正确分配了内存。

其他地方也有类似的指针问题：

    MPI_Scatterv(&A[0], ..., &local_A[0]

同样， A 是指向 doubles ( double **A)的指针的指针local_A，并且您需要将 MPI 指向指向 doubles 的指针才能使其工作，例如

    MPI_Scatterv(&(A[0][0]), ..., &(local_A[0][0])

该错误似乎存在于所有通信例程中。

请记住，任何看起来像(buffer, count, TYPE)MPI 的东西都意味着 MPI 例程跟随指针buffer并在那里发送下一个count类型的数据TYPE。MPI 无法跟踪您发送的缓冲区中的指针，因为通常它不知道它们在那里。它只(count * sizeof(TYPE))从指针中获取下一个字节，buffer并与它们进行适当的通信。所以你必须向它传递一个指向 TYPE 类型数据流的指针。

说了这么多，如果你把事情缩小一点，和你一起工作会容易得多；现在您发布的程序包含许多无关紧要的 I/O 内容，这意味着没有人可以在不先弄清楚矩阵格式然后自己生成两个矩阵的情况下运行您的程序来查看会发生什么。在发布有关源代码的问题时，您真的想发布（a）一小部分源代码，它（b）重现问题并且（c）完全独立。

score 2 · Accepted Answer

考虑这是一个扩展评论，因为 Jonathan Dursi 已经给出了相当详尽的答案。您的矩阵确实以一种奇怪的方式表示，但至少您遵循了针对其他问题的建议，并将它们作为连续块分配空间，而不是为每一行单独分配空间。

鉴于此，您应该替换：

MPI_Scatterv(&A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,&local_A[0],
             local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

和

MPI_Scatterv(A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,local_A[0],
             local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

A[0]已经指向矩阵数据的开头，不需要指向它。这同样适用local_A[0]于调用的参数MPI_Gatherv()。

已经说过很多次了——MPI 不进行指针追踪，只适用于平面缓冲区。

我还注意到您的代码中的另一个错误 - 未正确释放矩阵的内存。您只是释放指针数组而不是矩阵数据本身：

free(A);

真的应该变成

free(A[0]); free(A);

c - scatterv 后的分段错误

2 回答 2

Related

Reference