
I'm struggling to get my code running with MPI. I'm trying to implement matrix multiplication.

My code works like this (a minimal sketch of this flow follows the list):

  1. There are two matrices, A and B

  2. Scatter the rows of A

  3. Broadcast matrix B

  4. Compute

  5. Gather
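
To make the intended flow concrete, here is a minimal sketch of what I'm aiming for, written with the MPI collectives directly (illustration only: it assumes N is divisible by the number of processes, whereas my actual code below scatters with explicit sends from the master):

#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define N 512

float A[N][N], B[N][N], C[N][N];

int main(int argc, char *argv[])
{
    int rank, nprocs, i, j, k;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int rows = N / nprocs; /* rows of A owned by each process */
    float (*local_A)[N] = malloc(sizeof(float) * rows * N);
    float (*local_C)[N] = malloc(sizeof(float) * rows * N);

    /* Step 2: scatter the rows of A (MPI_FLOAT matches the float buffers) */
    MPI_Scatter(A, rows * N, MPI_FLOAT,
                local_A, rows * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* Step 3: broadcast B -- every rank must make this call */
    MPI_Bcast(B, N * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* Step 4: compute the local block of C */
    for (i = 0; i < rows; i++)
        for (j = 0; j < N; j++) {
            local_C[i][j] = 0.0f;
            for (k = 0; k < N; k++)
                local_C[i][j] += local_A[i][k] * B[k][j];
        }

    /* Step 5: gather the partial results back on rank 0 */
    MPI_Gather(local_C, rows * N, MPI_FLOAT,
               C, rows * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    free(local_A);
    free(local_C);
    MPI_Finalize();
    return 0;
}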

I've written the code, but it doesn't run correctly: I'm getting a segmentation fault.

I don't know why this is happening. I've tried tweaking the code a lot, but something always seems to be wrong.

Could someone look over this code and tell me why it isn't working?

I've added comments such as

"Scatter the matrix"

"Gather the answer" and so on... so even if you can just tell me, say, why the scatter part of the program is incorrect, I'd appreciate it!

#define N 512
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include "mpi.h"
void print_results(char *prompt, float result[N][N]);
int main(int argc, char *argv[])
{
int i, j, k;
MPI_Status status;
int process_rank;//rank of a process
int no_of_processes;//no. of processes

int Master_To_Slave = 0;
int Slave_To_Master = 5;

float a[N][N], b[N][N], c[N][N];
char *usage = "Usage: %s file\n";
FILE *fd;
double elapsed_time, start_time, end_time;
struct timeval tv1, tv2;
if (argc < 2) {
fprintf (stderr, usage, argv[0]);
return -1;
}
if ((fd = fopen (argv[1], "r")) == NULL) {
fprintf (stderr, "%s: Cannot open file %s for reading.\n",
argv[0], argv[1]);
fprintf (stderr, usage, argv[0]);
return -1;
}
// Read input from file for matrices a and b.
// The I/O is not timed because this I/O needs
// to be done regardless of whether this program
// is run sequentially on one processor or in
// parallel on many processors. Therefore, it is
// irrelevant when considering speedup.
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
        fscanf (fd, "%f", &a[i][j]);
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
        fscanf (fd, "%f", &b[i][j]);

int num_of_rows_A = N;
int num_of_cols_A = N;

int num_of_rows_B = N;
int num_of_cols_B = N;

int lower_index_of_A;
int upper_index_of_A;

//TODO: Add a barrier prior to the time stamp.
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Barrier(MPI_COMM_WORLD); //Added Barrier


// Take a time stamp
gettimeofday(&tv1, NULL);

//TODO: Scatter the input matrices a and b.


    MPI_Comm_rank(MPI_COMM_WORLD, &process_rank); //get the rank
    MPI_Comm_size(MPI_COMM_WORLD, &no_of_processes); //get number of processes

    if (process_rank == 0) {
        fprintf (stderr, "Main process started");
        fprintf (stderr, "No. of processes %d", no_of_processes);
        fprintf (stderr, "\n\n");
        fprintf (stderr, "\n\n");

        for (i = 1; i < no_of_processes; i++)
        {
            int rows_per_process = num_of_rows_A / (no_of_processes - 1);

            lower_index_of_A = (i - 1) * rows_per_process;
            // fprintf (stderr, "Current lower index of A %d", lower_index_of_A);

            if (i + 1 == no_of_processes && ((num_of_rows_A % (no_of_processes - 1)) != 0))
            {
                upper_index_of_A = num_of_rows_A;
                // fprintf (stderr, "Current upper index of A %d", upper_index_of_A);
            }
            else
            {
                upper_index_of_A = lower_index_of_A + rows_per_process;
                // fprintf (stderr, "Current upper index of A %d", upper_index_of_A);
            }
            fprintf (stderr, "Lower index of A %d", lower_index_of_A);
            fprintf (stderr, "Upper index of A %d", upper_index_of_A);
            fprintf (stderr, "\n\n");

            MPI_Send(&lower_index_of_A, 1, MPI_INT, i, Master_To_Slave, MPI_COMM_WORLD); //send lower index
            MPI_Send(&upper_index_of_A, 1, MPI_INT, i, Master_To_Slave + 1, MPI_COMM_WORLD); //send upper index
            MPI_Send(&a[lower_index_of_A][0], (upper_index_of_A - lower_index_of_A) * num_of_cols_A, MPI_DOUBLE, i, Master_To_Slave + 2, MPI_COMM_WORLD); //send rows of A
            fprintf (stderr, "Scatter done");
        }
        MPI_Bcast(&b, num_of_rows_A * num_of_cols_B, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        fprintf (stderr, "Broadcast done");
    }
    else
    {
        MPI_Recv(&lower_index_of_A, 1, MPI_INT, 0, Master_To_Slave, MPI_COMM_WORLD, &status);

        MPI_Recv(&upper_index_of_A, 1, MPI_INT, 0, Master_To_Slave+1,MPI_COMM_WORLD,&status);

        MPI_Recv(&a[lower_index_of_A], (upper_index_of_A-lower_index_of_A)*num_of_cols_A, MPI_DOUBLE,0, Master_To_Slave+2,MPI_COMM_WORLD, &status);

        //TODO: Add code to implement matrix multiplication (C=AxB) in parallel.


        for( i=lower_index_of_A;i<upper_index_of_A;i++)
        {
            for( j=0;j<num_of_cols_B;j++)
            {
                for( k=0;k<num_of_rows_B;k++)
                {

                    c[i][j]+=(a[i][k]*b[k][j]);

                }
            }
        }

        MPI_Send(&lower_index_of_A, 1, MPI_INT, 0, Slave_To_Master,MPI_COMM_WORLD);
        MPI_Send(&upper_index_of_A, 1, MPI_INT, 0, Slave_To_Master+1,MPI_COMM_WORLD);
        MPI_Send(&c[lower_index_of_A], (upper_index_of_A-lower_index_of_A)*num_of_cols_B, MPI_DOUBLE,0, Slave_To_Master+2,MPI_COMM_WORLD);
    }




//TODO: Gather partial result back to the master process.
    if(process_rank==0)
    {   
        for(i=1;i<no_of_processes;i++)
        {

            MPI_Recv(&lower_index_of_A, 1, MPI_INT, i, Slave_To_Master, MPI_COMM_WORLD, &status);
            //receive upper bound from a slave
            MPI_Recv(&upper_index_of_A, 1, MPI_INT, i, Slave_To_Master + 1, MPI_COMM_WORLD, &status);
            //receive processed data from a slave
            MPI_Recv(&c[lower_index_of_A][0], (upper_index_of_A - lower_index_of_A) * num_of_cols_B, MPI_DOUBLE, i, Slave_To_Master + 2, MPI_COMM_WORLD, &status);
        }

    }




// Take a time stamp. This won't happen until after the master
// process has gathered all the input from the other processes.
gettimeofday(&tv2, NULL);
elapsed_time = (tv2.tv_sec - tv1.tv_sec) +
((tv2.tv_usec - tv1.tv_usec) / 1000000.0);
printf ("elapsed_time=\t%lf (seconds)\n", elapsed_time);
// print results
print_results("C = ", c);
}
void print_results(char *prompt, float result[N][N])
{
int i, j;
printf ("\n\n%s\n", prompt);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(" %.2f", result[i][j]);
}
printf ("\n");
}
printf ("\n\n");
}

1 Answer


Whenever this sort of thing happens, I fire up a debugger.

I always recommend the parallel debugger Allinea DDT, though I'm biased since I'm on the team that develops it, but it helps find exactly this kind of bug. You could also try GDB, but that takes more manual intervention to deal with the multiple processes.

In your case, the code you posted does not segfault with my MPI (Open MPI); it actually hangs at the MPI_Bcast in proc 0, while the rest hang at the MPI_Sends in their branch of the loop. That's because ranks 1 and above never call MPI_Bcast: they need to, in order to match the sender, rank 0, and receive the data.
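
To illustrate the rule with a minimal, self-contained example (hypothetical, not a patch for your program): every rank in the communicator must call MPI_Bcast with matching arguments, and only the root supplies the data.

#include <stdio.h>
#include "mpi.h"

int main(int argc, char *argv[])
{
    int rank, value = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0)
        value = 42; /* only the root sets the data to broadcast */

    /* Every rank calls MPI_Bcast, unconditionally. Hiding this call
       inside an if (rank == 0) branch is exactly the mismatch that
       leaves rank 0 stuck here while the other ranks wait elsewhere. */
    MPI_Bcast(&value, 1, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d sees value %d\n", rank, value);
    MPI_Finalize();
    return 0;
}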

Why not download a debugger and see for yourself? Once you've fixed this broadcast mismatch, the debugger will stop your program the moment you hit the segmentation fault you're looking for and show you where the fault is.

Answered 2013-02-03T22:35:13.540