
I am new to Trilinos. I wrote a program that reads data from a file and constructs a sparse matrix from it. The global ids can be very large, beyond the 32-bit integer range. When I change the ids to smaller values, everything works fine. It also works if I run with a single process, i.e. mpiexec -np 1 ./myprogram. However, with multiple processes it crashes.

The data looks like this: each line in the file represents one row of the matrix. Within each line, the first value is the row id and the second value is the number of columns in that row; after that come several index/value pairs. Example file (small ids):

   2000791 3 2000791 0.5 1000791 0.5 3000791 1.0
   1000791 2 1000791 0.5 2000791 0.5
   3000791 2 3000791 0.5 1000791 0.5
   3000792 2 3000791 0.5 1000791 0.5

Example file (large ids):

   2000000000000791 3 2000000000000791 0.5 1000000000000791 0.5 3000000000000791 1.0
   1000000000000791 2 1000000000000791 0.5 2000000000000791 0.5
   3000000000000791 2 3000000000000791 0.5 1000000000000791 0.5
   3000000000000792 2 3000000000000791 0.5 1000000000000791 0.5

Judging from the gdb output and the Trilinos source code, the error seems to be thrown by Epetra_BlockMap::ConstructAutoUniform: "Error. Not enough space for elements on each processor."

The debug output and the source code of my program are attached below.

#0  0x00007ffff58b55c9 in raise () from /lib64/libc.so.6
#1  0x00007ffff58b6cd8 in abort () from /lib64/libc.so.6
#2  0x00007ffff61b99d5 in __gnu_cxx::__verbose_terminate_handler() ()
   from /lib64/libstdc++.so.6
#3  0x00007ffff61b7946 in ?? () from /lib64/libstdc++.so.6
#4  0x00007ffff61b7973 in std::terminate() () from /lib64/libstdc++.so.6
#5  0x00007ffff61b7b9f in __cxa_throw () from /lib64/libstdc++.so.6
#6  0x00000000004c6d2a in Epetra_BlockMap::ConstructAutoUniform (
    this=this@entry=0x85cf00, 
    NumGlobal_Elements=NumGlobal_Elements@entry=2000000000000002, 
    Element_Size=Element_Size@entry=1, 
    Index_Base=Index_Base@entry=1000000000000791, comm=..., 
    IsLongLong=IsLongLong@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:81
#7  0x00000000004c708e in Epetra_BlockMap::Epetra_BlockMap (this=0x85cf00, 
    NumGlobal_Elements=2000000000000002, Element_Size=1, 
    Index_Base=1000000000000791, comm=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:124
#8  0x0000000000497de9 in Epetra_Map::Epetra_Map (this=0x85cf00, 
    numGlobalElements=<optimized out>, indexBase=<optimized out>, comm=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_Map.cpp:68
#9  0x00000000004c008f in Epetra_BasicDirectory::Generate<long long> (
    this=0x85cea0, Map=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:276
#10 0x00000000004bf4c5 in Epetra_BasicDirectory::Epetra_BasicDirectory (
    this=0x85cea0, Map=..., __in_chrg=<optimized out>, 
    __vtt_parm=<optimized out>)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:121
#11 0x00000000004b1ea1 in Epetra_MpiComm::CreateDirectory (
    this=<optimized out>, map=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_MpiComm.cpp:243
#12 0x00000000004c5fcc in Epetra_BlockMap::RemoteIDList (
    this=this@entry=0x7fffffffddd0, NumIDs=NumIDs@entry=0, GIDList=0x0, 
    PIDList=0x0, LIDList=LIDList@entry=0x0, SizeList=0x0)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:1336
#13 0x00000000004d741b in Epetra_CrsGraph::MakeColMap_LL (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1664
#14 0x00000000004d81c9 in Epetra_CrsGraph::MakeColMap (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1764
#15 0x00000000004d83e7 in Epetra_CrsGraph::MakeIndicesLocal (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1784
#16 0x0000000000462bcb in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, domain_map=..., range_map=..., 
    OptimizeDataStorage=OptimizeDataStorage@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1159
#17 0x0000000000462e81 in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, 
    OptimizeDataStorage=OptimizeDataStorage@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1142
#18 0x000000000045a02c in read_and_construct (part=2, total_parts=4)
    at /home/myusername/mpi/myprogram/main.cpp:99
#19 0x0000000000458905 in main (argc=1, argv=0x7fffffffdfe8)

Program source code:

#include <Epetra_config.h>

#ifdef HAVE_MPI
#  include <mpi.h>
#  include <Epetra_MpiComm.h>
#else
#  include <Epetra_SerialComm.h>
#endif // HAVE_MPI

#include <Epetra_Map.h>
#include <Epetra_CrsMatrix.h>
#include <Epetra_Vector.h>
#include <Epetra_Version.h>
#include <Epetra_DistObject.h>
#include <Epetra_Export.h>
#include <Epetra_Util.h>
#include <unistd.h>

#include <stdexcept>


//std libs
#include <cstdio>
#include <iostream>  // for std::cout / std::endl used below
#include <vector>


using namespace std;

typedef long long global_ordinal_type;



int pid;
int np;

const char *path = "/home/tianxiaochen01/matrix_small.txt";
typedef long long LL;
typedef long long *  T_LLP;


#ifdef HAVE_MPI
    Epetra_MpiComm * comm;
#else
    Epetra_SerialComm* comm;
#endif

// C style
void read_and_construct(int part,int total_parts){
    FILE * matrixfile;
    matrixfile = fopen(path,"r");
    int len = 0;
    long long src;
    vector< T_LLP > arrdst;
    vector< double * > arrvalue;
    vector< LL > myids;
    vector< int > lens;

    while (fscanf(matrixfile,"%lld %d ",&src,&len)  != EOF ){
        T_LLP dsts = new LL [ len ];
        double * values = new double [ len ];
        long long dst;
        double value;
        for (int i=0;i<len;i++){
            fscanf(matrixfile,"%lld %lf",&dst,&value);
            dsts[i] = dst;
            values[i] = value;
        }
        if ( src  % (LL)total_parts == (LL)part  ) {//is my part
            myids.push_back(src);
            lens.push_back(len);
            arrdst.push_back(dsts );
            arrvalue.push_back(values);
        }
        else {
            delete [] dsts;
            delete [] values;
        }
    }

    fclose(matrixfile);

    T_LLP arrmap = new LL [ myids.size() ];
    for (int i=0;i<myids.size();i++){
        arrmap[i] = myids[i];
    }
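    // NumGlobalElements = -1 asks Epetra to compute the global element count;
    // this rank owns the ids collected above, with index base 0.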
    Epetra_Map map((LL)-1, (int)myids.size(), arrmap ,(LL)0, *comm );

    Epetra_Vector v1(map);
    int avg_col_size = 1000;


    Epetra_CrsMatrix M(Copy,map,avg_col_size);
    //insert values into matrix
    for (int i=0;i<myids.size();i++){
        // (long long GlobalRow, int NumEntries, const double *Values, const long long *Indices)
        int e = M.InsertGlobalValues(myids[i],lens[i],arrvalue[i],arrdst[i]);
    }

    try
    {
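        // The backtrace above shows the crash happening inside this
        // FillComplete() call, while Epetra builds the column map.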
        M.FillComplete();
    } catch (const Epetra_Object& ex) {
        cout<<"ERROR"<<endl;
        cout<<ex<<endl;
    }
    cout<<M<<endl;

}

void init(const Epetra_Comm& comm){
    pid = comm.MyPID();
    np = comm.NumProc();

}


int
main (int argc, char *argv[])
{
  using std::cout;
  using std::endl;

#ifdef HAVE_MPI
  MPI_Init (&argc, &argv);
  comm = new Epetra_MpiComm (MPI_COMM_WORLD);
  init(*comm);
#else
  comm = new Epetra_SerialComm;
  pid = 0;
  np = 1;
#endif // HAVE_MPI

    read_and_construct(pid,np);

#ifdef HAVE_MPI
  (void) MPI_Finalize ();
#endif // HAVE_MPI

  return 0;
}

Trilinos version: 12.0. MPI: mpich.


1 Answer


This sounds like two things. First, MPI_INT is a regular C int, which is 32 bits on almost every platform. So if giant identifiers are to be allowed, Trilinos would have to send them with the newer (MPI-2, section 10.2.5) MPI_INT64_T type.
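As a minimal illustration of that type-width point (plain MPI, not Trilinos code; the broadcast is just a stand-in for whatever communication Trilinos does internally), a 64-bit global id has to travel as MPI_INT64_T or MPI_LONG_LONG, because MPI_INT only describes a 32-bit C int:

#include <mpi.h>
#include <cstdint>
#include <cstdio>

int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // A global id from the "large id" file: far outside the 32-bit range.
    int64_t gid = 2000000000000791LL;

    // The MPI datatype must match the 64-bit storage of the id;
    // MPI_INT (a 32-bit C int on most platforms) cannot hold this value.
    MPI_Bcast(&gid, 1, MPI_INT64_T, 0, MPI_COMM_WORLD);

    if (rank != 0)
        printf("rank %d received gid %lld\n", rank, (long long)gid);

    MPI_Finalize();
    return 0;
}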

Second, maybe Trilinos can send these just fine, but when you see "Error. Not enough space for elements on each processor.", it suggests that Trilinos is allocating a dense array to hold these big values and your processes are running out of memory. The single-processor case probably works because there are no intermediate nodes to worry about.
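For what it's worth, the numbers in the backtrace are consistent with that reading: the map being built in frame #6 spans the whole gap between the smallest and largest global id in the file. A quick back-of-the-envelope check (my own sketch, using the ids from the question's "large id" example):

#include <cstdio>

int main() {
    // Smallest and largest global ids in the "large id" example file.
    long long min_gid = 1000000000000791LL;
    long long max_gid = 3000000000000792LL;

    // A structure laid out densely over [min_gid, max_gid] needs this many
    // slots -- the same NumGlobal_Elements (2000000000000002) and
    // Index_Base (1000000000000791) that appear in frame #6 of the backtrace.
    long long span = max_gid - min_gid + 1;
    printf("span = %lld\n", span);  // prints 2000000000000002
    return 0;
}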

Answered 2015-06-23T13:53:48.833