我的应用程序与 LAM/MPI 一起工作,但它与 OpenMPI 一起崩溃。
下面是我的代码的外观。
void Comm::nonblocking_send( int s_idx , int e_idx )
{
MPI_Wait(&mpireq,&mpistat);
buffer.clear();
list<class vertex*>::iterator vit;
for( vit=our_dag->cur_block_intmeds.begin() ; vit!=our_dag->cur_block_intmeds.end() ; vit++ )
{
vertex * v = (*vit);
list<class edge*> in_edges = v->in_edges;
list<class edge*>::iterator eit;
for( eit=in_edges.begin() ; eit!=in_edges.end() ; eit++ )
{
int x_idx = (*eit)->src->idx;
int y_idx = (*eit)->tgt->idx;
double dydx = (*eit)->partial;
struct partial * p = new partial();
//ownership info
p->rank = our_dag->rank;
//structural info
p->x_idx = x_idx;
p->y_idx = y_idx;
p->dydx = dydx;
//block info
p->block_idx = our_dag->block_idx;
p->s_idx = s_idx;
p->e_idx = e_idx;
buffer.push_back(*p);
delete p;
}
}
MPI_Isend( &buffer[0] , buffer.size() , MPI_PARTIAL , 0 , DAG_MERG_REQ , MPI_COMM_WORLD , &mpireq );
}
如您所见,在函数开始时,调用 MPI_Wait,然后进行一些计算,最终在函数结束时调用相应的 MPI_Isend。
每次使用 OpenMPI 运行时,我都会从 MPI_Wait 中不断收到分段错误。
我通过检查函数是否是第一次使用布尔变量*first_time*调用来解决此问题,如下所示。
void Comm::nonblocking_send( int s_idx , int e_idx )
{
if(first_time)
first_time = false;
else
MPI_Wait(&mpireq,&mpistat);
buffer.clear();
list<class vertex*>::iterator vit;
for( vit=our_dag->cur_block_intmeds.begin() ; vit!=our_dag->cur_block_intmeds.end() ; vit++ )
{
vertex * v = (*vit);
list<class edge*> in_edges = v->in_edges;
list<class edge*>::iterator eit;
for( eit=in_edges.begin() ; eit!=in_edges.end() ; eit++ )
{
int x_idx = (*eit)->src->idx;
int y_idx = (*eit)->tgt->idx;
double dydx = (*eit)->partial;
struct partial * p = new partial();
//ownership info
p->rank = our_dag->rank;
//structural info
p->x_idx = x_idx;
p->y_idx = y_idx;
p->dydx = dydx;
//block info
p->block_idx = our_dag->block_idx;
p->s_idx = s_idx;
p->e_idx = e_idx;
buffer.push_back(*p);
delete p;
}
}
MPI_Isend( &buffer[0] , buffer.size() , MPI_PARTIAL , 0 , DAG_MERG_REQ , MPI_COMM_WORLD , &mpireq );
}
这里有人知道这个错误吗?
干杯。