c++ - 动态链接时英特尔 PARDISO 分解速度较慢

Question

我们有一个运行 PARDISO 求解器的代码。当我们静态链接此代码时，分解步骤比动态链接相同代码时快 2 倍。

以下是两种情况下 CMake 生成的链接命令行（我使用了 MKL Link Line Advisor 来帮助确定这些参数）：

静态链接：

/opt/intel/compilers_and_libraries_2016/linux/bin/intel64/icpc    -rdynamic CMakeFiles/simplesolver.dir/pardiso_sym_c.cpp.o  -o simplesolver -Wl,--start-group /opt/intel/compilers_and_libraries_2016/linux/mkl/lib/intel64/libmkl_intel_lp64.a /opt/intel/compilers_and_libraries_2016/linux/mkl/lib/intel64/libmkl_intel_thread.a /opt/intel/compilers_and_libraries_2016/linux/mkl/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl

动态链接：

/opt/intel/compilers_and_libraries_2016/linux/bin/intel64/icpc   
-rdynamic CMakeFiles/simplesolver.dir/pardiso_sym_c.cpp.o  -o simplesolver
-L/opt/intel/compilers_and_libraries_2016/linux/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl

这有什么已知的问题吗？或者我们缺少一些编译/链接标志来提高分解性能？代码完全相同（来自 MKL 发行版的求解器示例）。我们唯一改变的是如何链接，然后我们获得了运行速度的这个巨大差异。

我们在 Linux (Ubuntu 14.04) 下使用 Intel Compiler C++ 2016 并使用其中的 MKL。我们通过查看 PARDISO (msglvl=1) 的输出来测量时间。

以防有帮助，下面是代码（我省略了从文件中读取矩阵信息的函数 readData）。

#include "mkl_pardiso.h"
#include "mkl_types.h"

#include <cmath>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>


MKL_INT main (void)
{
        int n; // dimension of matrix
    int nnz; // number of non zeroes
    int* ia; // coordinates in i of each value in a
    int* ja; // coordinates in j of each value in a
    double* a; // values of the matrix
    double* b; // vector of forces
    double* x_expected; // computed solution our software (Ax=rhs)
    double* x; // computed solution in pardiso

    bool result = false;

    std::string fileIn = "problemData.bin";

    //METHOD THAT FILLS ALL THE VALUES... NOT RELEVANT.
    result = readData(fileIn, n, nnz, &ia, &ja, &a, &b, &x_expected, false);
    x = new double[n];
    if (!result)
        return 1;



    MKL_INT mtype = -2;       /* Real symmetric matrix */
    /* RHS and solution vectors. */

    MKL_INT nrhs = 1;     /* Number of right hand sides. */
    /* Internal solver memory pointer pt, */
    /* 32-bit: int pt[64]; 64-bit: long int pt[64] */
    /* or void *pt[64] should be OK on both architectures */
    void *pt[64];
    /* Pardiso control parameters. */
    MKL_INT iparm[64];
    MKL_INT maxfct, mnum, phase, error, msglvl;
    /* Auxiliary variables. */
    MKL_INT i;
    double ddum;          /* Double dummy */
    MKL_INT idum;         /* Integer dummy. */
/* -------------------------------------------------------------------- */
/* .. Setup Pardiso control parameters. */
/* -------------------------------------------------------------------- */
    for ( i = 0; i < 64; i++ )
    {
        iparm[i] = 0;
    }
    iparm[0] = 1;         /* No solver default */
    iparm[1] = 2;         /* Fill-in reordering from METIS */
    iparm[3] = 0;         /* No iterative-direct algorithm */
    iparm[4] = 0;         /* No user fill-in reducing permutation */
    iparm[5] = 0;         /* Write solution into x */
    iparm[6] = 0;         /* Not in use */
    iparm[7] = 2;         /* Max numbers of iterative refinement steps */
    iparm[8] = 0;         /* Not in use */
    iparm[9] = 13;        /* Perturb the pivot elements with 1E-13 */
    iparm[10] = 1;        /* Use nonsymmetric permutation and scaling MPS */
    iparm[11] = 0;        /* Not in use */
    iparm[12] = 0;        /* Maximum weighted matching algorithm is switched-off (default for symmetric). Try iparm[12] = 1 in case of inappropriate accuracy */
    iparm[13] = 0;        /* Output: Number of perturbed pivots */
    iparm[14] = 0;        /* Not in use */
    iparm[15] = 0;        /* Not in use */
    iparm[16] = 0;        /* Not in use */
    iparm[17] = -1;       /* Output: Number of nonzeros in the factor LU */
    iparm[18] = -1;       /* Output: Mflops for LU factorization */
    iparm[19] = 0;        /* Output: Numbers of CG Iterations */
    maxfct = 1;           /* Maximum number of numerical factorizations. */
    mnum = 1;         /* Which factorization to use. */
    msglvl = 1;           /* Print statistical information in file */
    error = 0;            /* Initialize error flag */
/* -------------------------------------------------------------------- */
/* .. Initialize the internal solver memory pointer. This is only */
/* necessary for the FIRST call of the PARDISO solver. */
/* -------------------------------------------------------------------- */
    for ( i = 0; i < 64; i++ )
    {
        pt[i] = 0;
    }
/* -------------------------------------------------------------------- */
/* .. Reordering and Symbolic Factorization. This step also allocates */
/* all memory that is necessary for the factorization. */
/* -------------------------------------------------------------------- */
    phase = 11;
    PARDISO (pt, &maxfct, &mnum, &mtype, &phase,
             &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error);
    if ( error != 0 )
    {
        printf ("\nERROR during symbolic factorization: %d", error);
        return -1;
    }
    printf ("\nReordering completed ... ");
    printf ("\nNumber of nonzeros in factors = %d", iparm[17]);
    printf ("\nNumber of factorization MFLOPS = %d", iparm[18]);
/* -------------------------------------------------------------------- */
/* .. Numerical factorization. */
/* -------------------------------------------------------------------- */
    phase = 22;
    PARDISO (pt, &maxfct, &mnum, &mtype, &phase,
             &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error);
    if ( error != 0 )
    {
        printf ("\nERROR during numerical factorization: %d", error);
        return -2;
    }
    printf ("\nFactorization completed ... ");
/* -------------------------------------------------------------------- */
/* .. Back substitution and iterative refinement. */
/* -------------------------------------------------------------------- */
    phase = 33;
    iparm[7] = 2;         /* Max numbers of iterative refinement steps. */

    PARDISO (pt, &maxfct, &mnum, &mtype, &phase,
             &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, b, x, &error);
    if ( error != 0 )
    {
        printf ("\nERROR during solution: %d", error);
        return -3;
    }
    printf ("\nSolve completed ... ");
/* -------------------------------------------------------------------- */
/* .. Termination and release of memory. */
/* -------------------------------------------------------------------- */
    phase = -1;           /* Release internal memory. */
    PARDISO (pt, &maxfct, &mnum, &mtype, &phase,
             &n, &ddum, ia, ja, &idum, &nrhs,
             iparm, &msglvl, &ddum, &ddum, &error);


    delete [] ia;
    delete [] ja;
    delete [] a;
    delete [] b;
    delete [] x;
    delete [] x_expected;

    return 0;
}

score 0 · Accepted Answer

您没有开启任何优化。在这种情况下，许多与 MKL 或动态/静态链接完全无关的因素都会影响运行时行为。如果使用 -O2 -g -march=native 或 -O3 -march=native 编译上述代码，会发生什么？差异应该会完全消失。

c++ - 动态链接时英特尔 PARDISO 分解速度较慢

1 回答 1

Related

Reference