1

我是 C++ 编程的新手。我试图看到将我所有的 MatLab 软件迁移到 C++ 的好处。我正在做一些有限元的东西,主要是非线性的,所以我需要大量执行的操作之一是两个向量的叉积。我在 Matlab 和 C++ 中测试了两个实现,C++ 似乎要快得多。在 C++ 中,两种不同的实现给出了不同的时序。我正在使用英特尔 MKL。

这是代码:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>


void vprod( double vgr[3], double vg1[3], double vg2[3]);


// Benchmark driver: evaluates one fixed 3-D cross product a million times,
// first with the arithmetic written directly in the loop body and then through
// vprod(), printing the dsecnd() interval and the first result component of
// each run.
int main() {
    double lhs[3] = {1.22, 2.65, 3.65};
    double rhs[3] = {6.98, 98.159, 54.65};
    double cross[3];
    const int repeatCount = 1000000;
    double startStamp = 0.0;
    double elapsed = 0.0;

    // ---- Variant 1: arithmetic spelled out inside the loop -----------------
    std::cout << "INLINE METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int pass = 0; pass < repeatCount; ++pass) {
        cross[0] = lhs[1]*rhs[2] - lhs[2]*rhs[1];
        cross[1] = -(lhs[0]*rhs[2] - lhs[2]*rhs[0]);
        cross[2] = lhs[0]*rhs[1] - lhs[1]*rhs[0];
    }
    elapsed = dsecnd() - startStamp;

    std::cout << "Tiempo Total: " << elapsed << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

    // ---- Variant 2: same arithmetic reached through a call to vprod() ------
    std::cout << "FUNCTION METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int pass = 0; pass < repeatCount; ++pass) {
        vprod(cross, lhs, rhs);
    }
    elapsed = dsecnd() - startStamp;

    std::cout << "Tiempo Total: " << elapsed << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

    // Blocks until one character can be read from stdin before exiting.
    std::cin.ignore();
    return 0;
}


// Computes the 3-D cross product vgr = vg1 x vg2.
//
//   vgr : output; receives the three components of the product.
//   vg1 : left operand, three components (only read).
//   vg2 : right operand, three components (only read).
//
// All three components are evaluated into locals before anything is stored,
// so the output array may be the same array as either operand
// (e.g. vprod(a, a, b)) without corrupting the result.
inline void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double c0 =   vg1[1]*vg2[2] - vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2] - vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1] - vg1[1]*vg2[0];

    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}

我的问题是:为什么第一个实现比第二个快 3 倍?这是函数调用开销的结果吗?谢谢 !!!

编辑:我已经修改了代码,以避免编译器在编译期“猜出”对常量向量循环的计算结果。正如 @phonetagger 所示,结果大不相同:不使用 vprod 函数耗时 28500 微秒,使用 vprod 函数耗时 29000 微秒。这些数字是在 Ox 优化下测得的。如果使用了 inline 关键字,更改优化级别不会影响两者的对比,尽管绝对数值会有所改善。此外,如果不使用 inline 关键字(并且关闭优化),不使用 vprod 函数耗时 32000 微秒,使用该函数耗时 37000 微秒。因此函数调用开销大约为 5000 微秒。

新代码是:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>

//#include <mkl_lapack.h>

void vprod( double *vgr, int ploc, double *vg1, double *vg2);


// Benchmark driver: computes nv independent 3-D cross products whose
// components are stored back to back in flat arrays (vector k occupies
// indices 3k..3k+2). The work is timed twice with dsecnd(): once with the
// arithmetic written in the loop body and once through vprod().
int main() {

    const int nv  = 1000000;   // number of 3-component vectors
    const int dim = 3*nv;      // number of doubles in each flat array
    double *v1, *v2, *vr;      // operands and result
    int ploc, i;
    double tiempo=0.0, tinicial;

    v1 = new double [dim];
    v2 = new double [dim];
    vr = new double [dim];

    // Fill the operands with deterministic, non-constant values.
    for (i = 0; i < dim; i++) {
        v1[i] = 1.25 + static_cast<double>(i+1);
        v2[i] = 2.62 + 2*static_cast<double>(i+7);
    }

    //------------------------------------------------------------------------
    std::cout << "RUTINA CON CODIGO INLINE: \n" ;

    tinicial = dsecnd();
    ploc = 0; // ploc is the index of the first component of the current vector.
    for (i=0; i<nv; i++){
        vr[ploc]   =   v1[ploc+1]*v2[ploc+2]-v1[ploc+2]*v2[ploc+1];
        vr[ploc+1] = -(v1[ploc]*v2[ploc+2]-v1[ploc+2]*v2[ploc]);
        vr[ploc+2] =   v1[ploc]*v2[ploc+1]-v1[ploc+1]*v2[ploc];
        ploc +=3;
    }

    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";

    // Memory obtained with new[] must be released with delete[], one
    // expression per array. "delete v1,v2,vr;" parses as "(delete v1),v2,vr":
    // it applies the scalar delete to v1 and never frees v2 or vr.
    delete[] v1;
    delete[] v2;
    delete[] vr;

    // Start the second measurement from freshly allocated blocks.
    v1 = new double [dim];
    v2 = new double [dim];
    vr = new double [dim];

    // new double[dim] leaves the elements uninitialized, so the operands are
    // refilled with the same values the first measurement used.
    for (i = 0; i < dim; i++) {
        v1[i] = 1.25 + static_cast<double>(i+1);
        v2[i] = 2.62 + 2*static_cast<double>(i+7);
    }
    //------------------------------------------------------------------------

    //------------------------------------------------------------------------
    std::cout << "RUTINA LLAMANDO A FUNCION: \n" ;

    ploc=0;
    tinicial = dsecnd();
    for (i=0; i<nv; i++){
        vprod ( vr, ploc, v1, v2);
        ploc +=3;
    }

    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";
    //------------------------------------------------------------------------

    std::cin.ignore();

    // Release the second set of arrays before leaving.
    delete[] v1;
    delete[] v2;
    delete[] vr;
    return 0;

}


// Computes one 3-D cross product inside flat arrays:
//   vgr[ploc..ploc+2] = vg1[ploc..ploc+2] x vg2[ploc..ploc+2]
//
//   vgr  : output array; three consecutive elements starting at ploc are written.
//   ploc : index of the first component of the vector triple in all three arrays.
//   vg1  : left operand array (only read).
//   vg2  : right operand array (only read).
//
// The three components are evaluated into locals before anything is stored,
// so vgr may be the same array as vg1 or vg2 (in-place update) without
// corrupting the result.
inline void vprod( double *vgr, int ploc, double *vg1, double *vg2) {
        const double c0 =   vg1[ploc+1]*vg2[ploc+2]-vg1[ploc+2]*vg2[ploc+1];
        const double c1 = -(vg1[ploc]*vg2[ploc+2]-vg1[ploc+2]*vg2[ploc]);
        const double c2 =   vg1[ploc]*vg2[ploc+1]-vg1[ploc+1]*vg2[ploc];

        vgr[ploc]    = c0;
        vgr[ploc+1]  = c1;
        vgr[ploc+2]  = c2;
}
4

2 回答 2

4

我不知道您使用的是什么编译器(“MKL”是编译器套件吗?),但是无论您使用什么编译器,优化级别都会对代码性能产生巨大影响,有时甚至相差几个数量级,这取决于您的编码风格,以及您是否试图“玩花样”让代码跑得更快。通常(尽管并非总是如此)最好让编译器替您施展这些技巧,而您只需专注于编写高效的算法,而不是玩弄编码技巧。

无论如何,我以各种方式在我的系统上运行您的代码,结果显示在下面的代码注释中......

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>

// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for dsecnd() when "mkl.h" is not available: returns the current
// gettimeofday() wall-clock time as a double, in MICROSECONDS (the
// commented-out alternative below returns seconds instead).
// Prints a message to stderr and exits with status 1 if gettimeofday() fails.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double BEFORE scaling: tv.tv_sec*1000000 is evaluated in
    // integer arithmetic and overflows where tv_sec is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0
         + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}

//---------------------------------
// Uncomment one or both of these to test variations....
//#define USE_INLINE_KEYWORD
//#define DEFINE_vprod_AT_TOP
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
//                                 microseconds          microseconds
//                               "hardcoded inline"   "via vprod() function"
//                                                     [i]=inlined, [-]=not
//                               ------------------   ----------------------
// inline keyword, at top
//      no optimization                 9501               17797 [-]
//      optimization -O1                   2   (see NOTE)      1 [i]
//      optimization -O2                   1                   1 [i]
//      optimization -O3                   0                   0 [i]
//
// no inline keyword, at top
//      no optimization                 9630               18203 [-]
//      optimization -O1                1257               10681 [-]
//      optimization -O2                1272               10694 [-]
//      optimization -O3                   0                   1 [i]
//
// inline keyword, at bottom
//      no optimization                 9763               18333 [-]
//      optimization -O1                   1                   0 [i]
//      optimization -O2                   2                   1 [i]
//      optimization -O3                   0                   0 [i]
//
// no inline keyword, at bottom
//      no optimization                 9900               18387 [-]
//      optimization -O1                1289               10714 [-]
//      optimization -O2                 795                6740 [-]
//      optimization -O3                   1                   0 [i]
//
// Note that in all cases, both results were reported as -213.458.
//
// NOTE: Especially since I'm using gettimeofday() instead of something
//       that returns process (CPU) time, all results may include some
//       time that the CPU spent processing other stuff, but even if
//       that weren't the case (i.e. even if I used a function that
//       returned only CPU time spent on this particular process), there
//       would still be the quantization error of +/-1 microsecond on
//       each end of the interval, meaning +/-2 microseconds overall.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...

echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ so.cpp
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 so.cpp
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 so.cpp
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 so.cpp
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out

...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call   8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined.  There
is only one caller of vprod(), so the results can't be confusing.

*/
//
//---------------------------------

#if defined(DEFINE_vprod_AT_TOP)
    // Configuration "at top": vprod() is fully defined ahead of main().
    // USE_INLINE_KEYWORD decides whether the definition carries 'inline'.
    #if defined(USE_INLINE_KEYWORD)
        inline
    #endif
        void vprod( double vgr[3], double vg1[3], double vg2[3])
        {
            // Cross product vgr = vg1 x vg2, one component per statement.
            enum { X = 0, Y = 1, Z = 2 };
            vgr[X] =   vg1[Y]*vg2[Z] - vg1[Z]*vg2[Y];
            vgr[Y] = -(vg1[X]*vg2[Z] - vg1[Z]*vg2[X]);
            vgr[Z] =   vg1[X]*vg2[Y] - vg1[Y]*vg2[X];
        }
#else
    // Configuration "at bottom": only the prototype is visible here; the
    // definition follows main().
    void vprod( double vgr[3], double vg1[3], double vg2[3]);
#endif


// Benchmark driver: evaluates one fixed 3-D cross product a million times,
// first with the arithmetic written directly in the loop body and then through
// vprod(), printing the dsecnd() interval and the first result component of
// each run.
int main() {
    double lhs[3] = {1.22, 2.65, 3.65};
    double rhs[3] = {6.98, 98.159, 54.65};
    double cross[3];
    const int repeatCount = 1000000L;
    double startStamp = 0.0;
    double elapsed = 0.0;

    // ---- Variant 1: arithmetic spelled out inside the loop -----------------
    std::cout << "INLINE METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int pass = 0; pass < repeatCount; ++pass) {
        cross[0] = lhs[1]*rhs[2] - lhs[2]*rhs[1];
        cross[1] = -(lhs[0]*rhs[2] - lhs[2]*rhs[0]);
        cross[2] = lhs[0]*rhs[1] - lhs[1]*rhs[0];
    }
    elapsed = dsecnd() - startStamp;

    std::cout << "Tiempo Total:             " << elapsed << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

    // ---- Variant 2: same arithmetic reached through a call to vprod() ------
    std::cout << "FUNCTION METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int pass = 0; pass < repeatCount; ++pass) {
        vprod(cross, lhs, rhs);
    }
    elapsed = dsecnd() - startStamp;

    std::cout << "Tiempo Total:             " << elapsed << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

//    std::cin.ignore();
    return 0;
}


#if !defined(DEFINE_vprod_AT_TOP)
    // Configuration "at bottom": vprod() is defined after main().
    // USE_INLINE_KEYWORD decides whether the definition carries 'inline'.
    #if defined(USE_INLINE_KEYWORD)
        inline
    #endif
        void vprod( double vgr[3], double vg1[3], double vg2[3])
        {
            // Cross product vgr = vg1 x vg2, one component per statement.
            enum { X = 0, Y = 1, Z = 2 };
            vgr[X] =   vg1[Y]*vg2[Z] - vg1[Z]*vg2[Y];
            vgr[Y] = -(vg1[X]*vg2[Z] - vg1[Z]*vg2[X]);
            vgr[Z] =   vg1[X]*vg2[Y] - vg1[Y]*vg2[X];
        }
#endif

请注意,编译器所用的优化技巧并不会随着优化级别的提高而线性地叠加。不同的技巧在不同的优化级别下才会开启,并且可能取决于您是否使用了 inline 关键字。除了函数内联之外,编译器还可能采用其他类型的优化(我的结果表明确实如此)。有趣的是,据我所读到的资料,inline 关键字实际上只是向编译器建议您希望内联该函数;在开启优化时,它可能只是调整了某些阈值,用来决定是否内联一个本来就可能被内联的函数。而在关闭优化的情况下,即使使用了 inline 关键字,该函数似乎也完全没有被内联。

于 2012-10-05T17:57:28.387 回答
3

马丁,你是绝对正确的(参考马丁的评论......我的 2012 年 10 月 5 日 17:57 回答下的第三条评论)。是的,似乎在更高的优化级别,编译器允许自己意识到它知道数组的传入值,因此它可以在编译时执行整个计算、循环和所有操作,并完全优化循环。

我把测试代码重新组织成三个单独的文件(一个头文件和两个源文件),并把计算和循环拆分到单独的函数中,以防止编译器优化得过于“聪明”。现在它无法再把循环优化成编译期计算了。以下是我的新结果。请注意,我在原来的 0 到 1000000 循环外面又套了一层循环(0 到 50),然后把总时间除以 50。这样做有两个原因:一是让今天的数字可以与之前的数字直接比较;二是可以平均掉测试过程中因进程被换出而造成的不规则波动。这对您来说可能无关紧要,因为我猜 dsecnd() 只报告本进程所用的 CPU 时间?

无论如何,这是我的新结果.......

(是的,“内联关键字,优化 -O1”比 -O2 或 -O3 更快这一奇怪结果是可以重现的,“没有内联关键字,优化 -O1”的奇怪结果也是如此。我没有深入研究汇编代码去弄清原因。)

//========================================================================================
// File: so.h

// Evaluates the cross product vgr = vg1 x vg2 LC times with the arithmetic
// written directly in the loop body (defined in so2.cpp).
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]);
// Evaluates the same cross product LC times, each iteration going through a
// call to vprod() (defined in so2.cpp).
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]);

//---------------------------------
// Comment or uncomment to test both ways...
#define USE_INLINE_KEYWORD
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
//                                 microseconds          microseconds
//                               "hardcoded inline"   "via vprod() function"
//                                                     [i]=inlined, [-]=not
//                               ------------------   ----------------------
// inline keyword
//      no optimization                11734               14598 [-]
//      optimization -O1                4617                4616 [i]
//      optimization -O2                7754                7838 [i]
//      optimization -O3                7777                7673 [i]
//
// no inline keyword
//      no optimization                11807               14602 [-]
//      optimization -O1                4651                7691 [-]
//      optimization -O2                7755                7383 [-]
//      optimization -O3                7921                7432 [-]
//
// Note that in all cases, both results were reported as -213.458.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...

echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out

...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call   8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined.

*/

//========================================================================================
// File: so.cpp

// Sorry so messy, I didn't bother to clean up the #includes.......
#include <stdint.h>
#include <inttypes.h>
#include <stddef.h> // for NULL
#include <stdlib.h> // for exit()
#include <stdio.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>

#include "so.h"

// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for dsecnd() when "mkl.h" is not available: returns the current
// gettimeofday() wall-clock time as a double, in MICROSECONDS (the
// commented-out alternative below returns seconds instead).
// Prints a message to stderr and exits with status 1 if gettimeofday() fails.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double BEFORE scaling: tv.tv_sec*1000000 is evaluated in
    // integer arithmetic and overflows where tv_sec is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0
         + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}

//---------------------------------

#if !defined(USE_INLINE_KEYWORD)
    // The 'inline' keyword is NOT in use, so vprod() is defined in this
    // source file; its caller lives in the other source file, which
    // keeps the call from being inlined there.
    void vprod( double vgr[3], double vg1[3], double vg2[3])
    {
        // Cross product vgr = vg1 x vg2, one component per statement.
        enum { X = 0, Y = 1, Z = 2 };
        vgr[X] =   vg1[Y]*vg2[Z] - vg1[Z]*vg2[Y];
        vgr[Y] = -(vg1[X]*vg2[Z] - vg1[Z]*vg2[X]);
        vgr[Z] =   vg1[X]*vg2[Y] - vg1[Y]*vg2[X];
    }
#endif

// Benchmark driver: calls loop_inline() and then loop_func() N times each
// with the same fixed operands, and prints the dsecnd() interval divided by N
// together with the first result component of each variant.
int main() {
    double lhs[3] = {1.22, 2.65, 3.65};
    double rhs[3] = {6.98, 98.159, 54.65};
    double cross[3];
    const int innerCount = 1000000L;  // iterations performed inside each loop_* call
    const int outerRuns = 100;        // number of loop_* calls averaged together
    double startStamp = 0.0;
    double averaged = 0.0;

    // ---- Variant 1: loop with the arithmetic written in place --------------
    std::cout << "INLINE METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int run = 0; run < outerRuns; ++run) {
        loop_inline(innerCount, cross, lhs, rhs);
    }
    averaged = (dsecnd() - startStamp) / outerRuns;

    std::cout << "Tiempo Total:             " << averaged << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

    // ---- Variant 2: loop that goes through vprod() -------------------------
    std::cout << "FUNCTION METHOD: " << std::endl;

    startStamp = dsecnd();
    for (int run = 0; run < outerRuns; ++run) {
        loop_func(innerCount, cross, lhs, rhs);
    }
    averaged = (dsecnd() - startStamp) / outerRuns;

    std::cout << "Tiempo Total:             " << averaged << std::endl;
    std::cout << "Resultado: " << cross[0] << std::endl;

//    std::cin.ignore();
    return 0;
}

//========================================================================================
// File: so2.cpp

#include "so.h"

#if !defined(USE_INLINE_KEYWORD)
    // The 'inline' keyword is not in use: only declare vprod() here. Its
    // definition lives in the other source file, so the calls made from
    // this file cannot be inlined.
    void vprod( double vgr[3], double vg1[3], double vg2[3]);
#else
    // The 'inline' keyword is in use: define vprod() in the same source
    // file as its caller.
    inline void vprod( double vgr[3], double vg1[3], double vg2[3])
    {
        // Cross product vgr = vg1 x vg2, one component per statement.
        enum { X = 0, Y = 1, Z = 2 };
        vgr[X] =   vg1[Y]*vg2[Z] - vg1[Z]*vg2[Y];
        vgr[Y] = -(vg1[X]*vg2[Z] - vg1[Z]*vg2[X]);
        vgr[Z] =   vg1[X]*vg2[Y] - vg1[Y]*vg2[X];
    }
#endif

// Evaluates the cross product vgr = vg1 x vg2 LC times, with the arithmetic
// written directly in the loop body. Nothing is written when LC <= 0.
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    enum { X = 0, Y = 1, Z = 2 };

    for (int remaining = LC; remaining > 0; --remaining) {
        vgr[X] =   vg1[Y]*vg2[Z] - vg1[Z]*vg2[Y];
        vgr[Y] = -(vg1[X]*vg2[Z] - vg1[Z]*vg2[X]);
        vgr[Z] =   vg1[X]*vg2[Y] - vg1[Y]*vg2[X];
    }
}

// Calls vprod(vgr, vg1, vg2) LC times. No call is made when LC <= 0.
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    int done = 0;
    while (done < LC) {
        vprod(vgr, vg1, vg2);
        ++done;
    }
}
于 2012-10-09T15:13:46.213 回答