0

我正在尝试将自己设计的 FIR 滤波器代码并行化,为此我选择了 parallel_reduce。在 Windows 上执行这段代码需要 15 秒,而在 Linux 上执行相同的代码只需要将近 2.5 秒。在 Windows 上,我使用启用了英特尔性能库 TBB 的 VS 2010 来运行代码;在 Linux 上,我在终端中用 g++ 编译器并链接 TBB 库进行编译。既然处理器相同、代码也相同,为什么在不同的操作系统上执行时间会有这么大的差异?

我使用的代码是:

#include <math.h>

#include <fstream>
#include <iostream>
#include <vector>

#include "tbb/blocked_range.h"
#include "tbb/compat/thread"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/task_scheduler_init.h"
using namespace tbb; 
using namespace std; 

// Pi to full double precision. The previous 7-digit value (3.141593) has a
// relative error of about 1e-7, which becomes a visible phase error once it
// is multiplied into large sine arguments.
#define pi 3.14159265358979323846
// Number of taps in the FIR filter; must match the number of entries in coeffs.
#define FILTER_LEN 265

// FIR filter tap weights, FILTER_LEN (265) entries, indexed by tap number in
// SumFoo::operator().
// NOTE(review): the table appears to be symmetric about its centre entry
// (index 132, value 0.0904453420042807) — confirm against the filter design
// tool that generated it before relying on that property.
double coeffs[ FILTER_LEN ] =
{
  0.0033473431384214393,0.000032074683390218124,0.0033131082058404943,0.0024777666109278788,
  -0.0008968429179843104,-0.0031973449396977684,-0.003430943381749411,-0.0029796565504781646,
  -0.002770673157048994,-0.0022783059845596586,-0.0008531818129514857,0.001115432556294998,
  0.0026079871108133294,0.003012423848769931,0.002461420635709332,0.0014154004589753215,
  0.00025190669718400967,-0.0007608257014963959,-0.0013703600874774068,-0.0014133823230551277,
  -0.0009759556503342884,-0.00039687498737139273,-0.00007527524701314324,-0.00024181463305012626,
  -0.0008521761947454302,-0.00162618205097997,-0.002170446498273018,-0.002129903305507943,
  -0.001333859049002249,0.00010700092934983156,0.0018039564602637683,0.0032107930896349583,
  0.0038325849735515363,0.003416201274366522,0.002060848732332109,0.00017954815260431595,
  -0.0016358832300944531,-0.0028402136847527387,-0.0031256650498727384,-0.0025374271571154713,
  -0.001438370315670195,-0.00035115295209013755,0.0002606730012030533,0.0001969569787142967,
  -0.00039635535951198597,-0.0010886127490608972,-0.0013530057243606405,-0.0008123200399262436,
  0.0005730271959526784,0.0024419465938120906,0.004133717273258681,0.0049402122577746265,
  0.0043879285604252714,0.002449549610687005,-0.00040283102645093463,-0.003337730734820209,
  -0.0054508346511294775,-0.006093057767824609,-0.005117609782189977,-0.0029293645861970417,
  -0.0003251033117661085,0.0018074390555649442,0.0028351284091668164,0.002623563404428517,
  0.0015692864792199496,0.0004127664681096788,-0.00009249878881824428,0.0004690173244168184,
  0.001964334172374759,0.0037256715492873485,0.004809640399145206,0.004395274594482053,
  0.0021650921193604,-0.0014888595443799124,-0.005534807968511709,-0.008642334104607624,
  -0.009668950651149259,-0.008104732391434574,-0.004299972815463919,0.0006184612821881392,
  0.005136551428636121,0.007907786753766152,0.008241212326068366,0.00634786595941524,
  0.003235610213062744,0.00028882736660937287,-0.001320994685952108,-0.0011237433853145615,
  0.00044213409507615003,0.0022057106517524255,0.00277593527678719,0.0011909915058737617,
  -0.0025807757230413447,-0.007497632882437637,-0.011739520895818884,-0.013377018279057393,
  -0.011166543231844196,-0.005133056165990026,0.0032948631959114935,0.011673660427968408,
  0.017376415708412904,0.018548938130314566,0.014811760899506572,0.007450782505155853,
  -0.001019540069785369,-0.007805775815783898,-0.010898333714715424,-0.00985364043415772,
  -0.005988406030111452,-0.001818560524968024,0.000028552677472614846,-0.0019938756495376363,
  -0.007477684025727061,-0.013989430449615033,-0.017870518868849213,-0.015639422062597726,
  -0.005624959109456065,0.010993528170353541,0.03001263681283932,0.04527492462846608,
  0.050581340787164114,0.041949186532860346,0.019360612460662185,-0.012644336735920483,
  -0.0458782599058412,-0.07073838953156347,-0.0791205623455818,-0.06709535677423759,
  -0.03644544574795176,0.005505370370858695,0.04780486657828151,0.07898800597378192,
  0.0904453420042807,0.07898800597378192,0.04780486657828151,0.005505370370858695,
  -0.03644544574795176,-0.06709535677423759,-0.0791205623455818,-0.07073838953156347,
  -0.0458782599058412,-0.012644336735920483,0.019360612460662185,0.041949186532860346,
  0.050581340787164114,0.04527492462846608,0.03001263681283932,0.010993528170353541,
  -0.005624959109456065,-0.015639422062597726,-0.017870518868849213,-0.013989430449615033,
  -0.007477684025727061,-0.0019938756495376363,0.000028552677472614846,-0.001818560524968024,
  -0.005988406030111452,-0.00985364043415772,-0.010898333714715424,-0.007805775815783898,
  -0.001019540069785369,0.007450782505155853,0.014811760899506572,0.018548938130314566,
  0.017376415708412904,0.011673660427968408,0.0032948631959114935,-0.005133056165990026,
  -0.011166543231844196,-0.013377018279057393,-0.011739520895818884,-0.007497632882437637,
  -0.0025807757230413447,0.0011909915058737617,0.00277593527678719,0.0022057106517524255,
  0.00044213409507615003,-0.0011237433853145615,-0.001320994685952108,0.00028882736660937287,
  0.003235610213062744,0.00634786595941524,0.008241212326068366,0.007907786753766152,
  0.005136551428636121,0.0006184612821881392,-0.004299972815463919,-0.008104732391434574,
  -0.009668950651149259,-0.008642334104607624,-0.005534807968511709,-0.0014888595443799124,
  0.0021650921193604,0.004395274594482053,0.004809640399145206,0.0037256715492873485,
  0.001964334172374759,0.0004690173244168184,-0.00009249878881824428,0.0004127664681096788,
  0.0015692864792199496,0.002623563404428517,0.0028351284091668164,0.0018074390555649442,
  -0.0003251033117661085,-0.0029293645861970417,-0.005117609782189977,-0.006093057767824609,
  -0.0054508346511294775,-0.003337730734820209,-0.00040283102645093463,0.002449549610687005,
  0.0043879285604252714,0.0049402122577746265,0.004133717273258681,0.0024419465938120906,
  0.0005730271959526784,-0.0008123200399262436,-0.0013530057243606405,-0.0010886127490608972,
  -0.00039635535951198597,0.0001969569787142967,0.0002606730012030533,-0.00035115295209013755,
  -0.001438370315670195,-0.0025374271571154713,-0.0031256650498727384,-0.0028402136847527387,
  -0.0016358832300944531,0.00017954815260431595,0.002060848732332109,0.003416201274366522,
  0.0038325849735515363,0.0032107930896349583,0.0018039564602637683,0.00010700092934983156,
  -0.001333859049002249,-0.002129903305507943,-0.002170446498273018,-0.00162618205097997,
  -0.0008521761947454302,-0.00024181463305012626,-0.00007527524701314324,-0.00039687498737139273,
  -0.0009759556503342884,-0.0014133823230551277,-0.0013703600874774068,-0.0007608257014963959,
  0.00025190669718400967,0.0014154004589753215,0.002461420635709332,0.003012423848769931,
  0.0026079871108133294,0.001115432556294998,-0.0008531818129514857,-0.0022783059845596586,
  -0.002770673157048994,-0.0029796565504781646,-0.003430943381749411,-0.0031973449396977684,
  -0.0008968429179843104,0.0024777666109278788,0.0033131082058404943,0.000032074683390218124,
  0.0033473431384214393
};



/**
 * Reduction body for tbb::parallel_reduce that accumulates part of one FIR
 * output sample: sum += coeffs[k] * a[(FILTER_LEN - 1 + count) - k] for every
 * tap index k in the sub-range it is handed.
 *
 * The output position is taken from the static member `count`, which the
 * caller must set before starting a reduction.
 */
class SumFoo 
{ 
    double* my_a;   // input samples; the pointer is borrowed, never freed here

    public: 
    double sum;         // partial result accumulated by this body
    static int count;   // offset of the output sample currently being computed
    int ip,nip;         // scratch indices: newest input of the window / input paired with tap k

    /**
     * Adds the contribution of taps [r.begin(), r.end()) to `sum`.
     * Does not reset `sum`, so successive calls on one body keep accumulating.
     */
    void operator( )( const blocked_range<size_t>& r ) 
    { 
        double *a = my_a; 
        // Index of the newest input sample covered by the current window.
        ip=( FILTER_LEN - 1 + (SumFoo::count));
        for( size_t k=r.begin(); k!=r.end( ); ++k ) 
        {           
            // Tap k is paired with the sample k positions before the newest one.
            nip=ip-k;
            sum+= ((coeffs[k]) * (a[nip]));                                       
        }
    }  

    /**
     * Splitting constructor: shares the input pointer with x and starts with
     * an empty partial sum. The scratch indices are zero-initialized so they
     * never hold indeterminate values.
     */
    SumFoo( SumFoo& x, split ) : my_a(x.my_a), sum(0), ip(0), nip(0) 
    { 
    } 

    /** Folds the partial sum of another body into this one. */
    void join( const SumFoo& y ) 
    { 
        sum+=y.sum; 
    } 

    /**
     * Creates a body reading from `a`, with a zero sum and zeroed scratch
     * indices.
     */
    SumFoo(double a[] ) :my_a(a), sum(0), ip(0), nip(0) 
    { 
    } 
}; 

void ParallelSumFoo(double *a, size_t n ,ofstream &o) 
{ 
        SumFoo sf(a); 
        for(int j=264;j<150264;j++)
        {
                SumFoo::count=j-264;
                parallel_reduce(blocked_range<size_t>(0,265), sf,auto_partitioner() ); 
              o<<j<<","<<sf.sum<<endl;
        }

} 

// Storage for SumFoo's static member; ParallelSumFoo overwrites it before each reduction.
int SumFoo::count=0;

int main() 
{ 

     ofstream o("400hzreduce.csv");

    double *buffer=new double[150264];  
    fill_n(buffer,150264,0);

    tick_count t0=tick_count::now(); 
    for(int i=264;i<150264;i++) 
    { 
        buffer[i] = sin(400 * (2 * pi) * (i / 5000.0));
        o<<i<<","<<buffer[i]<<endl;
    } 


    cout<<fixed; 


    ParallelSumFoo(buffer,150264,o);
    tick_count t1=tick_count::now(); 

    double t9=(t1-t0).seconds(); 
    cout<<"Time Taken for parallel execution is \t"<<t9<<"seconds"<<endl; 

}

请帮我找出哪里出了问题。

4

2 回答 2

2

您在两个操作系统上使用的编译器优化选项相近吗?对 gcc 来说,使用 -O3 与不使用任何优化选项相比,就可能产生这样的差异。Visual Studio 的选项我不太确定,但我相信您可以在 GUI 中找到它们。

在不使用 parallel_reduce 的情况下,代码在两个系统上的运行时间分别是多少?这样可以先排除一层复杂性。

您是否尝试过对代码进行性能分析(profiling)?在 Linux 上,我推荐使用 valgrind --tool=callgrind,并用 kcachegrind 查看结果。这应该有助于缩小问题的范围,让大家给出更有针对性的回答。

于 2013-04-30T06:41:05.803 回答
0

这段代码会把数据写入文件,这对执行时间影响很大。在 Linux 和 Windows 上,将数据写入文件所花的时间不同,这就是执行时间不同的原因;除此之外,TBB 本身在两个系统上并没有区别。

于 2013-05-06T09:10:05.530 回答