后记:其实最后还有第三个令人意外的性能惊喜……
性能?
去做基准测试!……永远如此,没有例外,没有借口。
这正是 Chapel 如此出色的原因。非常感谢 Chapel 团队在过去十年中为 HPC 开发并不断改进了如此出色的计算工具。
尽管我完全热爱真正[PARALLEL]
的努力,但性能始终是设计实践与底层系统硬件共同作用的结果,从来不是仅凭某个语法构造就能获得的“额外奖励”。
norm_reduce()
的处理过程每次都要花费几毫秒来建立所有支持并发的 reduce
计算设施,而之后只是为了生成单个 x**2
乘积并将其送回结果队列,再交由中央的 +
归约器引擎延后求和。对于一个仅需 2 个 CPU 时钟周期的微操作(uop)来说,这样的开销相当大,不是吗?
至于原因,请参阅进程调度细节的成本,以及我最近对阿姆达尔定律原始表述的批评。
代码基准测试 - 实际上一次带来了两个惊喜:
+++++++++++++++++++++++++++++++++++++++++++++++ <TiO.IDE>.RUN
3.74166
[SEQ] norm_loop(): 0.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 0.0 [us] -- 3.74166
[PAR]: norm_reduce(): 5677.0 [us] -- 3.74166
3.74166
[SEQ] norm_loop(): 0.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 1.0 [us] -- 3.74166
[PAR]: norm_reduce(): 5818.0 [us] -- 3.74166
3.74166
[SEQ] norm_loop(): 1.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 2.0 [us] -- 3.74166
[PAR]: norm_reduce(): 4886.0 [us] -- 3.74166
第一个惊喜已在原帖中报告过;第二个则是在为 Chapel 运行加上 --fast
编译器开关之后观察到的:
+++++++++++++++++++++++++++++++++++++++++++++++ <TiO.IDE>.+CompilerFLAG( "--fast" ).RUN
3.74166
[SEQ] norm_loop(): 1.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 2.0 [us] -- 3.74166
[PAR]: norm_reduce(): 7769.0 [us] -- 3.74166
3.74166
[SEQ] norm_loop(): 0.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 0.0 [us] -- 3.74166
[PAR]: norm_reduce(): 9109.0 [us] -- 3.74166
3.74166
[SEQ] norm_loop(): 1.0 [us] -- 3.74166
[SEQ] norm_loop_param(): 1.0 [us] -- 3.74166
[PAR]: norm_reduce(): 8807.0 [us] -- 3.74166
与往常一样,SuperComputing2017 HPC 提倡在已发表的技术论文或基准测试的每个方面都做到 [可复现性]。
这些结果是在 Try-it-Online 赞助的 Chapel 在线平台上收集的。欢迎所有感兴趣的爱好者重新运行,并发布他们在 localhost / 集群上运行这段 Chapel 代码的性能细节,以便更好地记录上述观测时间随硬件系统而变化的情况(如需用已加入计时代码、可直接运行的版本做进一步实验,可以使用这个指向 TiO.IDE 完整状态快照的链接)。
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ use Time;
// Stopwatch shared by the sequential variants: both norm_loop() and
// norm_loop_param() start and stop this same Timer instance.
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_SEQ: Timer;
// Stopwatch started and stopped by norm_reduce().
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_PAR: Timer;
/* Euclidean (L2) norm of a homogeneous 3-tuple of reals.

   x       : the three components
   returns : square root of the sum of the squared components
*/
proc norm_3tuple( x: 3*real ): real
{
  // Unpack once so that every component gets its own name.
  const ( c1, c2, c3 ) = x;
  const sumOfSquares = c1**2 + c2**2 + c3**2;
  return sqrt( sumOfSquares );
}
/* Euclidean norm of `x`, accumulated by a plain sequential index loop.

   x       : generic indexable collection, read as x[1] .. x[x.size]
   returns : square root of the sum of the squared elements

   Side effect: writes "[SEQ] norm_loop(): <elapsed> [us] -- " (without a
   trailing newline) to standard output, where <elapsed> is the time spent
   in the accumulation loop of THIS call, in microseconds.
*/
proc norm_loop( x ): real
{
  // aStopWATCH_SEQ is a module-level Timer that norm_loop_param() uses too,
  // and Timer.elapsed() is cumulative over every start()/stop() pair since
  // the last clear(). Reset it so the figure printed below covers this call
  // only instead of silently including earlier measurements.
  aStopWATCH_SEQ.clear();

  /* ---------------------------------------------SECTION-UNDER-TEST--*/
  aStopWATCH_SEQ.start();
  var tmp = 0.0;
  for i in 1 .. x.size do
    tmp += x[i]**2;
  aStopWATCH_SEQ.stop();
  /* ---------------------------------------------SECTION-UNDER-TEST--*/

  write( "[SEQ] norm_loop(): ",
         aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
  return sqrt( tmp );
}
/* Euclidean norm of `x`, accumulated by a `for param` loop, i.e. a loop
   whose index `i` is a compile-time constant in every iteration.

   x       : generic indexable collection, read as x[1] .. x[x.size];
             x.size has to be known at compile time for the param range
   returns : square root of the sum of the squared elements

   Side effect: writes "[SEQ] norm_loop_param(): <elapsed> [us] -- "
   (without a trailing newline) to standard output, where <elapsed> is the
   time spent in the accumulation loop of THIS call, in microseconds.
*/
proc norm_loop_param( x ): real
{
  // aStopWATCH_SEQ is the same module-level Timer that norm_loop() starts
  // and stops. Timer.elapsed() is cumulative since the last clear(), so
  // without this reset the value reported here would also contain the time
  // already measured for norm_loop().
  aStopWATCH_SEQ.clear();

  /* ---------------------------------------------SECTION-UNDER-TEST--*/
  aStopWATCH_SEQ.start();
  var tmp = 0.0;
  for param i in 1 .. x.size do
    tmp += x[i]**2;
  aStopWATCH_SEQ.stop();
  /* ---------------------------------------------SECTION-UNDER-TEST--*/

  write( "[SEQ] norm_loop_param(): ",
         aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
  return sqrt( tmp );
}
/* Euclidean norm of `x`, computed with a `+ reduce` expression over the
   element-wise squares of `x`.

   x       : generic collection for which `x**2` and `+ reduce` are defined
   returns : square root of the sum of the squared elements

   Side effect: writes "[PAR]: norm_reduce(): <elapsed> [us] -- " (without a
   trailing newline) to standard output, where <elapsed> is the time spent
   in the reduce expression of THIS call, in microseconds.
*/
proc norm_reduce( x ): real
{
  // Timer.elapsed() is cumulative over all start()/stop() pairs since the
  // last clear(); reset the module-level aStopWATCH_PAR so that repeated
  // calls each report their own duration rather than a running total.
  aStopWATCH_PAR.clear();

  /* ---------------------------------------------SECTION-UNDER-TEST--*/
  aStopWATCH_PAR.start();
  var tmp = ( + reduce x**2 );
  aStopWATCH_PAR.stop();
  /* ---------------------------------------------SECTION-UNDER-TEST--*/

  write( "[PAR]: norm_reduce(): ",
         aStopWATCH_PAR.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
  return sqrt( tmp );
}
//.........................................................
// Sample input shared by all four implementations.
var a: 3*real = ( 1.0, 2.0, 3.0 );

// Consistency check: every variant is expected to print the same norm.
// Each value is computed immediately before it is printed, so any timing
// text a variant writes still lands on the same output line as its result.
const viaTuple = norm_3tuple( a );
writeln( viaTuple );

const viaLoop = norm_loop( a );
writeln( viaLoop );

const viaParamLoop = norm_loop_param( a );
writeln( viaParamLoop );

const viaReduce = norm_reduce( a );
writeln( viaReduce );
[LOOP] norm_3tuple(): 45829.0 [us] -- result = 4.30918e+06 @ 1000000 loops.
[LOOP] norm_3tuple(): 241680 [us] -- result = 4.30918e+07 @ 10000000 loops.
[LOOP] norm_3tuple(): 2387080 [us] -- result = 4.30918e+08 @ 100000000 loops.
[LOOP] norm_loop(): 72160.0 [us] -- result = 4.30918e+06 @ 1000000 loops.
[LOOP] norm_loop(): 755959 [us] -- result = 4.30918e+07 @ 10000000 loops.
[LOOP] norm_loop(): 7783740 [us] -- result = 4.30918e+08 @ 100000000 loops.
[LOOP] norm_loop_param(): 34102.0 [us] -- result = 4.30918e+06 @ 1000000 loops.
[LOOP] norm_loop_param(): 365510 [us] -- result = 4.30918e+07 @ 10000000 loops.
[LOOP] norm_loop_param(): 3480310 [us] -- result = 4.30918e+08 @ 100000000 loops.
-------------------------------------------------------------------------1000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 5851380 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 5884600 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6163690 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6029860 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6083730 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6132720 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6012620 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6379020 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 5923550 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 6144660 [us] -- result = 4309.18 @ 1000 loops.
[LOOP] norm_reduce(): 8098380 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 6215470 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 5831670 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 6124580 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 6092740 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 5811260 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 5880400 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 5898520 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 6591110 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 5876570 [us] -- result = 4309.18 @ 1000 loops. [--fast]
[LOOP] norm_reduce(): 6034180 [us] -- result = 4309.18 @ 1000 loops. [--fast]
-------------------------------------------------------------------------2000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 12434700 [us] -- result = 8618.36 @ 2000 loops.
-------------------------------------------------------------------------3000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 17807600 [us] -- result = 12927.5 @ 3000 loops.
-------------------------------------------------------------------------4000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 23844300 [us] -- result = 17236.7 @ 4000 loops.
-------------------------------------------------------------------------5000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 30557700 [us] -- result = 21545.9 @ 5000 loops.
[LOOP] norm_reduce(): 30523700 [us] -- result = 21545.9 @ 5000 loops.
[LOOP] norm_reduce(): 29404200 [us] -- result = 21545.9 @ 5000 loops.
[LOOP] norm_reduce(): 29268600 [us] -- result = 21545.9 @ 5000 loops. [--fast]
[LOOP] norm_reduce(): 29009500 [us] -- result = 21545.9 @ 5000 loops. [--fast]
[LOOP] norm_reduce(): 30388800 [us] -- result = 21545.9 @ 5000 loops. [--fast]
-------------------------------------------------------------------------6000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 37070600 [us] -- result = 25855.1 @ 6000 loops.
-------------------------------------------------------------------------7000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 42789200 [us] -- result = 30164.3 @ 7000 loops.
---------------------------------------------------------------------8000--------{--fast}---------------------------------------------------------------------
[LOOP] norm_reduce(): 50572700 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 49944300 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 49365600 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): ~60+ // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP] norm_reduce(): 50099900 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 49445500 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 49783800 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 48533400 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 48966600 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 47564700 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 47087400 [us] -- result = 34473.4 @ 8000 loops.
[LOOP] norm_reduce(): 47624300 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): ~60+ [--fast] // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP] norm_reduce(): ~60+ [--fast] // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP] norm_reduce(): 46887700 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): 46571800 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): 46794700 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): 46862600 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): 47348700 [us] -- result = 34473.4 @ 8000 loops. [--fast]
[LOOP] norm_reduce(): 46669500 [us] -- result = 34473.4 @ 8000 loops. [--fast]
虽然按 [SEQ]
方式循环 nloops
次的代码被相关的附加开销严重拖累,但只要对问题稍作重新表述,就能看到即使在单 CPU 平台上也可以达到截然不同的性能水平(对于多 CPU 的代码执行,性能增益应当更大),同时也能看到 --fast
编译器开关在此处产生的效果:
// Benchmark script: accumulates norm_reduce() results from inside a parallel
// forall loop and reports the wall time of that loop only.
// It relies on `a` and norm_reduce() declared earlier in the file.
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ use Time;
// Stopwatch that brackets only the accumulating forall loop further below.
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_LOOP: Timer;
// Number of loop iterations; also the size of the pre-computed array A1.
config const nloops = 100000000; // 1E+8
// Accumulator updated from inside the forall loop through atomic add().
var res: atomic real;
res.write( 0.0 );
//------------------------------------------------------------------// PRE-COMPUTE:
// A1 is filled before the stopwatch is started, so this work is not part of
// the measured section.
var A1: [1 .. nloops] real; // pre-compute a tuple-element value
forall k in 1 .. nloops do // pre-compute a tuple-element value
A1[k] = (k % 5): real; // pre-compute a tuple-element value to a ( k % 5 ), ex-post typecast to real
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_LOOP.start();
forall i in 1 .. nloops do
{ // a[1] = ( i % 5 ): real; // pre-compute'd
// Each iteration builds a 3-tuple from A1[i] and two components of `a`.
res.add( norm_reduce( ( A1[i], a[1], a[2] ) ) ); // atomic.add()
// Rejected alternative, kept for reference together with the compiler note:
// res += norm_reduce( ( ( i % 5 ): real, a[1], a[2] ) ); // non-atomic
//:49: note: The shadow variable 'res' is constant due to forall intents in this loop
}/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_LOOP.stop(); write(
"forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: ", aStopWATCH_LOOP.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
// NOTE(review): the recorded output lines continue with
// "result = ... @ ... loops.", but no statement printing that part is
// visible in this snippet -- confirm that it follows elsewhere.
/*
--------------------------------------------------------------------------------------------------------{-nloops-}-------{--fast}-------------
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 7911.0 [us] -- result = 320.196 @ 100 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 8055.0 [us] -- result = 3201.96 @ 1000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 8002.0 [us] -- result = 32019.6 @ 10000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 80685.0 [us] -- result = 3.20196e+05 @ 100000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 842948 [us] -- result = 3.20196e+06 @ 1000000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 8005300 [us] -- result = 3.20196e+07 @ 10000000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 40358900 [us] -- result = 1.60098e+08 @ 50000000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 40671200 [us] -- result = 1.60098e+08 @ 50000000 loops.
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 2195000 [us] -- result = 1.60098e+08 @ 50000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4518790 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 6178440 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4755940 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4405480 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4509170 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4736110 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4653610 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4397990 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 4655240 [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
*/