chapel - 在 Chapel 中计算矩阵的 rowSums

Question

继续我的 Chapel 探险之旅……

我有一个矩阵A。

// Index set 1..n, wrapped in a domain literal. `n` is not declared anywhere
// in this snippet, so it must be supplied by surrounding code.
var idx = {1..n};
// Intended as the 2-D index set of A, built from idx twice.
// NOTE(review): idx is already a domain here, whereas rectangular domain
// literals are normally built from ranges (e.g. 1..n) -- confirm this compiles.
var adom = {idx, idx};
// Integer matrix declared over adom.
var A: [adom] int;
// populate A (population code elided in the question)

// One integer slot per index in idx; meant to receive the row sums of A.
var rowsums: [idx] int;

填充 rowsums（即各行之和）的最高效方法是什么？

score 2 · Accepted Answer

“最高效”的解决方案很难界定。不过，下面是一种并行且优雅地计算 rowsums 的方法：

// Problem size. `n` has to be declared before it is used: with a "naked",
// undeclared n the compilation fails with
//   tio.chpl:1: error: 'n' undeclared (first use this function)
// Being a `config const`, it can also be overridden at launch (--n=<value>).
config const n = 8;
const indices = 1..n;
const adom = {indices, indices};
var A: [adom] int;

// Fill the matrix: every element is the product of its row and column index.
forall (i, j) in adom do
  A[i, j] = i * j;

var rowsums: [indices] int;

// Rows are handled by a data-parallel loop; each row is sliced out of A
// and summed with a `+` reduction.
forall row in indices do
  rowsums[row] = + reduce A[row, ..];

writeln(rowsums);

在线尝试！

这里利用了对数组 A 的切片做 `+ reduce`（求和归约）。

请注意，上面程序中的 `forall` 和 `+ reduce` 都引入了并行性。如果 `indices` 的大小足够小，则仅使用 `for` 循环可能会更高效，从而避免创建任务的开销。

score 1 · Accepted Answer

让代码在 `SEQ` 和 `PAR` 两种模式下实际运行的一些提示：

除了一些实现细节之外，上面 @bencray 提出的假设——`PAR` 设置的额外开销可能使 `SEQ` 设置下的纯串行处理更有利——并未得到实验证实。这里还需要说明的是，出于显而易见的原因，分布式模式没有在在线 <TiO>-IDE 上进行测试；而且小规模的分布式实现与其说是一个有科学意义的实验，不如说是一种自相矛盾的说法。

事实很重要

即使在最小可能的规模 2x2 下，`SEQ` 模式下的 rowsums[] 处理也比 `PAR` 模式下 256x256 规模的处理要慢。

干得好，Chapel 团队！在 `PAR` 模式下为最大限度利用紧凑的硅片资源所做的最佳对齐，确实取得了很酷的结果！

确切的运行时性能记录，请参见下面代码中自带的表格；也欢迎访问上面引用的在线 IDE，亲自运行并进行实验。

读者也可能会注意到小规模实验中的外部噪声：与操作系统和托管 IDE 相关的进程会干预资源使用，并通过不利的 CPU / Lx-CACHE / memIO / process 等冲突影响 <SECTION-UNDER-TEST> 的运行时性能，因此这些测量值不能用于一般化的解释。

希望大家都能喜欢在不断扩大的 `[EXPSPACE]` 规模计算场景下所展示的这些可爱的 `[TIME]` 结果：

/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/
// The Time module supplies the Timer type used by both stopwatches.
use Time;
var aStopWATCH_SEQ: Timer;                   // stopwatch for the [SEQ] run
var aStopWATCH_PAR: Timer;                   // stopwatch for the [PAR] run

// Problem size: A is max_idx x max_idx.
// Declared as a `config const` so that other sizes can be tried at launch
// with --max_idx=<value> instead of editing and recompiling the source;
// the default is the largest size that completed on the <TiO>-IDE.
//
// Limits observed on the <TiO>-IDE for other sizes:
//   123456 : seems to be too fat  to allocate                <TiO>--   /wrappers/chapel: line 6: 24467 Killed
//     8192 : seems to be too long to let it run [SEQ] part     <TiO>--  The request exceeded the 60 second time limit and was terminated
//    16384 : seems to be too long to let it run [PAR] part too <TiO>--   /wrappers/chapel: line 6: 12043 Killed
config const max_idx = 4096;
const indices = 1..max_idx;

const   adom  = {indices, indices};
var A: [adom] int;

[(i,j) in adom] A[i, j] = i*j;               // Populate A[,]: element (i,j) holds i*j

// Result vector, one slot per row index; both timed sections write into it.
var rowsums: [indices] int;

// ---- <SECTION-UNDER-TEST>, [SEQ] mode --------------------------------------
// Rows are visited one after another by a serial `for` loop; the stopwatch
// brackets exactly that loop.
// NOTE(review): only the outer loop is serial here -- the `+ reduce` over the
// row slice may itself still execute in parallel; confirm before reading the
// result as a purely sequential baseline.
aStopWATCH_SEQ.start();
for row in indices do
  rowsums[row] = + reduce A[row, ..];
aStopWATCH_SEQ.stop();

/* 
                                               <SECTION-UNDER-TEST> took     8973 [us] to run in [SEQ] mode for    2 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took    28611 [us] to run in [SEQ] mode for    4 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took    58824 [us] to run in [SEQ] mode for    8 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   486786 [us] to run in [SEQ] mode for   64 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  1019990 [us] to run in [SEQ] mode for  128 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  2010680 [us] to run in [SEQ] mode for  256 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  4154970 [us] to run in [SEQ] mode for  512 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  8260960 [us] to run in [SEQ] mode for 1024 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 15853000 [us] to run in [SEQ] mode for 2048 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 33126800 [us] to run in [SEQ] mode for 4096 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took      n/a [us] to run in [SEQ] mode for 8192 elements on <TiO>-IDE

   ============================================ */


// ---- <SECTION-UNDER-TEST>, [PAR] mode --------------------------------------
// Same per-row computation as the [SEQ] run, but the rows are distributed by
// a data-parallel `forall`; each iteration writes only its own rowsums[row].
// The stopwatch brackets exactly this loop.
aStopWATCH_PAR.start();
forall row in indices do
  rowsums[row] = + reduce A[row, ..];
aStopWATCH_PAR.stop();
/*
                                               <SECTION-UNDER-TEST> took  12131 [us] to run in [PAR] mode for    2 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8095 [us] to run in [PAR] mode for    4 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8023 [us] to run in [PAR] mode for    8 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8156 [us] to run in [PAR] mode for   64 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   7990 [us] to run in [PAR] mode for  128 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8692 [us] to run in [PAR] mode for  256 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  15134 [us] to run in [PAR] mode for  512 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  16926 [us] to run in [PAR] mode for 1024 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  30671 [us] to run in [PAR] mode for 2048 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 105323 [us] to run in [PAR] mode for 4096 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 292232 [us] to run in [PAR] mode for 8192 elements on <TiO>-IDE

   ============================================ */



// Read both stopwatches in microseconds, then print the row sums followed by
// one timing line per mode.
const elapsedUsSEQ = aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds );
const elapsedUsPAR = aStopWATCH_PAR.elapsed( Time.TimeUnits.microseconds );

writeln( rowsums,
         "\n <SECTION-UNDER-TEST> took ", elapsedUsSEQ, " [us] to run in [SEQ] mode for ", max_idx, " elements on <TiO>-IDE",
         "\n <SECTION-UNDER-TEST> took ", elapsedUsPAR, " [us] to run in [PAR] mode for ", max_idx, " elements on <TiO>-IDE"
         );

这就是 Chapel 如此出色的原因

感谢您为 HPC 开发和改进如此出色的计算工具。

chapel - 在 Chapel 中计算矩阵的 rowSums

2 回答 2

让代码在 SEQ 和 PAR 两种模式下实际运行的一些提示：

事实很重要

希望大家都能喜欢在不断扩大的 [EXPSPACE] 规模计算场景下所展示的这些可爱的 [TIME] 结果

这就是 Chapel 如此出色的原因

Related

Reference

让代码在 `SEQ` 和 `PAR` 两种模式下实际运行的一些提示：

希望大家都能喜欢在不断扩大的 `[EXPSPACE]` 规模计算场景下所展示的这些可爱的 `[TIME]` 结果