如果要构建单个矩阵,则正在寻找共享内存并行性。两者都parallel
实现foreach
分布式内存并行。我知道一个实现共享内存 ( Rdsm
) 的 R 包,但我没有使用它。获得共享内存并行性的更自然的方法是使用 C++。
我已经在 R(串行)、带有 Rcpp(串行)的 C++ 和 C++ 以及带有 Rcpp 和 RcppParallel(并行)的 OpenMP 中实现了波段到矩阵的转换。请注意,我使用的输入是一个没有重复对角线的向量列表。对于 OpenMP 解决方案,我将其转换为 (ragged) matrix
,因为这样可以轻松转换为线程安全的RMatrix
. 即使在 R 中,这种转换也非常快:
#include <Rcpp.h>
#include <algorithm>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix diags2mtrCpp(int n, const ListOf<const NumericVector>& diags) {
NumericMatrix mtr(n, n);
int nDiags = diags.size();
for (int i = 0; i < nDiags; ++i) {
NumericVector diag(diags[i]);
int nDiag = diag.size();
int row = std::max(1, i - n + 2);
int col = std::max(1, n - i);
for (int j = 0; j < nDiag; ++j) {
mtr(row + j - 1, col + j - 1) = diag(j);
}
}
return mtr;
}
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
using namespace RcppParallel;
// [[Rcpp::export]]
NumericMatrix diags2mtrOmp(const NumericMatrix& diags_matrix, const IntegerVector& diags_length) {
int nDiags = diags_matrix.cols();
int n = diags_matrix.rows();
NumericMatrix res(n, n);
RMatrix<double> mtr(res);
RMatrix<double> diags(diags_matrix);
RVector<int> diagSize(diags_length);
#pragma omp parallel for
for (int i = 0; i < nDiags; ++i) {
int nDiag = diagSize[i];
int row = std::max(1, i - n + 2);
int col = std::max(1, n - i);
for (int j = 0; j < nDiag; ++j) {
mtr(row + j - 1, col + j - 1) = diags(j, i);
}
}
return res;
}
/*** R
set.seed(42)
n <- 2^12
n
diags <- vector(mode = "list", length = 2 * n - 1)
for (i in seq_len(n)) {
diags[[i]] <- rep.int(runif(1), i)
diags[[2 * n - i]] <- rep.int(runif(1), i)
}
diags_matrix <- matrix(0, nrow = n, ncol = length(diags))
diags_length <- integer(length(diags))
for (i in seq_along(diags)) {
diags_length[i] <- length(diags[[i]])
diags_matrix[ ,i] <- c(diags[[i]], rep.int(0, n - diags_length[i]))
}
diags2mtr <- function(n, diags) {
mtr <- matrix(0, n, n)
for (i in seq_along(diags)) {
row <- max(1, i - n + 1)
col <- max(1, n + 1 - i)
for (j in seq_along(diags[[i]]))
mtr[row + j - 1 , col + j - 1] <- diags[[i]][j]
}
mtr
}
system.time(mtr <- diags2mtr(n, diags))
system.time(mtrCpp <- diags2mtrCpp(n, diags))
system.time(mtrOmp <- diags2mtrOmp(diags_matrix, diags_length))
all.equal(mtr, mtrCpp)
all.equal(mtr, mtrOmp)
*/
在双核机器上对这些解决方案进行基准测试给了我:
Unit: milliseconds
expr min lq mean median uq max neval
diags2mtr 2252.82538 2271.7221 2354.1251 2323.8221 2382.7958 2558.9282 10
diags2mtrCpp 161.25920 190.9728 224.9094 226.2652 265.3675 279.3848 10
diags2mtrOmp 95.50714 100.9555 105.8462 102.4064 105.7645 127.5200 10
我对diags2mtrOmp
.