r - Rcpp 和 CULA：分段错误

Question

我从gputools R 包中提取了相关位，通过动态加载链接到culatools的共享库，使用Rcpp在我的 GPU 上运行 QR 分解。一切都在我的 Mac 上的终端和R.app中顺利运行。结果与R的qr()函数一致，但问题是退出R.app时出现分段错误（使用终端时不会出现该错误）：

*** caught segfault ***
address 0x10911b050, cause 'memory not mapped'

我想我将问题缩小到链接到culatools的 .c 文件中的指针“a”和“tau” ：

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

我在我的 Mac 上编译了 .c 文件，使用：

/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -c -I. -I/usr/local/cula/include -m64 -Xcompiler -fPIC gpuQR.c -o gpuQR.o
/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -shared -m64 -Xlinker -rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_core -lcula_lapack -lcublas -o gpuQR.so gpuQR.o

我写了一个 .cpp 文件，它使用Rcpp并动态加载共享库 gpuQR.so：

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    dlclose(lib_handle);

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { y[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

我使用以下方法在R中编译了 .cpp 文件：

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

并运行了一个例子：

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)
A <- matrix(rnorm(9), 3, 3)
n_row <- nrow(A)
n_col <- ncol(A)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

有人知道如何解决分段错误吗？

score 1 · Accepted Answer

实际上不需要动态加载链接到culatools的共享库。我最初考虑过这个，但是我没有使用Rcpp编译得到 .cpp 文件。无论如何，新的 .cpp 文件是：

#include<Rcpp.h>
#include<cula.h>

using namespace Rcpp;
using namespace std;

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);  
    vector<float> scale(n_cols);

    culaInitialize();   
    culaSgeqrf(n_rows, n_cols, &(x[0]), n_rows, &(scale[0]));
    culaShutdown();

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

.cpp 文件使用以下命令编译：

library(Rcpp)  
PKG_LIBS <- sprintf('-Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack %s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('-I/usr/local/cula/include %s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_inc.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

我在其中设置了culatools的适当路径。整个事情并没有运行得更快，但是不再需要编译链接到culatools的共享库并动态加载它。

我认为这是gputools R 包的一个很好的替代方案，可以用C++扩展R并在 GPU 上执行线性代数运算。

score 1 · Accepted Answer

问题通过删除解决

dlclose(lib_handle);

来自 .cpp 文件。这会产生以下结果：

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

.cpp 文件可以使用R编译：

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

链接到culatools的实际 .c 文件是：

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

可以使用以下方式编译：

gcc -c -I/usr/local/cula/include gpuQR.c
gcc -shared -Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack -o gpuQR.so gpuQR.o

然后可以使用 R 在R中执行 QR 分解：

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)

n_row <- 3
n_col <- 3
A <- matrix(rnorm(n_row * n_col), n_row, n_col)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

以下是使用具有 16 个 CUDA 内核的 NVIDIA GeForce 9400M GPU 的基准测试结果：

n_row <- 1000; n_col <- 1000
A <- matrix(rnorm(n_row * n_col), n_row, n_col)
B <- A; dim(B) <- NULL

res <- benchmark(.Call('gpuQR_Rcpp', B, n_row, n_col), qr(A), columns = c('test', 'replications', 'elapsed', 'relative'), order = 'relative')

                                  test replications elapsed relative
1 .Call("gpuQR_Rcpp", B, n_row, n_col)          100  38.037    1.000
2                                qr(A)          100 152.575    4.011

r - Rcpp 和 CULA：分段错误

2 回答 2

Related

Reference