
I can't get cublasStrsmBatched (the call marked by the comment near the end of TestCublasStrsmBatched below) to run without it returning CUBLAS_STATUS_EXECUTION_FAILED (13). To simplify, all matrix values and alpha are 1.0, all matrices are square, and lda, ldb, m and n are equal. I am able to run cublasSgemmBatched and cublasStrsm the same way with no error, and cublasStrsmBatched should work the same way, but for me it does not. Please tell me if you have any idea what I am doing wrong in this code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

cublasHandle_t handle;

void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);

void TestCublasStrsmBatched(int size, int numOfLinSys);

int main()
{
    cublasStatus_t status = cublasCreate(&handle);
    CheckCublasCreate(status);

    /*arguments are size of square matrix 
    and number of linear systems*/
    TestCublasStrsmBatched(2,2);

    status = cublasDestroy(handle);
    CheckCublasDestroy(status);
}

void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;
    float **d_A;
    float **h_B;
    float **d_B;
    float **hd_A;
    float **hd_B;
    float *alpha;

    const int n = size;
    const int m = size;
    const int lda=m;
    const int ldb=m;
    const int matA_numOfElem = m*m;
    const int matB_numOfElem = m*n;

    int i,j;

    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);

    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);

    alpha=(float *)malloc(sizeof(float));
    CheckAllocateHost(alpha);
    *alpha = 1.0;

    for (j=0; j<numOfLinSys; j++){
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        CheckAllocateHost(h_A[j]);
        for (i=0; i < matA_numOfElem; i++) 
            h_A[j][i] = 1.0;

        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);
        for (i=0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0;
        }

    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);

    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);

    for (j=0; j<numOfLinSys; j++){
        error = cudaMalloc((void **)&hd_A[j], 
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        error = cudaMalloc((void **)&hd_B[j], 
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        status = cublasSetMatrix(m, m, sizeof(float), 
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);

        status = cublasSetMatrix(m, n, sizeof(float), 
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
        }

    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /*After cublasStrsmBatched call 
    status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);
}


void CheckCublasCreate( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, 
                "!!!! CUBLAS initialization error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer == 0){
        fprintf(stderr, 
                "!!!! host memory allocation error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMalloc( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, 
                "!!!! device memory allocation error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMemcpy( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, "!!!! data copy error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! device access error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckKernelExecution( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! kernel execution error.\n");
        exit(EXIT_FAILURE);
        }
}

void CheckCublasDestroy( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! shutdown error \n");
        exit(EXIT_FAILURE);
        }
}

Tested on Linux with CUDA 5.5 on a Tesla T10, and on Windows with CUDA 5.5 on a GTX 285.

Thanks!


1 Answer


Batched triangular solvers are something I hadn't tried in CUBLAS before, so I was interested to see what would happen. Your code is rather convoluted, so I didn't bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:

$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
=========     Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
=========     Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
=========     Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
=========     Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
=========     Host Frame:[0x1]

(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0.) Error 8 is cudaErrorInvalidDeviceFunction, which usually only arises when a library or fatbinary holds no architecture which matches, or can be JIT recompiled into, something your GPU can run.
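For what it's worth, this failure mode can be ruled in or out at runtime by querying the device's compute capability before the call is ever made. A minimal sketch using the CUDA runtime API (this helper is my own addition, not part of either code listing):

#include <stdio.h>
#include <cuda_runtime.h>

/* Hypothetical helper: returns nonzero if the current device's
   compute capability is at least major.minor. */
int DeviceIsAtLeast(int major, int minor)
{
    int dev;
    cudaDeviceProp prop;
    if (cudaGetDevice(&dev) != cudaSuccess ||
        cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
        return 0;
    printf("Device %d: compute capability %d.%d\n",
           dev, prop.major, prop.minor);
    return (prop.major > major) ||
           (prop.major == major && prop.minor >= minor);
}

If the library only ships sm_2x and later cubins, a check like DeviceIsAtLeast(2, 0) would flag the compute 1.x machines discussed here before CUBLAS ever fails.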

Intrigued, I wrote my own, much simpler repro case from scratch:

#include <iostream>
#include <cublas_v2.h>

int main(void)
{
    const int Neq = 5, Nrhs = 2, Nsys = 4;

    float Atri[Neq][Neq] = 
        { { 1,  6, 11, 16, 21},
        { 0,  7, 12, 17, 22},
        { 0,  0, 13, 18, 23},
        { 0,  0,  0, 19, 24},
        { 0,  0,  0,  0, 25} };

    float B[Nrhs][Neq] = 
        { {  1,  27, 112, 290, 595},
        {  2,  40, 148, 360, 710} };


    float *syslhs[Nsys], *sysrhs[Nsys];
    float *A_, *B_, **syslhs_, **sysrhs_;

    size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
    size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);

    cudaMalloc((void **)(&A_), Asz);
    cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));

    cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
    for(int i=0; i<Nsys; i++) {
        syslhs[i] = A_;
        sysrhs[i] = (float*)((char *)B_ + i*Bsz);
        cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
    }

    size_t syssz = sizeof(float *) * (size_t)Nsys;
    cudaMalloc((void **)&syslhs_, syssz);
    cudaMalloc((void **)&sysrhs_, syssz);
    cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
    cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);

    const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
    const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
    const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
    const cublasOperation_t trans = CUBLAS_OP_N;
    float alpha = 1.f;

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasStrsmBatched(
                handle,
                side, ulo, trans, diag,
                Neq, Nrhs,
                &alpha, 
                syslhs_, Neq,
                sysrhs_, Neq,
                Nsys
                );


    for(int k=0; k<Nsys; k++) {
        cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
        for(int i=0; i<Nrhs; i++) {
            for(int j=0; j<Neq; j++) {
                std::cout << B[i][j] << ",";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

    return 0;
} 

This fails in exactly the same way as your code. On first inspection, this really does look like an internal CUBLAS problem, although it is very hard to say what. About the only thing I can think of is that these solvers are only supported on compute capability 3.5 devices and not on compute 1.x devices, but the documentation fails to mention it. Between us we have tested compute 1.2, compute 1.3, and compute 3.0 [my mistake, I read K10 rather than T10 in your question] devices, so there isn't much else left.....

All I can suggest is running your code with cuda-memcheck and seeing whether it reports the same error. If it does, I see a bug report to NVIDIA in your future.
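Independently of cuda-memcheck, the underlying launch error can sometimes be retrieved in-process: CUBLAS returns CUBLAS_STATUS_EXECUTION_FAILED, but the runtime's last-error state may still hold the failed launch afterwards. A minimal sketch against the variables in your code (whether the library leaves that error visible is an assumption on my part):

status = cublasStrsmBatched(handle,
                            CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                            CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                            m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
if (status != CUBLAS_STATUS_SUCCESS) {
    /* Ask the CUDA runtime what happened inside the library call;
       on my machine this should report cudaErrorInvalidDeviceFunction. */
    cudaError_t err = cudaGetLastError();
    fprintf(stderr, "CUBLAS status %d, last CUDA error: %s\n",
            (int)status, cudaGetErrorString(err));
}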


EDIT: I flagrantly disregarded the EULA and used cuobjdump to explore the cubin payloads in the CUDA 5 cublas library. For the single precision batched trsm routines I found cubins for:

  • 32 bit sm_20
  • 32 bit sm_30
  • 32 bit sm_35
  • 64 bit sm_20
  • 64 bit sm_30
  • 64 bit sm_35

There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I am seeing. It also explains the errors on your GTX 285 and Tesla T10, both of which are compute_13 devices.
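(For reference, listing the embedded cubins can be done with something along these lines; the exact library path and flag spelling are from memory and may vary between toolkit versions and platforms:)

$ cuobjdump --list-elf /usr/local/cuda/lib/libcublas.dylib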


EDIT 2:

As suspected, my repro code runs perfectly on a Linux system with a compute_30 device under both the CUDA 5.0 and CUDA 5.5 release libraries.

answered Nov 9, 2013 at 16:53