I am trying to compute SVDs with the code given below, but I do not get correct results in d_S when num_Matrices > 128 (say, 256); I do get correct results when num_Matrices = 128*128. The code is based on the answer to the Stack Overflow question Parallel implementation for multiple SVDs using CUDA.
My code, given below, returns all-zero singular values for the first 26 matrices.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define FULLSVD
#define PRINTRESULTS
#define M_SQR(x) ((x)*(x))
__global__ void initSIGPU(cuComplex *SI, float *SDiag, int N, int M, int len, float epsilon);
void initSI(cuComplex *SI, float *SDiag, int N, int M, int len, float epsilon, int threadsPerBlock);
__global__ void initCZeros(cuComplex *in, int len);
void checkArray(cuComplex *ptr, int rows, int cols, const char *name);
void printArray(cuComplex *ptr, int rows, int cols, char mode, const char *name);
cuComplex complexMul(cuComplex a, cuComplex b);
double verifyDecomposition(cuComplex *U, float *S, cuComplex *V, cuComplex *A, int M, int N);
void GPU_Multi(cuComplex **M, cuComplex **N, cuComplex **P,
size_t pr, size_t pc, size_t mc,
size_t num_mat, cuComplex alpha, cuComplex beta);
/********/
/* MAIN */
/********/
int main() {
const int M = 5;
const int N = 3;
const int K = 2;
int lda = M;
const int numMatrices = 128*3;
//const int numMatrices = 256;
cublasHandle_t cublasHandle = NULL;
/* residual and executed_sweeps are not supported by gesvdjBatched */
double residual = 0;
int executed_sweeps = 0;
TimingGPU timerGPU;
cudaEvent_t start, stop;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cublasCreate(&cublasHandle);
cudaEventCreate(&start);
cudaEventCreate(&stop);
int major = -1, minor = -1, patch = -1;
cusolverGetProperty(MAJOR_VERSION, &major);
cusolverGetProperty(MINOR_VERSION, &minor);
cusolverGetProperty(PATCH_LEVEL, &patch);
printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n",
major, minor, patch);
// --- Setting the host matrix
cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(cuComplex));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++)
for (unsigned int j = 0; j < N; j++)
h_A[k * M * N + j * M + i] = make_float2((rand() * 1.0 + k * 0.1) / RAND_MAX,
(rand() * 1.0 + k * 0.1) / RAND_MAX);
printArray(h_A, M, N, 'T', "h_A1");
printArray(h_A + M * N, M, N, 'T', "h_A2");
cuComplex *d_b = NULL;
gpuErrchk(cudaMalloc((void **)&d_b, sizeof(cuComplex) * M * K * numMatrices));
cuComplex *h_b = (cuComplex *)malloc(sizeof(cuComplex) * M * K * numMatrices);
for (int k = 0; k < numMatrices; k++)
for (int i = 0; i < M; i++)
for (int j = 0; j < K; j++)
// --- The original branch on i covered every row 0..4 (M = 5), so the value is always (1, 0)
h_b[k * M * K + i * K + j] = make_float2(1.0, 0.0);
gpuErrchk(cudaMemcpy(d_b, h_b, sizeof(cuComplex) * M * K * numMatrices, cudaMemcpyHostToDevice));
printArray(h_b, M, K, 'N', "h_b1");
printArray(h_b + M * K, M, K, 'N', "h_b2");
// --- Setting the device matrix and moving the host matrix to the device
cuComplex *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
cuComplex *h_SI = (cuComplex *)malloc(N * M * numMatrices * sizeof(cuComplex));
cuComplex *h_x = (cuComplex *)malloc(N * numMatrices * sizeof(cuComplex));
cuComplex *h_U = NULL;
cuComplex *h_V = NULL;
#ifdef FULLSVD
h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, numMatrices * sizeof(int))); // --- one info entry per matrix for the batched API
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
cuComplex *d_SI; gpuErrchk(cudaMalloc(&d_SI, N * M * numMatrices * sizeof(cuComplex)));
cuComplex *d_x; gpuErrchk(cudaMalloc(&d_x, N * numMatrices * sizeof(cuComplex)));
cuComplex *d_U = NULL;
cuComplex *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif
cuComplex *d_work = NULL; /* device workspace for gesvdj */
int *devInfo_h = (int *)malloc(numMatrices * sizeof(int)); /* host copy of devInfo */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 100;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- min(M, N) singular values per matrix
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
N, // --- Leading dimension of V, ldv >= max(1, N); must match the ldv passed to cusolverDnCgesvdjBatched below
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnCgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- min(M, N) singular values per matrix
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
N, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
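// --- Per the cuSOLVER documentation, gesvdjBatched fills one devInfo entry per matrix,
// --- so devInfo must be a device array of numMatrices ints (see the allocation above).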
cudaStat1 = cudaDeviceSynchronize();
assert(cudaSuccess == cudaStat1);
printf("Calculation of the batched SVDs: %f ms\n\n", timerGPU.GetCounter());
/*
 * The following two functions are not supported by the batched version;
 * they return the error CUSOLVER_STATUS_NOT_SUPPORTED.
 */
status = cusolverDnXgesvdjGetSweeps(
solver_handle,
gesvdj_params,
&executed_sweeps);
assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);
status = cusolverDnXgesvdjGetResidual(
solver_handle,
gesvdj_params,
&residual);
assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);
gpuErrchk(cudaMemcpy(devInfo_h, devInfo, numMatrices * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
}
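Since the failures only show up for some of the matrices, a quick diagnostic is to scan the copied-back devInfo_h array after the final cudaMemcpy: per the cuSOLVER documentation, entry k is 0 when the SVD of matrix k converged, and min(M, N) + 1 when the Jacobi sweeps for matrix k ran out before reaching the tolerance. The loop below is a minimal sketch of that check; it reuses devInfo_h, numMatrices, M and N from the listing above.
// --- Minimal convergence check over the per-matrix info flags (sketch)
for (int k = 0; k < numMatrices; k++) {
    if (devInfo_h[k] == ((M < N) ? M : N) + 1)
        printf("Matrix %d did not converge within maxSweeps\n", k);
    else if (devInfo_h[k] != 0)
        printf("Unexpected devInfo[%d] = %d\n", k, devInfo_h[k]);
}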