1

根据我对 PyCUDA 文档、示例和 Kirk 和 Hwu 的 CUDA 书籍的阅读,我已经成功实现了一个基于 CUDA C 的复矩阵乘法程序,并且还用 PyCUDA 编写了一个版本。C 代码会产生正确的结果,但 Python 代码不会。

需要明确的是,Python 代码仅取自样本 (MatrixMulTiled),并已修改为使用“cuComplex.h”中的 cuComplexFloat 处理复数。在此修改之前,它正确地乘以实值矩阵。

所以我无法弄清楚错误。Python代码是

# attempt to do matrix multiplication for complex numbers
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools    
import numpy as np
from time import *

kernel_code_template = """
            #include <cuComplex.h>
    __global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *C)
    {
          const uint wA = %(MATRIX_SIZE)s;
          const uint wB = %(MATRIX_SIZE)s;

          // Block index
          const uint bx = blockIdx.x;
          const uint by = blockIdx.y;

          // Thread index
          const uint tx = threadIdx.x;
          const uint ty = threadIdx.y;

          // Index of the first sub-matrix of A processed by the block
          const uint aBegin = wA * %(BLOCK_SIZE)s * by;
          // Index of the last sub-matrix of A processed by the block
          const uint aEnd   = aBegin + wA - 1;
          // Step size used to iterate through the sub-matrices of A
          const uint aStep = %(BLOCK_SIZE)s;

          // Index of the first sub-matrix of B processed by the block
          const int bBegin = %(BLOCK_SIZE)s * bx;
          // Step size used to iterate through the sub-matrcies of B
          const uint bStep = %(BLOCK_SIZE)s * wB;

          // The element of the block sub-matrix that is computed by the thread
          cuFloatComplex Csub = make_cuFloatComplex(0,0);
          // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
          for (int a = aBegin, b = bBegin;
               a <= aEnd;
           a += aStep, b += bStep)
          {
               // Shared memory for the sub-matrix of A
           __shared__ cuFloatComplex As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
           // Shared memory for the sub-matrix of B
           __shared__ cuFloatComplex Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

           // Load the matrices from global memory to shared memory;
           // each thread loads one element of each matrix
           As[ty][tx] = make_cuFloatComplex(cuCrealf(A[a + wA*ty + tx]),cuCimagf(A[a + wA*ty + tx]));
           Bs[ty][tx] = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),cuCimagf(B[b + wA*ty + tx]));

           // Synchronize to make sure the matrices are loaded
           __syncthreads();

           // Multiply the two matrcies together
           // each thread computes one element of the block sub-matrix
           for(int k = 0; k < %(BLOCK_SIZE)s; ++k)
           {
                Csub = cuCaddf(Csub,cuCmulf(As[ty][k],Bs[k][tx]));
           } 

           // Synchronize to make sure that the preceding computation
           // is done before loading two new sub-matrices of A and B in the next iteration
           __syncthreads();
         }

         // Write the block sub-matrix to global memory
         // each thread writes one element
         const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
         C[c + wB*ty + tx] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
    }
    """

MATRIX_SIZE = 4
TILE_SIZE  = 2
BLOCK_SIZE = TILE_SIZE

a_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)
b_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)

a_cpu[:,:] = 1 + 1j*0
b_cpu[:,:] = 1 + 1j*2

# compute reference on the CPU to verify GPU computation
t1 = time()
c_cpu = np.dot(a_cpu, b_cpu)
t2 = time()
t_cpu = t2-t1

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# create empty gpuarry for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex)

# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
        'MATRIX_SIZE': MATRIX_SIZE,
        'BLOCK_SIZE': BLOCK_SIZE,
        }

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

# call the kernel on the card
t1 = time()
matrixmul(
        # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    # grid of multiple blocks
    grid = (MATRIX_SIZE/TILE_SIZE, MATRIX_SIZE/TILE_SIZE),
    # block of multiple threads
    block = (TILE_SIZE, TILE_SIZE, 1),
    )
t2 = time()
t_gpu = t2-t1

# print the results
print("-" * 80)
print("Matrix A (GPU): ")
print(a_gpu.get())

print("-" * 80)
print("Matrix B (GPU): ")
print(b_gpu.get())

print("-" * 80)
print("Matrix C (GPU): ")
print(c_gpu.get())

print("-" * 80)
print("Matrix C (CPU): ")
print(c_cpu)

print("-" * 80)
print("CPU-GPU Difference: ")
print(c_cpu-c_gpu.get())

print("CPU Time ", t_cpu)
print("GPU Time ", t_gpu)

np.allclose(c_cpu, c_gpu.get() )            

C代码是

 #include <stdio.h>
 #include <stdlib.h>
 #include <cuComplex.h>

/* __global__ void MatrixMulKernel(...)
 * This is the main Matrix Multiplication Kernel
 * It is executed on the device (GPU).
 */
__global__ void MatrixMulKernel(cuFloatComplex *Md, cuFloatComplex *Nd, cuFloatComplex *Pd, int Width, int TILE_WIDTH)
{
    // Calculate the row index of the Pd element and M
    int Row = blockIdx.y*TILE_WIDTH + threadIdx.y;
    // Calculate the column index of the Pd element and N
    int Col = blockIdx.x*TILE_WIDTH + threadIdx.x;

    cuFloatComplex Pvalue = make_cuFloatComplex(0,0);
    // each thread computes one element of the block sub-matrix
    for(int k = 0; k < Width; k++)
    {
        Pvalue = cuCaddf(Pvalue,cuCmulf(Md[Row*Width + k],Nd[k*Width + Col]));
    }
    Pd[Row*Width + Col] = make_cuFloatComplex(cuCrealf(Pvalue),cuCimagf(Pvalue));
}    

/* void MatrixMultiplication(...)
 * This is the stub function for the matrix multiplication kernel
 * It is executed on the host. It takes inputs from the main() function
 * and declares memory, copies data to the device, invokes the kernel
 * and copies the result from the device back to the host.
 */
void MatrixMultiplication(cuFloatComplex *M, cuFloatComplex *N, cuFloatComplex *P, int Width, int TILE_WIDTH)
{
    int size = Width*Width*sizeof(cuFloatComplex);
    cuFloatComplex *Md, *Nd, *Pd;

    // Transfer M and N to device memory
    cudaMalloc((void**) &Md, size);
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**) &Nd, size);
    cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

    // allocate P on the device
    cudaMalloc((void**) &Pd, size);

    // setup the execution configuration
    dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    // Launch the device computation kernel
    MatrixMulKernel<<<dimGrid,dimBlock>>>(Md, Nd, Pd, Width, TILE_WIDTH);

    // Transfer P from device to host
    cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

    // Free device matrices
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}    

/* void printMatrix(..)
 * This is a function used to print the matrix
 */
void printMatrix(cuFloatComplex *M, int Width)
{
    for(int i = 0; i < Width; i++)
    {
    for(int j = 0; j < Width; j++)
        {
        printf("  %f+i%f", cuCrealf(M[i*Width + j]), cuCimagf(M[i*Width + j]));
    }
        printf("\n");
}
}

int main()
{
/* Define dimension of matrix (Width x Width) */
    int Width = 4;
/* Define dimension of tile
     * This should be less than 512
     */
    int TILE_WIDTH = 2;
    /* Define pointers for row major matrices */
    cuFloatComplex *M, *N, *P;

    M = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
    N = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
    P = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));

    /* Fill matrices arbitrarily */
    for(int i = 0; i < Width; i++)
    {
        for(int j = 0; j < Width; j++)
        {
            M[i*Width + j] = make_cuFloatComplex(1,0);
            N[i*Width + j] = make_cuFloatComplex(1,2);
            P[i*Width + j] = make_cuFloatComplex(0,0);
        }
    }

    /* code to print matrices using helper function */
    printf("Matrix M is \n\n");
    printMatrix(M, Width);
    printf("\nMatrix N is \n\n");
    printMatrix(N, Width);

    /* Call the stub function for matrix multiplication */
    MatrixMultiplication(M, N, P, Width, TILE_WIDTH);

    printf("\nMatrix P is \n\n");
    printMatrix(P, Width);

    free(M);
    free(N);
    free(P);

    return 0;
}    

Python代码的输出是

Matrix C (GPU): 
[[  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j
    1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]    
 [  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j
    1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]
 [ -9.01080877e+306 -5.19870527e+306j  -1.45379609e+307 -8.65694841e+306j
   -4.14125486e+306 -2.15325816e+306j  -5.83708063e+306 -3.25935506e+306j]
 [ -1.44828853e+306 -1.44828853e+306j  -2.32949855e+306 -2.32949855e+306j
   -3.78945180e+306 -3.78945180e+306j  -6.54203686e+306 -6.54203686e+306j]]
--------------------------------------------------------------------------------
Matrix C (CPU): 
[[ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
 [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
 [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
 [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]]

C 输出为

Matrix P is 

  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000
  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000
  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000
  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000

如果有人能指出我的 Python 代码中的错误,我将不胜感激。我正在努力赶上论文的最后期限,而我的所有其余代码都在 Python 中,所以我没有时间将它移植到 C 中。

谢谢!

=======================

编辑:问题已解决

这很可能是一个精度问题。我通过用以下替换主机传输和空矩阵创建代码来修复它......

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
    b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))

    # create empty gpuarry for the result (C = A * B)
    c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)

希望这会有所帮助。

4

1 回答 1

1

这很可能是一个精度问题。我通过用以下替换主机传输和空矩阵创建代码来修复它......

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))

# create empty gpuarry for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)

但我确实发现使用 PyCUDA 乘以大型复数值矩阵时存在问题。例如,采用 MATRIX_SIZE = 5040 和 TILE_SIZE = 16(从硬件的角度来看这可能不是一个好的选择),CUDA-C 确实设法将矩阵相乘,但 PyCUDA 崩溃了。为什么会发生这种情况?

于 2013-08-01T02:49:48.230 回答