parallel-processing - OpenCL 矩阵乘法入队/缓冲区读取

Question

我正在 OpenCL 中尝试一个基本的矩阵乘法程序。我相信我的问题在于我的队列和/或缓冲区读取，因为结果矩阵的输出完全不正确，矩阵 A 和 B 的第一行也不正确。我是 OpenCL 新手，我一直在努力我反对这个已经有一段时间了，也许这里有人可以给我一个提示我哪里出错了？

主机代码：

#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>

/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB

using namespace std;

/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,  
      int rowSize);

//*************
// Main Program
//*************
int main(int argc, char* argv[]) {

    /* Check for valid matrix sizes */
    if (numColsA != numRowsB) {
        cout << "ERROR: Invalid matrix dimensions." << endl;
    } else {

    srand(2013); // Set random seed

    /* Allocate memory for matrices A, B, and C */
    unsigned int sizeA = numRowsA * numColsA;
    unsigned int sizeB = numRowsB * numColsB;
    unsigned int sizeC = numRowsC * numColsC;
    unsigned int memoryA = sizeof(float) * sizeA;
    unsigned int memoryB = sizeof(float) * sizeB;
    unsigned int memoryC = sizeof(float) * sizeC;

    /*
        Allocate memoryA/memoryB/memoryC size blocks of bytes
        (cast from void*)
    */
    float* blockA = (float*) malloc(memoryA);
    float* blockB = (float*) malloc(memoryB);
    float* blockC = (float*) malloc(memoryC);

    /* Initialize matrices A and B */
    initMatrix(blockA, sizeA);
    initMatrix(blockB, sizeB);

    /* Display matrices A and B */
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);

    cl_int err;            // Error code
    string platformVendor; // Platform vendor

    /* Create list of platforms */
    cl::vector < cl::Platform > platformList;
    cl::Platform::get(&platformList);

    /*
        Display potential Platform list generation error. If the
        platform list size does not equal 0, CL_SUCCESS (0) is
        sent to the function. If the platform list size does
        equal 0, -1 is sent to the function.
    */
    checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
            "Platform");

    /*
        Replace empty value of platformVendor with device vendor
        name
    */
    platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
        &platformVendor);

    /* Properties for Context constructor (Use unknown) */
    cl_context_properties cprops[3] =
        {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) (platformList[0]) (),
        0
        };

    /* Create context */
    cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
        &err);

    /* Display potential Context constructor error */
    checkErr(err, "Context");

    /* Create buffer for matrix A */
    cl::Buffer deviceMemA(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);

    /* Create buffer for matrix B */
    cl::Buffer deviceMemB(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);

    /* Create buffer for matrix C */
    cl::Buffer deviceMemC(context,
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);

    /* Create buffer for row (A) and col (C) */
    cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numRowsA, &err);
    cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numColsC, &err);

    /* Display potential Buffer constructor error */
    checkErr(err, "Buffers");

    /* Get list of devices */
    cl::vector<cl::Device> devices =
        context.getInfo<CL_CONTEXT_DEVICES>();

    /* Check for at least one device, if not throw error */
    checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");

    /* Read input from .cl file */
    ifstream file("matrixMult1_kernels.cl");

    /* Check for potential problem opening .cl input file */
    checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");

    /* Store file contents in a string */
    string prog(istreambuf_iterator<char>(file),
            (istreambuf_iterator<char>()));

    /* Create source object */
    cl::Program::Sources source(1, make_pair(prog.c_str(),
        prog.length()+1));

    /* Create program for given context and source */
    cl::Program program(context, source);

    err = program.build(devices, ""); // Check for build error

    /* Display potential program build error */
    checkErr(err, "Program Build");

    /* Create kernel */
    cl::Kernel kernel(program, "matrixMul", &err);

    /* Display potential Kernel constructor error */
    checkErr(err, "Kernel");

    /*
        Set matrixMul arguments, error checking after each
        argument
    */
    err = kernel.setArg(0, deviceMemA);
    checkErr(err, "Arg0");
    err = kernel.setArg(1, deviceMemB);
    checkErr(err, "Arg1");
    err = kernel.setArg(2, deviceMemC);
    checkErr(err, "Arg2");
    err = kernel.setArg(3, rowA);
    checkErr(err, "Arg3");
    err = kernel.setArg(4, colC);
    checkErr(err, "Arg4");

    /* Create command queue */
    cl::CommandQueue queue(context, devices[0], 0, &err);

    /* Display potential CommandQueue constructor error */
    checkErr(err, "Command Queue");

    /* Create event object */
    cl::Event event;

    cl::NDRange global(3, 3);
    cl::NDRange local(1, 1);

    /* Enqueue the kernel */
    err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
        NULL, &event);

    /* Display potential enqueueing error */
    checkErr(err, "Enqueue");

    /* Wait until kernel has completed execution before continuing */
    event.wait();

    /* Read kernel result back into host memory */
    err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
        blockC, NULL, &event);

        checkErr(err, "C");

    err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
        blockA, NULL, &event);
    err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
        blockB, NULL, &event);

    /* Display potential kernel read error */
    checkErr(err, "Read Buffer");

    /* Display matrices */
        cout << endl;
        cout << "After:" << endl;
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);
    printMatrix("Matrix C", blockC, sizeC, numColsC);

    /* Free up memory */
    free(blockA);
    free(blockB);
    free(blockC);
    }
}

//--------------------------------------------------------------------
// checkErr - Inline error checking function for OpenCL portion of
//            host program.
//
// PRE:  err is of type int in OpenCL; name is a string.
// POST: The program is terminated after display an error message
//       indicating the location of the error and the error code.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {

    /* Check error code against OpenCL success constant */
    if (err != CL_SUCCESS) {

    /*
        Display an error message stating the error origin and
        error number.
    */
    std::cerr << "ERROR: " << name << " (" << err << ")"
              << std::endl;

    exit(EXIT_FAILURE); // Terminates process with status code 0
    }
}

//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
//              matrix.
//
// PRE:  matrix is a pointer to a block of bytes in memory; numIndices
//       is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
//       float value.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {

    /*
    Loop through the block of bytes, assigning a random float
    for each index of the matrix
    */
    for (int i = 0; i < numIndices; i++) {

    /* Assign a random float between 0 and 1 at this byte */
    matrix[i] = rand() / (float) RAND_MAX;
    }
}

//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE:  displayName is a string; matrix is a pointer to a block of
//       bytes in memory; numIndices an integer indicating the number
//       of indices in the matrix being displayed (read left-to-right,
//       top-to-bottom); rowSize is an integer indicating the number
//       of elements in one row of the matrix.
// POST: A readable version of the matrix is displayed.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
          int rowSize) {

    /* Output display name of matrix */
    cout << "\n" << displayName << ":" << endl;

    /* Loop through each indice of the matrix */
    for (int i = 0; i < numIndices; i++) {
    cout << matrix[i]; // Display value at this indice

    /* Check for next row of the matrix */
    if (((i + 1) % rowSize) == 0) {
        cout << endl; // Line break
    } else {
        cout << "  |  "; // Indice separator
    }
    }
}

核心：

// matrixMult1_kernels.cl
// Multiply two matrices A * B = C
// Device code.

// OpenCL Kernel
__kernel void
matrixMul(__global float* A, 
          __global float* B, 
      __global float* C, 
          int wA, int wB) {

   // 2D Thread ID
   int tx = get_local_id(0);
   int ty = get_local_id(1);

   // value stores the element 
   // that is computed by the thread
   float value = 0;

   for (int k = 0; k < wA; ++k)
   {
       float elementA = A[ty * wA + k];
       float elementB = B[k * wB + tx];
       value += elementA * elementB;
   }

   // Write the matrix to device memory each 
   // thread writes one element
   C[ty * wA + tx] = value;
}

样本输出：

矩阵 A：

0.398748 | 0.999793 | 0.206833

0.354238 | 0.674347 | 0.492022

0.707017 | 0.353635 | 0.430668

矩阵 B：

0.91598 | 0.0260167 | 0.881732

0.810974 | 0.193091 | 0.589857

0.229151 | 0.0657822 | 0.965835

错误：C (-30)

我正在使用仅支持 OpenCL 1.1 的 NVIDIA GeForce 9800 GT。在这里的任何帮助将不胜感激。

谢谢，

乔

score 2 · Accepted Answer

输入矩阵 A 和 B 的数据不会传递给设备。创建缓冲区时：

cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)

blockA参数被忽略，因为标志没有指定如何使用它。您需要至少添加CL_MEM_COPY_HOST_PTR以使用blockA的内容初始化缓冲区。

或者，您可以在创建缓冲区后调用clEnqueueWriteBuffer发送数据。

parallel-processing - OpenCL 矩阵乘法入队/缓冲区读取

1 回答 1

Related

Reference