opencl - SYCL 内核中的分段错误

Question

我一直在尝试在 SYCL 中实现简单的矩阵乘法，但是一旦内核启动，我总是会遇到分段错误。我的代码如下 -

class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
typedef cl::sycl::buffer<float, 1> sycl_buffer;


void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                               queue deviceQueue, int numThreads){

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout<<"Starting Matrix Multiplication"<<std::endl;
    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(M / numThreads + 1, K / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh){

        auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
        auto result_accessor = result.get_access<access::mode::read_write>(cgh);


        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem){

            auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
            auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);

            if(row_index < M && column_index < K){
                float sum = 0.0f;
                for (int i = 0; i < N; i++) {
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[ i * N + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout<<"Done with Matmul"<<std::endl;
}
 

int main() {

    size_t M  = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float*) malloc(M * N * sizeof(float ));
    auto matB = (float*) malloc(N * K * sizeof(float ));
    auto result =  (float*) malloc(M * K * sizeof(float ));

    for (int i=0; i< M*N; i++)
         matA[i] = 2.0f;
    for (int i=0; i< N*K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M*K; ++i)
        result[i] = 69.0f;

    queue Queue;

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;
    auto thread_max  = int(std::sqrt(max_work_group_size));
    std::cout<<thread_max<<std::endl;


    buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float )));
    buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float )));
    buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float )));

    auto mata_shared = std::make_shared<buffer<float, 1>>(mata_buffer);
    auto matb_shared = std::make_shared<buffer<float, 1>>(matb_buffer);
    auto result_shared = std::make_shared<buffer<float, 1>>(result_buffer);

    naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);

    Queue.submit([result_shared, result](handler& cgh){
       auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
       cgh.copy(resultAccessor, result);
    });
    Queue.wait();

    std::cout<<"Here";

    for(int i=0; i<100; i++)
        std::cout<<result[i]<<"  ";
    std::cout<<std::endl;

}

输出如下 -

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
Segmentation fault (core dumped)

我无法弄清楚分段错误的根源。任何帮助表示赞赏。

提前致谢

编辑 --g作为编译器标志传递以获取调试符号，输出如下 -

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Aborted (core dumped)

并在 GDB 下运行它 - 这是输出

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./computecpp_test...
(gdb) r
Starting program: /home/atharva/CLionProjects/computecpp_test/computecpp_test 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7066700 (LWP 18128)]
[New Thread 0x7ffff62e5700 (LWP 18133)]
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'

Thread 1 "computecpp_test" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb)

这是我的 CMake，仅供参考，以便您知道正在传递的编译器标志

cmake_minimum_required(VERSION 3.17)
project(computecpp_test)

set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)

set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)

include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))

add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)

更新 - 在调试期间，我将所有索引更改为 0，但仍然抛出分段错误（如果使用 -g 编译器标志，则无效对象错误），这让我相信数据访问不是问题，而是其他问题.

回溯如下 -

#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007ffff73c8859 in __GI_abort () at abort.c:79
#2  0x00007ffff779d911 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x00007ffff77a938c in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007ffff77a93f7 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007ffff77a96a9 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007ffff7c63d61 in void cl::sycl::detail::handle_sycl_log<cl::sycl::invalid_object_error>(std::unique_ptr<cl::sycl::detail::sycl_log, std::default_delete<cl::sycl::detail::sycl_log> >&&) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#7  0x00007ffff7c5d0bd in cl::sycl::detail::trigger_sycl_log(cl::sycl::log_type, char const*, int, int, cl::sycl::detail::cpp_error_code, cl::sycl::detail::context const*, char const*) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#8  0x000000000040ab25 in cl::sycl::program::create_program_for_kernel<naive_MatMul_kernel> (c=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/program.h:510
#9  0x000000000040552b in cl::sycl::handler::parallel_for_impl<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}>(cl::sycl::detail::nd_range_base const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&, int) (this=0x6b1d40, ndRange=..., functor=..., dimensions=2)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:423
#10 0x0000000000405485 in cl::sycl::handler::parallel_for<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}, 2>(cl::sycl::nd_range<2> const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&) (this=0x6b1d40, ndRange=..., functor=...)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:471
#11 0x000000000040536e in naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const (
    this=0x7fffffffd500, cgh=...) at main.cpp:49
#12 0x000000000040518f in cl::sycl::detail::command_group::submit_handler<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0, std::shared_ptr<cl::sycl::detail::queue> const&, cl::sycl::detail::standard_handler_tag) (this=0x7fffffffd738, cgf=..., fallbackQueue=std::shared_ptr<class cl::sycl::detail::queue> (empty) = {...}) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/command_group.h:179
#13 0x000000000040391f in cl::sycl::queue::submit<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0) (this=0x7fffffffdaa8, cgf=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/queue.h:519
#14 0x00000000004037bb in naiveMatrixMultiplication (MatA=..., MatB=..., result=..., M=512, N=512, K=512, deviceQueue=..., numThreads=16) at main.cpp:42
#15 0x0000000000404adb in main () at main.cpp:220

本质上它停在program.h文件中的这段代码处

      COMPUTECPP_CL_ERROR_CODE_MSG(
          CL_SUCCESS, detail::cpp_error_code::KERNEL_NOT_FOUND_ERROR,
          c.get_impl().get(),
          "Unable to retrieve kernel function, is integration header included?")
    }

显然，它无法检索内核函数。

score 3 · Accepted Answer

以下是我在您的代码中发现的一些问题：

以下：

 Queue.submit([result_shared, result](handler& cgh){
   auto resultAccessor = result_shared->get_access<access::mode::read(cgh);
   cgh.copy(resultAccessor, result);
 });
 Queue.wait();

没用，因为sycl::buffers 旨在为您进行同步。一旦缓冲区被破坏，您就可以保证将内存复制回主机（否则我相信它处于未定义状态）。

您已将缓冲区声明为buffer<float, 1>，这意味着您的 SYCL 缓冲区包含基础数据的类型。构建缓冲区时，您只需要传递元素的数量而不是其大小（以字节为单位）。这就是为什么您的代码在提交内核时会崩溃（这是设备发生隐式复制的地方）。

写吧：

buffer<float, 1> mata_buffer(matA, range<1>(M * N));
buffer<float, 1> matb_buffer(matB, range<1>(N * K));
buffer<float, 1> result_buffer(result, range<1>(M * K));

事实证明，您从中获得的默认队列queue Queue;不一定是主机设备。在某些实现中，此行为允许使用环境变量更改您正在运行的设备。在我的实现中，queue Queue;返回给我一个 GPU，而您的原始代码失败（因为它需要执行上述复制）。但是，当在主机设备上运行时queue Queue{host_selector{}};不起作用，因为我正在运行的 SYCL 实现不执行，希望从主机到主机的 memcpy。
您正在使用max_work_group_size它，就好像您认为它是真正的工作组规模一样。不是，它只是一个提示，实际上可以是从 0 到 2**64-1 的任何值。考虑做一些边界检查。
你在你的nd_range<2>. 签名是：

sycl::nd_range<2>(sycl::range<2> globalSize, sycl::range<2> localSize);

的每个维度都globalSize应该是中每个维度的倍数localSize。

所以你应该做

auto local_range = sycl::range<2>(numThreads, numThreads);
auto global_range = sycl::range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
sycl::nd_range<2> launchParams = nd_range<2>(global_range, local_range);

乘法的目的nd_range是获得您的设备将处理的“真实”全局范围，因为它可能比您预期的要大一些。

最后一点：我不太确定为什么要将缓冲区包装在共享指针中。首先，它们不是“重型结构”，它是一个不保存内存的包装器。您可能已经注意到它甚至不需要设备。此外，从不同的地方访问单个缓冲区（我猜是共享指针的目的）可能会导致 UB。

最后你不需要手动进行偏移计算，你可以使用

row_index = ndItem.get_global_id(0);

有了这些建议，您的代码是：


void naiveMatrixMultiplication(float* MatA, float* MatB, float* result, size_t M, size_t N, size_t K, queue deviceQueue, size_t numThreads) {

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout << "Starting Matrix Multiplication" << std::endl;

    buffer<float, 1> mata_buffer(MatA, range<1>(M * N));
    buffer<float, 1> matb_buffer(MatB, range<1>(N * K));
    buffer<float, 1> result_buffer(result, range<1>(M * K));

    auto local_range = range<2>(numThreads, numThreads);
    auto global_range = range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
    auto launchParams = nd_range<2>(global_range, local_range);

    deviceQueue.submit([&, M, N, K, launchParams](handler &cgh) {
        auto MatA_accessor = mata_buffer.get_access<access::mode::read>(cgh);
        auto MatB_accessor = matb_buffer.get_access<access::mode::read>(cgh);
        auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);
        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem) {

            auto column_index = ndItem.get_global_id(1);
            auto row_index = ndItem.get_global_id(0);

            if (row_index < M && column_index < K) {
                float sum = 0.0f;
                for (int i = 0; i < N; i++) {
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * N + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout << "Done with Matmul" << std::endl;
}


int main() {
    size_t M = 512;
    size_t N = 512;
    size_t K = 512;
    auto matA = (float *) malloc(M * N * sizeof(float));
    auto matB = (float *) malloc(N * K * sizeof(float));
    auto result = (float *) malloc(M * K * sizeof(float));

    for (int i = 0; i < M * N; i++)
        matA[i] = 2.0f;
    for (int i = 0; i < N * K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M * K; ++i)
        result[i] = 69.0f;

    queue Queue{gpu_selector{}};

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<info::device::max_work_group_size>();
    std::cout << device.get_info<info::device::name>() << std::endl;
    auto thread_max = std::sqrt(max_work_group_size);
    std::cout << thread_max << std::endl;

    naiveMatrixMultiplication(matA, matB, result, M, N, K, Queue, thread_max);
    std::cout << "Here";

    for (int i = 0; i < 100; i++)
        std::cout << result[i] << "  ";
    std::cout << std::endl;
}

编辑：我要补充一点，在 computecpp-sdk 存储库中有一个用 SYCL 编写的矩阵乘法示例（以获得更多灵感）。

opencl - SYCL 内核中的分段错误

1 回答 1

Related

Reference