我一直在尝试在 SYCL 中实现简单的矩阵乘法,但是一旦内核启动,我总是会遇到分段错误。我的代码如下 -
class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
typedef cl::sycl::buffer<float, 1> sycl_buffer;
void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
queue deviceQueue, int numThreads){
/*
* Naive Matrix Multiplication of MxN and NxK
* */
std::cout<<"Starting Matrix Multiplication"<<std::endl;
nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(M / numThreads + 1, K / numThreads + 1),
cl::sycl::range<2>(numThreads, numThreads));
deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh){
auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
auto result_accessor = result.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
(nd_item<2> ndItem){
auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);
if(row_index < M && column_index < K){
float sum = 0.0f;
for (int i = 0; i < N; i++) {
sum += MatA_accessor[N * row_index + i] * MatB_accessor[ i * N + column_index];
}
result_accessor[K * row_index + column_index] = sum;
}
});
});
deviceQueue.wait();
std::cout<<"Done with Matmul"<<std::endl;
}
int main() {
size_t M = 512;
size_t N = 512;
size_t K = 512;
auto matA = (float*) malloc(M * N * sizeof(float ));
auto matB = (float*) malloc(N * K * sizeof(float ));
auto result = (float*) malloc(M * K * sizeof(float ));
for (int i=0; i< M*N; i++)
matA[i] = 2.0f;
for (int i=0; i< N*K; i++)
matB[i] = 2.0f;
for (int i = 0; i < M*K; ++i)
result[i] = 69.0f;
queue Queue;
auto device = Queue.get_device();
auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;
auto thread_max = int(std::sqrt(max_work_group_size));
std::cout<<thread_max<<std::endl;
buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float )));
buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float )));
buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float )));
auto mata_shared = std::make_shared<buffer<float, 1>>(mata_buffer);
auto matb_shared = std::make_shared<buffer<float, 1>>(matb_buffer);
auto result_shared = std::make_shared<buffer<float, 1>>(result_buffer);
naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);
Queue.submit([result_shared, result](handler& cgh){
auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
cgh.copy(resultAccessor, result);
});
Queue.wait();
std::cout<<"Here";
for(int i=0; i<100; i++)
std::cout<<result[i]<<" ";
std::cout<<std::endl;
}
输出如下 -
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
Segmentation fault (core dumped)
我无法弄清楚分段错误的根源。任何帮助表示赞赏。
提前致谢
编辑 --g
作为编译器标志传递以获取调试符号,输出如下 -
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Aborted (core dumped)
并在 GDB 下运行它 - 这是输出
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./computecpp_test...
(gdb) r
Starting program: /home/atharva/CLionProjects/computecpp_test/computecpp_test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7066700 (LWP 18128)]
[New Thread 0x7ffff62e5700 (LWP 18133)]
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Thread 1 "computecpp_test" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb)
这是我的 CMake,仅供参考,以便您知道正在传递的编译器标志
cmake_minimum_required(VERSION 3.17)
project(computecpp_test)
set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)
set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)
include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))
add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)
更新 - 在调试期间,我将所有索引更改为 0,但仍然抛出分段错误(如果使用 -g 编译器标志,则无效对象错误),这让我相信数据访问不是问题,而是其他问题.
回溯如下 -
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007ffff73c8859 in __GI_abort () at abort.c:79
#2 0x00007ffff779d911 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007ffff77a938c in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4 0x00007ffff77a93f7 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5 0x00007ffff77a96a9 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6 0x00007ffff7c63d61 in void cl::sycl::detail::handle_sycl_log<cl::sycl::invalid_object_error>(std::unique_ptr<cl::sycl::detail::sycl_log, std::default_delete<cl::sycl::detail::sycl_log> >&&) ()
from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#7 0x00007ffff7c5d0bd in cl::sycl::detail::trigger_sycl_log(cl::sycl::log_type, char const*, int, int, cl::sycl::detail::cpp_error_code, cl::sycl::detail::context const*, char const*) ()
from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#8 0x000000000040ab25 in cl::sycl::program::create_program_for_kernel<naive_MatMul_kernel> (c=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/program.h:510
#9 0x000000000040552b in cl::sycl::handler::parallel_for_impl<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}>(cl::sycl::detail::nd_range_base const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&, int) (this=0x6b1d40, ndRange=..., functor=..., dimensions=2)
at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:423
#10 0x0000000000405485 in cl::sycl::handler::parallel_for<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}, 2>(cl::sycl::nd_range<2> const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&) (this=0x6b1d40, ndRange=..., functor=...)
at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:471
#11 0x000000000040536e in naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const (
this=0x7fffffffd500, cgh=...) at main.cpp:49
#12 0x000000000040518f in cl::sycl::detail::command_group::submit_handler<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0, std::shared_ptr<cl::sycl::detail::queue> const&, cl::sycl::detail::standard_handler_tag) (this=0x7fffffffd738, cgf=..., fallbackQueue=std::shared_ptr<class cl::sycl::detail::queue> (empty) = {...}) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/command_group.h:179
#13 0x000000000040391f in cl::sycl::queue::submit<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0) (this=0x7fffffffdaa8, cgf=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/queue.h:519
#14 0x00000000004037bb in naiveMatrixMultiplication (MatA=..., MatB=..., result=..., M=512, N=512, K=512, deviceQueue=..., numThreads=16) at main.cpp:42
#15 0x0000000000404adb in main () at main.cpp:220
本质上它停在program.h
文件中的这段代码处
COMPUTECPP_CL_ERROR_CODE_MSG(
CL_SUCCESS, detail::cpp_error_code::KERNEL_NOT_FOUND_ERROR,
c.get_impl().get(),
"Unable to retrieve kernel function, is integration header included?")
}
显然,它无法检索内核函数。