0

我正在尝试用 SYCL 和 oneAPI 实现二维互相关(2D cross-correlation)。我的想法是编写一个 Map 骨架(skeleton)来封装 oneAPI 调用,并通过一个指定目标类型(CPU 或 GPU/加速器)的参数来隐藏硬件目标的细节。

这是我的 Map 类:

    //Definition of Map Skeleton
template<class Tin, class Tout, class Function>
class Map {
private:
    Function fun;
public:
    Map() {
    }
    Map(Function f) :
            fun(f) {
    }
    //Overriding () operator
    std::vector<std::vector<Tout>> operator()(bool use_tbb,
            std::vector<std::vector<Tin>> &img,
            std::vector<std::vector<Tin>> &ker) {
        int img_row = img.size();
        int img_col = img[0].size();
        int filt_row = ker.size();
        int filt_col = ker[0].size();
        int out_row = img_row - filt_row;
        int out_col = img_col - filt_col;
        std::vector<std::vector<Tout>> out;

        if (use_tbb) {
            uTimer *timer = new uTimer("Executing Code On CPU");
            tbb::parallel_for(
                    tbb::blocked_range2d<int, int>(0, out_row, 0, out_col),
                    [&](tbb::blocked_range2d<int, int> &t) {
                        for (int n = t.rows().begin(); n < t.rows().end();
                                ++n) {
                            for (int m = t.cols().begin(); m < t.cols().end();
                                    ++m) {
                                out[n][m] = fun(
                                        slice_matrix(img, n, m, filt_row,
                                                filt_col), ker);
                            }
                        }
                    });
            timer->~uTimer();
            return out;
        } else {

            /*change 2D Matrices to the 1D linear arrays,
             *
             *and operate on them as contiguous blocks */
            size_t M = img_row + img_col;
            size_t N = filt_row + filt_col;
            //size_t O = out_row + out_col;
            size_t O_row = out_row;
            size_t O_col = out_col;
            std::vector<Tin> img_host;
            std::vector<Tin> ker_host;
            std::vector<Tout> out_gpu;

            /* A 2D std::vector<std::vector<T>>
             * does not have elements stored contiguously in the memory.
             * Thus I define a vector<T> and operate on them as contiguous blocks.*/

            //Define Buffer for
            sycl::buffer<Tin, 1> img_buffer(img_host.data(), M);
            sycl::buffer<Tin, 1> ker_buffer(ker_host.data(), N);
            sycl::buffer<Tin, 2> out_buffer(out_gpu.data(), sycl::range<2> {
                    O_row, O_col });

            //Profiling GPU

            // Initialize property list with profiling information
            sycl::property_list propList {
                    sycl::property::queue::enable_profiling() };
            // Build the command queue (constructed to handle event profling)
            sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
                    propList);
            // print out the device information used for the kernel code
            std::cout << "Device: "
                    << gpuQueue.get_device().get_info<sycl::info::device::name>()
                    << std::endl;

            std::cout << "Compute Units: "
                    << gpuQueue.get_device().get_info<
                            sycl::info::device::max_compute_units>()
                    << std::endl;

            auto start_overall = std::chrono::system_clock::now();

            auto event = gpuQueue.submit(
                    [&](sycl::handler &h) {
                        //local copy of fun
                        auto f = fun;
                        sycl::accessor img_accessor(img_buffer, h,
                                sycl::read_only);
                        sycl::accessor ker_accessor(ker_buffer, h,
                                sycl::read_only);
                        sycl::accessor out_accessor(out_buffer, h,
                                sycl::write_only);
                        h.parallel_for(sycl::range<2> { O_row, O_col },
                                [=](sycl::id<2> index) {
                                    int row = index[0];
                                    int col = index[1];
                                    out_accessor[row][col] = f(
                                            slice_matrix(img_accessor, O_row,
                                                    O_col, filt_row, filt_col),
                                            ker_accessor);

                                });

                    });

            event.wait();
            auto end_overall = std::chrono::system_clock::now();
            cl_ulong submit_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_submit>();
            cl_ulong start_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_start>();
            cl_ulong end_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_end>();
            auto submission_time = (start_time - submit_time) / 1000000.0f;
            std::cout << "Submit Time: " << submission_time << " ms"
                    << std::endl;
            auto execution_time = (end_time - start_time) / 1000000.0f;
            std::cout << "Execution Time: " << execution_time << " ms"
                    << std::endl;
            auto execution_overall = std::chrono::duration_cast<
                    std::chrono::milliseconds>(end_overall - start_overall);
            std::cout << "Overall Execution Time: " << execution_overall.count()
                    << " ms" << std::endl;
        }
        ;
        return out;
    }

};

这是我的 slice_matrix:

    //Function which slices a specific part of my matrix
/**
 * Copies an r-by-c window out of a matrix stored as a vector of rows.
 *
 * @param mat  source matrix; taken by const reference so the whole matrix
 *             is no longer copied on every call
 * @param i    row index of the window's top-left corner
 * @param j    column index of the window's top-left corner
 * @param r    number of rows to copy
 * @param c    number of columns to copy
 * @return a new r-by-c matrix with out[k][l] == mat[i + k][j + l];
 *         empty when r <= 0.
 *
 * No bounds checking is performed: the caller must ensure that c >= 0 and
 * that rows i .. i + r - 1 exist and each holds at least j + c elements.
 */
template<class T>
std::vector<std::vector<T>> slice_matrix(
        const std::vector<std::vector<T>> &mat, int i, int j, int r, int c) {

    std::vector<std::vector<T>> out;
    if (r > 0) {
        out.reserve(r);
    }

    for (int k = 0; k < r; ++k) {
        // Build each row directly from the source range; no zero-fill and
        // no temporary row that is then copy-assigned.
        const auto first = mat[i + k].begin() + j;
        out.emplace_back(first, first + c);
    }

    return out;
}
;

问题出在 SYCL 部分的 parallel_for 中,即下面这个调用:

out_accessor[row][col] = f(
                                            slice_matrix(img_accessor, O_row,
                                                    O_col, filt_row, filt_col),
                                            ker_accessor);

                                });

编译时报告了如下错误:

no matching function for call to 'slice_matrix'

我尝试把 slice_matrix 移到 Map 类内部,但错误依旧。我还考虑到 SYCL 规范中“本规范定义的 SYCL 设备代码不支持虚函数调用”这一限制,因此定义了 slice_matrix 的本地副本,但仍然报错。

我无法理解如何解决此错误。

4

1 回答 1

0

您传给 slice_matrix 的是 sycl::accessor 类型的对象,但 slice_matrix 的签名是:

//Function which Slice a specific part of my matricx
template<class T>
std::vector<std::vector<T>> slice_matrix(std::vector<std::vector<T>> mat, int i, int j, int r, int c) 

它的第一个参数是 std::vector<std::vector<T>>,所以实参类型与签名不匹配,编译器找不到可调用的函数。

您需要另外提供一个接受访问器(accessor)对象而不是 std::vector 的 slice_matrix 版本。

于 2021-07-15T13:09:37.880 回答