您可以使用 OpenCL 的 async_work_group_strided_copy() API 调用。
感谢 @DarkZeros 的评论。下面是 pyopencl 中的一个小例子。假设有一条 4 x 1 的 RGB 图像小条带,表示如下:
# 4 x 1 RGB strip stored interleaved (R, G, B per pixel). The dtype is pinned
# to int32 because the OpenCL kernel reads the buffer as 32-bit `int`; NumPy's
# default integer type is platform dependent and is often 64-bit.
img = np.array([58, 83, 39, 157, 190, 199, 64, 61, 5, 214, 141, 6], dtype=np.int32)
如果您想访问四个红色通道的值,即 [58 157 64 214],可以这样做:
def test_asyc_copy_stride_to_local(self):
    """Fetch the red channel of an interleaved RGB strip with a strided async copy.

    Uploads a 4 x 1 RGB strip to the device, runs the
    ``asynCopyToLocalWithStride`` kernel, reads the output buffer back,
    prints it and checks it against the red values of the input.
    """
    # Create context, queue, program first.
    # NOTE(review): the setup is elided in this snippet; `ctx`, `queue`, `mf`
    # (presumably cl.mem_flags) and `prog` must be defined here.
    # ....

    # number of R channels
    nb_of_el = 4
    # Channels per pixel (R, G, B): red values sit at every third element.
    stride = 3

    # The kernel declares its buffers as `int` (32-bit), so the host array
    # must be int32 as well. NumPy's default integer is 64-bit on many
    # platforms, which would make the strided read pick up wrong values.
    img = np.array(
        [58, 83, 39, 157, 190, 199, 64, 61, 5, 214, 141, 6], dtype=np.int32
    )
    nbytes = nb_of_el * img.itemsize

    cl_input = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img)
    # buffer used to check if the copy is correct
    cl_output = cl.Buffer(ctx, mf.WRITE_ONLY, size=nbytes)
    lcl_buf = cl.LocalMemory(nbytes)

    # One work-item per red value; the work-group size is left to the runtime.
    prog.asynCopyToLocalWithStride(
        queue, (nb_of_el,), None, cl_input, cl_output, lcl_buf
    )

    result = np.zeros(nb_of_el, dtype=np.int32)
    cl.enqueue_copy(queue, result, cl_output).wait()
    # Parenthesised form works on both Python 2 and Python 3.
    print(result)

    # Actually verify the copy instead of relying on visual inspection.
    np.testing.assert_array_equal(result, img[::stride])
内核:
/*
 * Gathers every third int from `in` into local memory with a strided async
 * copy, then writes the gathered values to `out`.
 *
 *   in       - global source buffer; elements 0, 3, 6 and 9 are read.
 *   out      - global destination; work-item i writes out[i].
 *   localBuf - local scratch buffer; must hold at least 4 ints.
 *
 * NOTE(review): localBuf and out are indexed by the global id, so this
 * assumes the global work size does not exceed 4 - confirm against the caller.
 */
kernel void asynCopyToLocalWithStride(global int *in, global int *out, local int *localBuf){
    // Copy 4 elements, the stride = 3 (RGB).
    const size_t num_elements = 4;
    const size_t src_stride = 3;

    const int idx = get_global_id(0);

    // Clear the slot first so stale local memory cannot pass for copied data.
    localBuf[idx] = 0;

    // The async copy is carried out by the work-group as a whole. Without
    // this barrier a work-item's zero-write could land after the copy has
    // already filled its slot and wipe out the copied value.
    barrier(CLK_LOCAL_MEM_FENCE);

    event_t ev = async_work_group_strided_copy(localBuf, in, num_elements, src_stride, 0);
    // Block until the copy has completed before reading the local buffer.
    wait_group_events(1, &ev);

    out[idx] = localBuf[idx];
}