When I posted this question I did not expect much, but it turns out that for large matrices such as 128x10000 there is a much better way. I wrote a warpReduce kernel with a block size of 32, which lets it do the entire reduction using shuffle xor.
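To make the shuffle pattern concrete, here is a small CPU-side F# sketch (plain arrays, no GPU code) of what the butterfly xor reduction does across the 32 lanes of a warp; it only illustrates the idea behind __shfl_xor and is not one of the kernels below:

// CPU illustration of the butterfly (xor) warp reduction: each "lane" i
// adds the value held by lane (i ^^^ mask) for masks 16, 8, 4, 2, 1.
// After the last step every lane holds the sum of all 32 values, which
// is what the __shfl_xor version computes on the GPU.
let butterflyReduceSim (lanes: float32[]) =
    let mutable v = Array.copy lanes
    for mask in [16; 8; 4; 2; 1] do
        v <- Array.init v.Length (fun i -> v.[i] + v.[i ^^^ mask])
    v

let demo = butterflyReduceSim (Array.init 32 float32)
// every element of demo is 496.0f, the sum of 0..31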
For 100 iterations on 128x100000, the first version, which used 64 blocks per grid (and 32 threads per block), took 0.5s. For the CUB row reduce it took 0.25s.
When I increased the blocks per grid to 256 I got a nearly 4x speedup, to roughly 1.5s. At 384 blocks per grid it takes 1.1s, and increasing the number of blocks does not seem to improve performance from there.
For the problem sizes I am actually interested in, the improvement is not nearly as dramatic.
For the 128x1024 and 128x512 cases, 10000 iterations:
For 1024: 1s vs 0.82s in favor of warpReduce. For 512: 0.914s vs 0.873s in favor of warpReduce.
For small matrices, any speedup from the extra parallelism gets eaten up by the kernel launch time.
For 256: 0.94s vs 0.78s in favor of warpReduce. For 160: 0.88s vs 0.77s in favor of warpReduce.
This was tested on a GTX 970.
The figures would likely be different for Kepler and earlier nVidia cards, since Maxwell raised the limit on resident blocks per SM from 16 to 32, which improves multiprocessor occupancy.
I am satisfied with this, since the performance improvement is nice and I had not actually managed to write a kernel that uses shared memory correctly before reaching for the Cub block reduce. I had forgotten how painful Cuda can be at times. It is remarkable that the version which does not use shared memory is so competitive.
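The numbers above are just totals over repeated kernel launches; a minimal F# timing helper along these lines (a sketch, not the exact harness behind the figures) is enough for that kind of measurement, with the caveat that for GPU kernels the worker must also be synchronized before the clock is stopped, otherwise only the launch queueing is measured:

open System.Diagnostics

// Runs an action a given number of times and prints the total elapsed
// seconds. For GPU work, synchronize the device after the loop so the
// measurement covers the actual kernel execution.
let timeIterations (iterations: int) (run: unit -> unit) =
    let sw = Stopwatch.StartNew()
    for _ in 1 .. iterations do run ()
    sw.Stop()
    printfn "%d iterations: %.3f s" iterations sw.Elapsed.TotalSeconds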
Here are the two modules I tested with:
type rowModule(target) =
    inherit GPUModule(target)

    let grid_size = 64
    let block_size = 128

    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1), worker.Device.Arch)

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // col points to the column this block starts on.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then
                means.[col] <- column_mean_partial
                column_mean_shared := column_mean_partial
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
type rowWarpModule(target) =
    inherit GPUModule(target)

    let grid_size = 384
    let block_size = 32

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // col points to the column this block starts on.
        let mutable col = blockIdx.x
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            // Butterfly reduction: after the last xor shuffle every lane
            // in the warp holds the full sum.
            let inline butterflyWarpReduce (value:float32) =
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0 then
                stds.[col] <- sqrt(variance_sum)
                means.[col] <- column_mean
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
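To sanity-check the kernel output on the host, a plain F# reference for the column-wise mean and standard deviation can be written directly against the same column-major layout (idx = row + col * num_rows) and the same 1/num_rows variance the kernels use. This is only a verification sketch, not part of the GPU modules:

// Host-side reference: column-wise mean and (population) standard
// deviation of a column-major matrix stored as a flat array.
let meanStdByColumn (numRows: int) (numCols: int) (x: float32[]) =
    let means = Array.zeroCreate<float32> numCols
    let stds = Array.zeroCreate<float32> numCols
    for col in 0 .. numCols - 1 do
        let mutable sum = 0.0f
        for row in 0 .. numRows - 1 do
            sum <- sum + x.[row + col * numRows]
        let mean = sum / float32 numRows
        let mutable varSum = 0.0f
        for row in 0 .. numRows - 1 do
            let d = x.[row + col * numRows] - mean
            varSum <- varSum + d * d
        means.[col] <- mean
        stds.[col] <- sqrt (varSum / float32 numRows)
    means, stds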