When I posted this question I did not expect much, but it turns out that for large matrices such as 128x10000 there is a much better way. I wrote a warpReduce kernel with a block size of 32, which lets it do the entire reduction using shuffle xor.
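To make the shuffle pattern concrete, here is a small CPU-side F# sketch (plain arrays, no GPU code) of what the butterfly xor reduction does across the 32 lanes of a warp; it only illustrates the idea behind __shfl_xor and is not one of the kernels below:

// CPU illustration of the butterfly (xor) warp reduction: each "lane" i
// adds the value held by lane (i ^^^ mask) for masks 16, 8, 4, 2, 1.
// After the last step every lane holds the sum of all 32 values, which
// is what the __shfl_xor version computes on the GPU.
let butterflyReduceSim (lanes: float32[]) =
    let mutable v = Array.copy lanes
    for mask in [16; 8; 4; 2; 1] do
        v <- Array.init v.Length (fun i -> v.[i] + v.[i ^^^ mask])
    v

let demo = butterflyReduceSim (Array.init 32 float32)
// every element of demo is 496.0f, the sum of 0..31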
For 100 iterations on 128x100000, the first version, which used 64 blocks per grid (and 32 threads per block), took 0.5s. For the CUB row reduce it took 0.25s.
When I increased the blocks per grid to 256 I got a nearly 4x speedup, to roughly 1.5s. At 384 blocks per grid it takes 1.1s, and increasing the number of blocks does not seem to improve performance from there.
For the problem sizes I am actually interested in, the improvement is not nearly as dramatic.
For the 128x1024 and 128x512 cases, 10000 iterations:
For 1024: 1s vs 0.82s in favor of warpReduce. For 512: 0.914s vs 0.873s in favor of warpReduce.
For small matrices, any speedup from the extra parallelism gets eaten up by the kernel launch time.
For 256: 0.94s vs 0.78s in favor of warpReduce. For 160: 0.88s vs 0.77s in favor of warpReduce.
This was tested on a GTX 970.
The figures would likely be different for Kepler and earlier nVidia cards, since Maxwell raised the limit on resident blocks per SM from 16 to 32, which improves multiprocessor occupancy.
I am satisfied with this, since the performance improvement is nice and I had not actually managed to write a kernel that uses shared memory correctly before reaching for the Cub block reduce. I had forgotten how painful Cuda can be at times. It is remarkable that the version which does not use shared memory is so competitive.
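The numbers above are just totals over repeated kernel launches; a minimal F# timing helper along these lines (a sketch, not the exact harness behind the figures) is enough for that kind of measurement, with the caveat that for GPU kernels the worker must also be synchronized before the clock is stopped, otherwise only the launch queueing is measured:

open System.Diagnostics

// Runs an action a given number of times and prints the total elapsed
// seconds. For GPU work, synchronize the device after the loop so the
// measurement covers the actual kernel execution.
let timeIterations (iterations: int) (run: unit -> unit) =
    let sw = Stopwatch.StartNew()
    for _ in 1 .. iterations do run ()
    sw.Stop()
    printfn "%d iterations: %.3f s" iterations sw.Elapsed.TotalSeconds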
Here are the two modules I tested with:
type rowModule(target) =
    inherit GPUModule(target)

    let grid_size = 64
    let block_size = 128

    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1), worker.Device.Arch)

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // col points to the column this block starts on.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then
                means.[col] <- column_mean_partial
                column_mean_shared := column_mean_partial
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
type rowWarpModule(target) =
    inherit GPUModule(target)

    let grid_size = 384
    let block_size = 32

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // col points to the column this block starts on.
        let mutable col = blockIdx.x
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            // Butterfly reduction: after the last xor shuffle every lane
            // in the warp holds the full sum.
            let inline butterflyWarpReduce (value:float32) =
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0 then
                stds.[col] <- sqrt(variance_sum)
                means.[col] <- column_mean
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
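To sanity-check the kernel output on the host, a plain F# reference for the column-wise mean and standard deviation can be written directly against the same column-major layout (idx = row + col * num_rows) and the same 1/num_rows variance the kernels use. This is only a verification sketch, not part of the GPU modules:

// Host-side reference: column-wise mean and (population) standard
// deviation of a column-major matrix stored as a flat array.
let meanStdByColumn (numRows: int) (numCols: int) (x: float32[]) =
    let means = Array.zeroCreate<float32> numCols
    let stds = Array.zeroCreate<float32> numCols
    for col in 0 .. numCols - 1 do
        let mutable sum = 0.0f
        for row in 0 .. numRows - 1 do
            sum <- sum + x.[row + col * numRows]
        let mean = sum / float32 numRows
        let mutable varSum = 0.0f
        for row in 0 .. numRows - 1 do
            let d = x.[row + col * numRows] - mean
            varSum <- varSum + d * d
        means.[col] <- mean
        stds.[col] <- sqrt (varSum / float32 numRows)
    means, stds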