I tried to use the DeviceMapModule and the DeviceMap2Module for simple mapping operations. To my surprise, they are about 2x slower than writing the kernel by hand. For the handwritten kernel I did not do anything special: I just copied the Getting Started Square kernel and modified it to compute a sigmoid.
Below are two versions of the code:
First the slow version:
/// Sigmoid activation layer implemented with the library-provided
/// DeviceMapModule / DeviceMap2Module element-wise mappers.
/// Forward stores its result in an internal activation matrix; Backward
/// overwrites that same matrix in place and returns it.
type SigmoidModule(size) =
    // Module is the author's own abstract class that defines the forward/backward interface.
    inherit Module(size, size)

    /// Device function for the forward pass: 1 / (1 + exp(-x)).
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))

    /// Device function for the backward pass: (1 - s) * s * y, where s is
    /// read from the stored activation and y from the incoming matrix.
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid' s y = (1.0f-s) * s * y

    let sigmoidMapper =
        new DeviceMapModule<float32, float32>(GPUModuleTarget.DefaultWorker, <@ sigmoid @>)

    let gradientMapper =
        new DeviceMap2Module<float32, float32, float32>(GPUModuleTarget.DefaultWorker, <@ sigmoid' @>)

    // Result buffer; starts as 1 x size and is replaced whenever the row count changes.
    let mutable activation = Matrix.ZeroCreate 1 size

    /// Number of elements in a matrix.
    let elementCount (mat:Matrix) = mat.Cols * mat.Rows

    /// Dispose and reallocate the activation buffer if its row count differs
    /// from that of the given matrix.
    let matchRowsOf (reference:Matrix) =
        if activation.Rows <> reference.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate reference.Rows reference.Cols

    /// Maps `sigmoid` over `fwd` and returns the activation matrix.
    /// NOTE(review): presumably the first pointer passed to Map is the
    /// destination - confirm against the DeviceMapModule API.
    override this.Forward (fwd:Matrix) =
        assert(fwd.Cols = size)
        matchRowsOf fwd
        let count = elementCount fwd
        sigmoidMapper.Map(activation.Data.Ptr, fwd.Data.Ptr, count)
        activation

    /// Maps `sigmoid'` over the stored activation and `dy`, passing the
    /// activation buffer as the first and second pointer arguments, and
    /// returns the activation matrix.
    override this.Backward (dy:Matrix) =
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)
        let count = elementCount dy
        gradientMapper.Map(activation.Data.Ptr, activation.Data.Ptr, dy.Data.Ptr, count)
        activation

    interface IDisposable with
        /// Disposes both mapper modules and then the activation data.
        member this.Dispose() =
            sigmoidMapper.Dispose()
            gradientMapper.Dispose()
            activation.Data.Dispose()
Now the fast version:
/// Sigmoid activation layer implemented with two hand-written kernels
/// launched on the default worker.
/// Forward stores its result in an internal activation matrix; Backward
/// overwrites that same matrix in place and returns it.
type SigmoidModuleFast(size) =
    inherit Module(size, size)

    let worker = Worker.Default

    /// Forward kernel: outputs.[i] <- 1 / (1 + exp(-inputs.[i])) for every i < n.
    /// Each thread starts at its global index and advances by the total
    /// number of launched threads (grid-stride loop), so any grid size
    /// covers all n elements.
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidKernel (outputs:deviceptr<float32>) (inputs:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))
        let mutable i = start
        while i < n do
            outputs.[i] <- sigmoid(inputs.[i])
            i <- i + stride

    /// Backward kernel: outputs.[i] <- (1 - input.[i]) * input.[i] * dy.[i]
    /// for every i < n, using the same grid-stride loop as the forward kernel.
    /// Both values are read before the write, so `outputs` may alias `input`.
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidPrimeKernel (outputs:deviceptr<float32>) (input:deviceptr<float32>) (dy:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let mutable i = start
        while i < n do
            let s = input.[i]
            let y = dy.[i]
            outputs.[i] <- (1.0f-s) * s * y
            i <- i + stride

    // Result buffer; starts as 1 x size and is replaced whenever the row count changes.
    let mutable activation = Matrix.ZeroCreate 1 size

    /// Launch configuration for `elements` items: a single block with one
    /// thread per item below 1024 items, otherwise 256-thread blocks with the
    /// grid capped at 16 blocks per multiprocessor.
    /// Computed per call from the actual element count so that Forward and
    /// Backward never depend on a configuration left behind by an earlier call.
    let launchParamFor (elements:int) =
        if elements < 1024 then
            LaunchParam(1, elements)
        else
            let blockSize = 256
            let numSm = worker.Device.Attributes.MULTIPROCESSOR_COUNT
            let gridSize = min (16 * numSm) (divup elements blockSize)
            LaunchParam(gridSize, blockSize)

    /// Applies the sigmoid to every element of `fwd`, writing into the
    /// activation matrix, which is returned.
    override m.Forward (fwd:Matrix) =
        assert(fwd.Cols = size)
        if activation.Rows <> fwd.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate fwd.Rows fwd.Cols
        let count = fwd.Cols * fwd.Rows
        // A zero-thread block is not a valid launch configuration; with no
        // elements there is nothing to compute, so skip the launch.
        if count > 0 then
            worker.Launch <@ sigmoidKernel @> (launchParamFor count) activation.Data.Ptr fwd.Data.Ptr count
        activation

    /// Combines the stored activation with `dy` element-wise, overwriting the
    /// activation matrix in place, and returns it.
    override m.Backward (dy:Matrix) =
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)
        let count = dy.Cols * dy.Rows
        if count > 0 then
            worker.Launch <@ sigmoidPrimeKernel @> (launchParamFor count) activation.Data.Ptr activation.Data.Ptr dy.Data.Ptr count
        activation

    interface IDisposable with
        /// Disposes the activation data. The worker is Worker.Default and is
        /// not disposed here.
        member m.Dispose() =
            activation.Data.Dispose()
Here is some information about my system:
- Name Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
- Architecture x64
- Frequency 3,300 MHz
- Number of Cores 4
- Page Size 4,096
- Total Physical Memory 8,050.00 MB
- Available Physical Memory 902.00 MB
- Hybrid Graphics Enabled False
- Version Name Windows 10 Pro
- Version Number 10.0.10586
- Nsight Version 5.1.0.16062
- Visual Studio Version 14.0
- GPU: NVIDIA GTX 980 Ti
- .NET CLR: .NET 4.6.