I'm new to CUDA and I'm probably doing something wrong.
All I need is a logical operation on two binary vectors, each 2,048,000 elements long. I compared the speed of a logical AND in a Matlab C MEX file and in a CUDA kernel. The C code on the CPU is ~5% faster than CUDA. Please note that I measured only the kernel execution time (without memory transfers). I have an i7 930 and a 9800GT.
##MEX file testCPU.c:##
#include "mex.h"

void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] ) {
    int i;
    unsigned char *vars, *output;

    /* input: 2048000x2 logical matrix, both columns in one contiguous buffer */
    vars = (unsigned char *) mxGetData(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char *) mxGetData(plhs[0]);

    for (i = 0; i < 2048000; i++) {
        output[i] = vars[i] & vars[2048000 + i];
    }
}
Compile:
mex testCPU.c
Create the vectors:
vars = ~~(randi(2,2048000,2)-1);
Measure speed:
tic;testCPU(vars);toc;
CUDA:
##CUDA file testGPU.cu:##
#include "mex.h"
#include "cuda.h"

__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    out[idx] = in[idx] && in[idx+N];
}

void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] ) {
    unsigned char *vars, *output, *gpu, *gpures;

    vars = (unsigned char*) mxGetData(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char*) mxGetData(plhs[0]);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float dt_ms;

    // input GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());

    // output GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());

    // copy from CPU to GPU
    cudaEventRecord(start, 0);
    cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());

    // the grid covers all 2048000 elements exactly, so the kernel needs no bounds check
    dim3 dimBlock(32);
    printf("thread count: %i\n", dimBlock.x);
    dim3 dimGrid(2048000/dimBlock.x);
    printf("block count: %i\n", dimGrid.x);

    // --- KERNEL ---
    cudaEventRecord(start, 0);
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());

    // result from GPU to CPU
    cudaEventRecord(start, 0);
    cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());

    cudaFree(gpu);
    cudaFree(gpures);
}
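By the way, the error codes in the printfs above come out as plain numbers; if one of them is ever non-zero, something like this (just a sketch) prints readable text instead:

cudaError_t err = cudaGetLastError();
printf("GPU kernel: %f ms, %s\n", dt_ms, cudaGetErrorString(err));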
Compile:
nvmex -f nvmexopts_9.bat testGPU.cu
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include"
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft
Output:
GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0
Is that code OK? The CPU was ~0.1 ms faster than the CUDA kernel. I tried different thread counts (multiples of 32) up to 512; 32 was the fastest. Using the & operator instead of && was almost 1 ms slower.
Is the 9800GT really that weak? What speed-up can I expect from today's mainstream cards (e.g. GTX 460/560)?
Thank you
EDIT: based on talonmies' comment, I made these modifications:
Kernel function:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
Main function:
uchar4 *gpu, *gpures;

// 32 was worst; 64, 128, 256 and 512 were similar
dim3 dimBlock(128);
// the block count is now 4x smaller
dim3 dimGrid(512000/dimBlock.x);
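The remaining calls change accordingly; roughly like this (sizes are now counted in uchar4 elements, so the N passed to the kernel becomes 512000):

cudaMalloc( (void **) &gpu,    sizeof(uchar4)*1024000);  // 2 x 512000 input elements (= 4096000 bytes)
cudaMalloc( (void **) &gpures, sizeof(uchar4)*512000);   // output (= 2048000 bytes)
cudaMemcpy( gpu, vars, sizeof(uchar4)*1024000, cudaMemcpyHostToDevice);
logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 512000); // N is now a uchar4 element count
cudaMemcpy( output, gpures, sizeof(uchar4)*512000, cudaMemcpyDeviceToHost );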
Output:
GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0
Is that correct? It's almost a 30x speed-up! It seems too good to be true, but the result is correct :) How much faster will a GTX 560 be on this particular task? Thanks
Edit 2:
Is this code
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
automatically transformed to:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    uchar4 buff;

    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;

    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;

    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}
by the compiler?
If it is, it explains my confusion about coalesced access. I thought that in[idx] & in[idx+N] leads to non-coalesced access because it touches non-contiguous memory. But in fact, in[idx] and in[idx+N] are loaded in two coalesced steps. N can be any multiple of 16, because a uchar4 is 4 bytes long, and for coalesced access the address must be aligned to 64 bytes (on a compute capability 1.1 device). Am I right?
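In case the compiler does not perform that transformation, the kernel can also be written with explicit uchar4 loads and stores, so each thread issues exactly one 4-byte load per input and one 4-byte store regardless (a sketch):

__global__ void logical_and(const uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    uchar4 a = in[idx];      // one 4-byte load
    uchar4 b = in[idx + N];  // one 4-byte load
    uchar4 r;
    r.x = a.x & b.x;
    r.y = a.y & b.y;
    r.z = a.z & b.z;
    r.w = a.w & b.w;
    out[idx] = r;            // one 4-byte store
}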