v0.1 - 朴素实现
这是我为实现该功能所做的第一次朴素尝试:
/**
 * Naive sliding dot product of a 3-D filter Y over a 3-D volume X.
 *
 * Each thread owns one output element (i, j) and adds to it
 *     sum over Yi, Yj, k of  X[i+Yi][j+Yj][k] * Y[Yi][Yj][k].
 * The sum is accumulated ON TOP of the value already stored in `out`, so
 * the host has to zero `out` before the launch to get the plain result.
 *
 * Layout, as fixed by the index arithmetic (last index varies fastest):
 *   out : outdims[0] x outdims[1]
 *   X   : addressed with strides Xdims[1]*Xdims[2] and Xdims[2]
 *   Y   : Ydims[0] x Ydims[1] x Ydims[2]
 * All six pointers are dereferenced on the device.
 *
 * Launch: 2-D blocks on a 2-D grid; thread x maps to i, thread y maps to j.
 * Threads that fall outside outdims[0] x outdims[1] return immediately.
 *
 * NOTE(review): reads of X are not bounds-checked; presumably the host
 * sizes X so that i+Yi and j+Yj stay in range - confirm against the caller.
 * NOTE(review): assumes `out` does not alias X or Y.
 */
__global__ void sliding_dot(float *out, int *outdims, float *X, int *Xdims, float *Y, int *Ydims )
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    const int j = threadIdx.y + blockDim.y * blockIdx.y;

    const int outCols = outdims[1];
    if (i >= outdims[0] || j >= outCols)
        return;

    // Read every extent once instead of going back through the dimension
    // pointers on each loop iteration.
    const int yRows        = Ydims[0];
    const int yCols        = Ydims[1];
    const int depth        = Ydims[2];
    const int xRowStride   = Xdims[2];
    const int xPlaneStride = Xdims[2] * Xdims[1];

    const int out_indx = j + i * outCols;

    // Seed the accumulator with the current output value so the result is
    // the same "+=" accumulation as before, but kept in a register; the
    // output element is then written exactly once at the end.
    float acc = out[out_indx];

    for (int Yi = 0; Yi < yRows; Yi++)
    {
        for (int Yj = 0; Yj < yCols; Yj++)
        {
            // Start of the depth-vector in X under filter tap (Yi, Yj),
            // and of the matching depth-vector in Y.
            const float *xVec = X + (j + Yj) * xRowStride + (i + Yi) * xPlaneStride;
            const float *yVec = Y + Yj * depth + Yi * depth * yCols;

            for (int k = 0; k < depth; k++)
            {
                acc += xVec[k] * yVec[k];
            }
        }
    }

    out[out_indx] = acc;
}
到目前为止,结果并不理想。线程块大小取 (32,32,1),网格尺寸 (p,q) 满足 p*32 >= outdims[0] 且 q*32 >= outdims[1],性能分析结果如下:
method=[ sliding_dot ] gputime=[ 7013.280 ] cputime=[ 18.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6945.184 ] cputime=[ 7.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6990.816 ] cputime=[ 6.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6931.648 ] cputime=[ 6.000 ] occupancy=[ 0.667 ]
v0.2 - texture<float,1>
我希望每个人都能像我一样从中学到很多东西!我遵循@aland 的建议并获得了相当大的提速:
// File-scope 1-D texture references read by dotconv via tex1Dfetch.
// The host must bind both before launching the kernel.
texture<float,1> X;
texture<float,1> Y;

/**
 * Sliding dot product of filter Y over volume X, both read through the
 * 1-D textures declared above using flat element indices.
 *
 * Each thread computes one output element (row, col):
 *     out[row][col] = sum over fy, fx, k of
 *                     X[row+fy][col+fx][k] * Y[fy][fx][k]
 * and overwrites out[col + row*outdims.y] with it.
 *
 * outdims : extents of the output (x = rows, y = columns).
 * Xdims   : only .y and .z are used, as strides into X.
 * Ydims   : extents of the filter; .x/.y/.z are the three loop bounds.
 *
 * Launch: 2-D blocks on a 2-D grid; thread x maps to the output row and
 * thread y to the output column. Surplus threads return without writing.
 */
__global__ void dotconv(float *out, int2 outdims, int3 Xdims, int3 Ydims )
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= outdims.x || col >= outdims.y)
        return;

    float acc = 0.0f;

    for (int fy = 0; fy < Ydims.x; ++fy)
    {
        // Flat offsets of the current filter plane in Y and of the
        // matching plane in X.
        const int yPlane = fy * Ydims.z * Ydims.y;
        const int xPlane = (row + fy) * Xdims.z * Xdims.y;

        for (int fx = 0; fx < Ydims.y; ++fx)
        {
            const int yBase = fx * Ydims.z + yPlane;
            const int xBase = (col + fx) * Xdims.z + xPlane;

            for (int k = 0; k < Ydims.z; ++k)
            {
                acc += tex1Dfetch(X, k + xBase) * tex1Dfetch(Y, k + yBase);
            }
        }
    }

    out[col + row * outdims.y] = acc;
}
但是我们的运行速度仍然没有 CPU 快:
method=[ dotconv ] gputime=[ 2224.928 ] cputime=[ 24.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2222.592 ] cputime=[ 7.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2225.216 ] cputime=[ 10.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2222.752 ] cputime=[ 10.000 ] occupancy=[ 0.667 ]
v0.3 - texture<float,3>
// File-scope 3-D texture references read by dotconv via tex3D; element
// values are returned as stored (cudaReadModeElementType).
// The host must bind both before launching the kernel.
texture<float,3,cudaReadModeElementType> X;
texture<float,3,cudaReadModeElementType> Y;

/**
 * Sliding dot product of filter Y over volume X, both sampled through the
 * 3-D textures declared above.
 *
 * Each thread computes one output element (row, col):
 *     out[row][col] = sum over fy, fx, k of
 *                     X(k, col+fx, row+fy) * Y(k, fx, fy)
 * where the texture coordinate order is (x, y, z), and overwrites
 * out[col + row*outdims.y] with it.
 *
 * outdims : extents of the output (x = rows, y = columns).
 * Xdims   : not read by this kernel; kept in the parameter list.
 * Ydims   : extents of the filter; .x/.y/.z are the three loop bounds.
 *
 * Launch: 2-D blocks on a 2-D grid; thread x maps to the output row and
 * thread y to the output column. Surplus threads return without writing.
 *
 * NOTE(review): the integer coordinates are passed straight to tex3D;
 * reading exact texels this way assumes point filtering and unnormalized
 * coordinates - confirm against the host-side texture setup.
 */
__global__ void dotconv(float *out, int2 outdims, int3 Xdims, int3 Ydims )
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= outdims.x || col >= outdims.y)
        return;

    float acc = 0.0f;

    for (int fy = 0; fy < Ydims.x; ++fy)
    {
        const int xz = row + fy;   // z coordinate into X for this filter plane

        for (int fx = 0; fx < Ydims.y; ++fx)
        {
            const int xy = col + fx;   // y coordinate into X for this filter row

            for (int k = 0; k < Ydims.z; ++k)
            {
                acc += tex3D(X, k, xy, xz) * tex3D(Y, k, fx, fy);
            }
        }
    }

    out[col + row * outdims.y] = acc;
}
这实际上比 v0.2 还要慢一点:
method=[ dotconv ] gputime=[ 2403.360 ] cputime=[ 35.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2392.160 ] cputime=[ 15.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2396.448 ] cputime=[ 15.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2398.880 ] cputime=[ 16.000 ] occupancy=[ 0.667 ]
感谢您的建议!