We have a fairly monolithic kernel (see below) that we launch with a grid of one block of one thread:
kernel<<<1,1>>>
It then dynamically launches many smaller kernels. In general, data flows from kernel to kernel: input enters at the first kernel and flows through to the last.
We have, however, identified a potential opportunity to overlap two data streams, each running this same kernel.
Question: do we have to abandon dynamic kernel execution in favor of a host-based approach to get execution overlap of the two giant kernels? Or is the scheduler on the card smart enough to interleave the two super-kernels, treating each one as a separate scheduling item?
We are talking about a Tesla K80, Linux host.
(Yes, we will get some overlap of cudaMemcpyAsync() transfers with execution, but we would also like to see some execution/execution overlap.)
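For reference, the host-based alternative we have in mind looks roughly like the sketch below: two non-blocking streams, each launching its own instance of the super-kernel. This assumes a hypothetical per-channel variant gPipelineCh (our current kernel loops over both channels internally); that name is ours, not an existing API.

cudaStream_t s0, s1;
cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
/* Each <<<1,1>>> parent grid is tiny, so both can be resident at once;
   the child grids each parent spawns belong to their own parent and
   are independent across the two streams. */
gPipelineCh<<<1,1,0,s0>>>(gpu, GPU_CH0);
gPipelineCh<<<1,1,0,s1>>>(gpu, GPU_CH1);
cudaStreamSynchronize(s0);
cudaStreamSynchronize(s1);
cudaStreamDestroy(s0);
cudaStreamDestroy(s1);

Our current monolithic code follows: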
#include <cuda.h>
#include <cuda_runtime.h>
#include "coss_types.h"
#include "image.h"
#include "centroid.h"
#include "gpu.h"
#define GPU_TILE_WIDTH 16
#define GPU_TILE_HEIGHT 16
#define GPU_TILE_WBIG 32
#define GPU_TILE_HBIG 32
#define K_IMG_MAX 1024
__constant__ unsigned short* pFrameStack[GPU_CHX];
__constant__ unsigned short* pBackground[GPU_CHX];
__constant__ short* pCleanground[GPU_CHX];
__constant__ unsigned char* pMask[GPU_CHX];
__constant__ float* pForeground[GPU_CHX];
__constant__ float* pLowground[GPU_CHX];
__constant__ float* pLowgroundRow[GPU_CHX];
__constant__ float* pHighground[GPU_CHX];
__constant__ float* pHighgroundRow[GPU_CHX];
__constant__ float* pMins[GPU_CHX];
__constant__ float* pMaxs[GPU_CHX];
__constant__ int gSlot;
__constant__ int* pPercentile[GPU_CHX];
__constant__ int* pLabels1[GPU_CHX];
__constant__ int* pLabels2[GPU_CHX];
__constant__ int* pRawLabels[GPU_CHX];
__constant__ int* pLabels[GPU_CHX];
__constant__ ImgInfoBlock_t* pInfo[GPU_CHX];
__constant__ unsigned short* pSums[GPU_CHX];
__constant__ unsigned short* pBlockSums[GPU_CHX];
__constant__ ImgCentroid_t* pCenters[GPU_CHX];
__constant__ float threshold_sigma = 9.0f;
/* INCLUDED GENERATED CUDA CODE BELOW */
#include "cuda.cu"
/* INCLUDED GENERATED CUDA CODE ABOVE */
extern "C" __device__ void Background(int ch)
{
dim3 block;
dim3 grid;
/* Background Estimation */
block = dim3(128);
grid = dim3((IMG_PIXELS)/256); /* Only half screens at a time */
gMedian<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
/* Background Removal */
block = dim3(128);
grid = dim3((IMG_PIXELS)/128);
gScrub<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,
pCleanground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
}
extern "C" __device__ void Convolution(int ch)
{
dim3 block;
dim3 grid;
dim3 block_b;
dim3 grid_b;
/* Convolve Rows */
block = dim3(GPU_TILE_WIDTH,GPU_TILE_HEIGHT);
grid = dim3(IMG_WIDTH/GPU_TILE_WIDTH,IMG_HEIGHT/GPU_TILE_HEIGHT);
gConvolveRow<<<grid,block>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
block_b = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid_b = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gConvolveBigRow<<<grid_b,block_b>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
/* Convolve Cols */
cudaDeviceSynchronize();
gConvolveCol<<<grid,block>>>(
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pLowground[ch], IMG_HEIGHT,IMG_WIDTH);
gConvolveBigCol<<<grid_b,block_b>>>(
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH);
/* Band pass */
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gBpass<<<grid,block>>>(
pLowground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
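/* Overlap note (an assumption-flagged sketch, not part of the pipeline):
 * child kernels launched from the same parent thread without an explicit
 * stream go into the block's NULL stream and run in launch order, so
 * gConvolveRow and gConvolveBigRow above serialize even though they are
 * independent. The device runtime allows block-scoped streams, which
 * would make them eligible to overlap:
 *
 *   cudaStream_t s;
 *   cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); // only this flag is valid on the device
 *   gConvolveRow<<<grid,block>>>(...);            // NULL stream
 *   gConvolveBigRow<<<grid_b,block_b,0,s>>>(...); // eligible to overlap
 *   cudaDeviceSynchronize();                      // waits on child work in both streams
 *   cudaStreamDestroy(s);
 */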
extern "C" __device__ void Threshold(int ch)
{
dim3 block;
dim3 grid;
/* Set the calibration sigma in the Info Block */
pInfo[ch]->sigma = threshold_sigma;
/* Min Max kernels */
block = dim3(512, 2);
grid = dim3(IMG_WIDTH / 512, IMG_HEIGHT / 2);
gMinMax<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gMinMaxMinMax<<<grid,block>>>(
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
/* Histogram */
cudaDeviceSynchronize();
block = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gHistogram<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pPercentile[ch],K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gSumHistogram<<<grid,block>>>(pPercentile[ch],K_IMG_MAX);
cudaDeviceSynchronize();
gIQR<<<grid,block>>>(pPercentile[ch],K_IMG_MAX,(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gThreshold<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
}
extern "C" __device__ void Gluing(int ch)
{
dim3 block;
dim3 grid;
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gGlue<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Labeling(int ch)
{
dim3 block;
dim3 grid;
/* CCL */
//block = dim3(1, 128);
//grid = dim3(1, IMG_HEIGHT / 128);
block = dim3(256,1);
grid = dim3(IMG_WIDTH/256,IMG_HEIGHT);
gCCL0<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Relabeling(int ch)
{
dim3 block;
dim3 grid;
/* Relabel */
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gScan<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gSum<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
grid = dim3(1);
gSumBlocks<<<grid, K_IMG_MAX>>>(pBlockSums[ch], 5*K_IMG_MAX, (struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gFixSums<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gRelabeler<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS,
pLabels[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Centroiding(int ch)
{
dim3 block;
dim3 grid;
int starcount = IMG_STARS_MAX;
if (pInfo[ch]->starCount > 0 && pInfo[ch]->starCount < IMG_STARS_MAX)
{
starcount = pInfo[ch]->starCount;
/* Centroid */
block = dim3(32, 32);
grid = dim3(IMG_WIDTH / 32, IMG_HEIGHT / 32);
gCentroid<<<grid, block>>>(
pLabels[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
(PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
block = dim3(starcount);
gCentroidFinal<<<1, block>>>((PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
}
else
{
pInfo[ch]->starCount = 0;
}
}
extern "C" __global__ void gPipeline(int gpuId)
{
int ch;
for(ch=0; ch < GPU_CHX; ch++)
{
Background(ch);
Convolution(ch);
Threshold(ch);
Gluing(ch);
Labeling(ch);
Relabeling(ch);
Centroiding(ch);
}
}
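/* Hypothetical per-channel variant (a sketch, not in the original build):
 * if the two channels are truly independent, hoisting the channel loop
 * out of the super-kernel gives the host one top-level grid per channel
 * to place in separate non-blocking streams, as in the sketch above. */
extern "C" __global__ void gPipelineCh(int gpuId, int ch)
{
Background(ch);
Convolution(ch);
Threshold(ch);
Gluing(ch);
Labeling(ch);
Relabeling(ch);
Centroiding(ch);
}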
extern "C" {
static void ImgKernel_ClearBuffers(int32_t gpu, int32_t ch)
{
/* Clear Work Buffers */
cudaMemset(gInfo[gpu][ch],0,(int)sizeof(ImgInfoBlock_t));
cudaMemset(gCenters[gpu][ch],0,(int)sizeof(ImgCentroid_t)*IMG_STARS_MAX);
cudaMemset(gPercentile[gpu][ch],0,(int)sizeof(int32_t)*K_IMG_MAX);
cudaMemset(gLabels1[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gLabels2[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gRawLabels[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gSums[gpu][ch],0,(int)IMG_BYTES);
cudaMemset(gBlockSums[gpu][ch],0,(int)sizeof(uint16_t)*5*K_IMG_MAX);
}
void ImgKernel_Pipeline(int gpu)
{
cudaSetDevice(gpu);
cudaDeviceSynchronize();
/* Start a new run by clearing the buffers */
ImgKernel_ClearBuffers(gpu,GPU_CH0);
ImgKernel_ClearBuffers(gpu,GPU_CH1);
/* Update Constants */
cudaMemcpyToSymbol(gSlot,(void*)&slot,sizeof(slot));
cudaMemcpyToSymbol(threshold_sigma,(void*)&sigmaThreshold,sizeof(sigmaThreshold));
/* Start the next pipeline kernel */
gPipeline<<<1,1>>>(gpu);
}
#define LFILTER_LEN 15
static float lFilter[LFILTER_LEN] = { .0009f, .01f,
.02f, .05f, .08f, .10f, .1325f, .1411f, .1325f, .10f, .08f, .05f, .02f, .01f, .0009f };
#define HFILTER_LEN 31
static float hFilter[HFILTER_LEN] = {0.0002f, 0.0006f,
0.0025f, 0.0037f, 0.0053f, 0.0074f, 0.0099f, 0.0130f, 0.0164f,
0.0201f, 0.0239f, 0.0275f, 0.0306f, 0.0331f, 0.0347f, 0.0353f,
0.0347f, 0.0331f, 0.0306f, 0.0275f, 0.0239f, 0.0201f, 0.0164f,
0.0130f, 0.0099f, 0.0074f, 0.0053f, 0.0037f, 0.0025f, 0.0006f, 0.0002f};
static float32_t kernel[LFILTER_LEN];
static float32_t kernelBig[HFILTER_LEN];
static inline float32_t ImgKernel_FilterSum(float* arr, int32_t len)
{
int32_t i;
float32_t sum = 0.0f;
for (i=0;i<len;i++) sum += arr[i];
return sum;
}
void ImgKernel_Setup(int gpu)
{
int32_t i,ch;
float32_t sum = 0;
sum = ImgKernel_FilterSum(lFilter,LFILTER_LEN);
for (i = 0; i < LFILTER_LEN; i++) kernel[i] = lFilter[i] / sum;
sum = ImgKernel_FilterSum(hFilter,HFILTER_LEN);
for (i = 0; i < HFILTER_LEN; i++) kernelBig[i] = hFilter[i] / sum;
/* One time copy of locations into GPU constant memory */
cudaMemcpyToSymbol(gkernel, (void*)&kernel, sizeof(float32_t)*LFILTER_LEN);
cudaMemcpyToSymbol(gkernelBig, (void*)&kernelBig, sizeof(float32_t)*HFILTER_LEN);
cudaMemcpyToSymbol(pFrameStack,(void*)&gFrameStack[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBackground,(void*)&gBackground[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCleanground,(void*)&gCleanground[gpu][0], sizeof(int16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowground, (void*)&gLowground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowgroundRow,(void*)&gLowgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighground,(void*)&gHighground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighgroundRow,(void*)&gHighgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pForeground,(void*)&gForeground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMask, (void*)&gMask[gpu][0], sizeof(uint8_t*)*GPU_CHX);
cudaMemcpyToSymbol(pPercentile,(void*)&gPercentile[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMins, (void*)&gMins[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMaxs, (void*)&gMaxs[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels1, (void*)&gLabels1[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels2, (void*)&gLabels2[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pRawLabels, (void*)&gRawLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels, (void*)&gLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pInfo, (void*)&gInfo[gpu][0], sizeof(ImgInfoBlock_t*)*GPU_CHX);
cudaMemcpyToSymbol(pSums, (void*)&gSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBlockSums, (void*)&gBlockSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCenters, (void*)&gCenters[gpu][0], sizeof(ImgCentroid_t*)*GPU_CHX);
for (ch = 0; ch < GPU_CHX; ch++)
{
/* Clear the working buffers */
ImgKernel_ClearBuffers(gpu,ch);
}
}
}