在完成标签标记(label markers)操作并随后压缩标签之后,我们可以在一个简单的 CUDA 内核中使用 atomicMax 和 atomicMin,实现一种相当直接的边界框识别方法。
这是一个完整的工作示例:
$ cat t1461.cu
#include <stdio.h>
#include <nppi_filtering_functions.h>
#include <assert.h>
#define WIDTH 16
#define HEIGHT 16
// Pretty-print a w x h image stored row-major in host memory.
// Pixels equal to 255 (the foreground marker value) print as " *";
// every other value (e.g. a compressed label index) prints numerically.
// Fix: data is Npp16u (unsigned 16-bit), so the matching printf
// conversion is %hu, not %hd (signed/unsigned mismatch is UB).
void my_print(Npp16u *data, int w, int h){
for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
{
if (data[i*w+j] == 255) printf(" *");
else printf("%3hu", data[i * w + j]);
}
printf("\n");
}
}
// Per-label bounding-box kernel: one thread per pixel of a labeled image.
// i      : label image (0 = background, labels are 1-based)
// maxh/minh/maxw/minw : one slot per label; min arrays must be
//          pre-initialized to >= the image dimension, max arrays to 0,
//          so the first atomic update for a label always takes effect.
// Launch with a 2D grid covering at least width x height threads.
template <typename T>
__global__ void bb(const T * __restrict__ i, int * __restrict__ maxh, int * __restrict__ minh, int * __restrict__ maxw, int * __restrict__ minw, int height, int width){
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    if (col >= width || row >= height) return;   // guard the grid tail
    const T label = i[row * width + col];
    if (label <= 0) return;                      // background pixel: nothing to do
    const int slot = (int)label - 1;             // labels are 1-based, arrays 0-based
    atomicMax(maxw + slot, col);
    atomicMin(minw + slot, col);
    atomicMax(maxh + slot, row);
    atomicMin(minh + slot, row);
}
int main(){
    // 16x16 test image: 255 marks foreground pixels, 0 is background.
    // The connected regions of 255s are the objects whose bounding
    // boxes we want to compute.
    Npp16u host_src[WIDTH * HEIGHT] =
    {
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255,255, 0, 0,255, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0,255,255,255,
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
    0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
    0, 0,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0,
    0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    // Check every CUDA runtime call; an early unnoticed failure makes all
    // later calls fail mysteriously.
    cudaError_t ce;
    Npp16u * device_src;
    ce = cudaMalloc((void**)&device_src, sizeof(Npp16u) * WIDTH * HEIGHT);
    assert(ce == cudaSuccess);
    ce = cudaMemcpy(device_src, host_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
    assert(ce == cudaSuccess);
    // Scratch buffer for the NPP label-markers pass.
    int buffer_size;
    NppiSize source_roi = { WIDTH, HEIGHT };
    NppStatus e = nppiLabelMarkersGetBufferSize_16u_C1R(source_roi, &buffer_size);
    assert(e == NPP_NO_ERROR);
    Npp8u * buffer;
    ce = cudaMalloc((void**)&buffer, buffer_size);
    assert(ce == cudaSuccess);
    // Label connected components in place (8-way connectivity via nppiNormInf);
    // labels come back sparse in 1..max.
    int max;
    e = nppiLabelMarkers_16u_C1IR(device_src, sizeof(Npp16u) * WIDTH, source_roi, (Npp16u)1, nppiNormInf, &max, buffer);
    assert(e == NPP_NO_ERROR);
    printf("initial max: %d\n", max);
    // Compress the sparse label set to the dense range 1..max.
    // Size the scratch buffer from the actual starting label count;
    // the previous code passed 1 here, which can under-size the buffer.
    int bs;
    e = nppiCompressMarkerLabelsGetBufferSize_16u_C1R (max, &bs);
    assert(e == NPP_NO_ERROR);
    if (bs>buffer_size){
        buffer_size = bs;
        cudaFree(buffer);
        ce = cudaMalloc(&buffer, buffer_size);
        assert(ce == cudaSuccess);}
    e = nppiCompressMarkerLabels_16u_C1IR(device_src, sizeof(Npp16u)*WIDTH, source_roi, max, &max, buffer);
    assert(e == NPP_NO_ERROR);
    // Per-label bounding-box extents. Max arrays start at 0 and min arrays
    // at the image dimension so the first atomicMax/atomicMin for each
    // label always takes effect.
    int *maxw, *maxh, *minw, *minh, *d_maxw, *d_maxh, *d_minw, *d_minh;
    maxw = new int[max];
    maxh = new int[max];
    minw = new int[max];
    minh = new int[max];
    ce = cudaMalloc(&d_maxw, max*sizeof(int)); assert(ce == cudaSuccess);
    ce = cudaMalloc(&d_maxh, max*sizeof(int)); assert(ce == cudaSuccess);
    ce = cudaMalloc(&d_minw, max*sizeof(int)); assert(ce == cudaSuccess);
    ce = cudaMalloc(&d_minh, max*sizeof(int)); assert(ce == cudaSuccess);
    for (int i = 0; i < max; i++){
        maxw[i] = 0;
        maxh[i] = 0;
        minw[i] = WIDTH;
        minh[i] = HEIGHT;}
    ce = cudaMemcpy(d_maxw, maxw, max*sizeof(int), cudaMemcpyHostToDevice); assert(ce == cudaSuccess);
    ce = cudaMemcpy(d_maxh, maxh, max*sizeof(int), cudaMemcpyHostToDevice); assert(ce == cudaSuccess);
    ce = cudaMemcpy(d_minw, minw, max*sizeof(int), cudaMemcpyHostToDevice); assert(ce == cudaSuccess);
    ce = cudaMemcpy(d_minh, minh, max*sizeof(int), cudaMemcpyHostToDevice); assert(ce == cudaSuccess);
    // One thread per pixel; ceil-div grid so odd sizes are covered
    // (the kernel bounds-checks the tail).
    dim3 block(32,32);
    dim3 grid((WIDTH+block.x-1)/block.x, (HEIGHT+block.y-1)/block.y);
    bb<<<grid, block>>>(device_src, d_maxh, d_minh, d_maxw, d_minw, HEIGHT, WIDTH);
    ce = cudaGetLastError();          // catch launch-configuration errors
    assert(ce == cudaSuccess);
    // These blocking copies also synchronize with the kernel.
    ce = cudaMemcpy(maxw, d_maxw, max*sizeof(int), cudaMemcpyDeviceToHost); assert(ce == cudaSuccess);
    ce = cudaMemcpy(maxh, d_maxh, max*sizeof(int), cudaMemcpyDeviceToHost); assert(ce == cudaSuccess);
    ce = cudaMemcpy(minw, d_minw, max*sizeof(int), cudaMemcpyDeviceToHost); assert(ce == cudaSuccess);
    ce = cudaMemcpy(minh, d_minh, max*sizeof(int), cudaMemcpyDeviceToHost); assert(ce == cudaSuccess);
    Npp16u *dst = new Npp16u[WIDTH * HEIGHT];
    ce = cudaMemcpy(dst, device_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);
    assert(ce == cudaSuccess);
    printf("*******INPUT************\n");
    my_print(host_src, WIDTH, HEIGHT);
    printf("******OUTPUT************\n");
    my_print(dst, WIDTH,HEIGHT);
    printf("compressed max: %d\n", max);
    printf("bounding boxes:\n");
    for (int i = 0; i < max; i++)
        printf("label %d, maxh: %d, minh: %d, maxw: %d, minw: %d\n", i+1, maxh[i], minh[i], maxw[i], minw[i]);
    // Release device and host resources (previously leaked).
    cudaFree(device_src);
    cudaFree(buffer);
    cudaFree(d_maxw); cudaFree(d_maxh); cudaFree(d_minw); cudaFree(d_minh);
    delete [] maxw; delete [] maxh; delete [] minw; delete [] minh;
    delete [] dst;
    return 0;
}
$ nvcc -o t1461 t1461.cu -lnppif
$ cuda-memcheck ./t1461
========= CUDA-MEMCHECK
initial max: 10
*******INPUT************
0 0 0 0 0 0 0 0 * 0 0 0 0 0 0 0
0 * * * 0 0 * * * 0 0 0 0 0 0 0
0 * * * 0 0 * * * 0 0 0 0 0 0 0
0 * * * 0 0 * * * * 0 0 * 0 0 0
0 0 0 0 0 0 0 * * * 0 0 0 * * *
0 0 0 0 0 0 0 0 * 0 0 0 0 * * *
0 0 0 * 0 0 0 0 0 0 0 0 0 * * *
0 * * 0 0 0 0 0 0 0 0 0 0 0 0 0
0 * * * 0 0 0 0 * 0 0 0 0 0 0 0
0 * * * * 0 0 * * * 0 0 0 0 0 0
0 0 * * * 0 * * * * * 0 0 0 0 0
0 0 0 * 0 0 0 * * * 0 0 0 0 0 0
0 0 0 0 0 0 0 0 * 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 0 0 0
******OUTPUT************
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 1 0 0 3 0 0 0
0 0 0 0 0 0 0 1 1 1 0 0 0 3 3 3
0 0 0 0 0 0 0 0 1 0 0 0 0 3 3 3
0 0 0 4 0 0 0 0 0 0 0 0 0 3 3 3
0 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0
0 4 4 4 0 0 0 0 5 0 0 0 0 0 0 0
0 4 4 4 4 0 0 5 5 5 0 0 0 0 0 0
0 0 4 4 4 0 5 5 5 5 5 0 0 0 0 0
0 0 0 4 0 0 0 5 5 5 0 0 0 0 0 0
0 0 0 0 0 0 0 0 5 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 0 0 0
compressed max: 7
bounding boxes:
label 1, maxh: 5, minh: 0, maxw: 9, minw: 6
label 2, maxh: 3, minh: 1, maxw: 3, minw: 1
label 3, maxh: 6, minh: 3, maxw: 15, minw: 12
label 4, maxh: 11, minh: 6, maxw: 4, minw: 1
label 5, maxh: 12, minh: 8, maxw: 10, minw: 6
label 6, maxh: 14, minh: 12, maxw: 15, minw: 13
label 7, maxh: 15, minh: 13, maxw: 3, minw: 1
========= ERROR SUMMARY: 0 errors
$
请注意,如果您要重复执行此操作(例如在连续的视频帧上识别边界框),您会希望把大部分 cudaMalloc 调用移出性能关键循环。一种典型的做法是采用我在上面代码中对 buffer 所展示的分配方式:仅当先前分配的尺寸不足时,才释放并重新分配缓冲区。对最大值/最小值缓冲区也可以同样处理。