1

我正在尝试使用 cuda 库为我的输入数据找到边界框。我从一个具有噪声(可能还有一些归零的单元格)的数据集开始,其数据区域远高于噪声。

首先,我使用 nppiFilterGauss_32f_C1R 对我的数据应用高斯模糊。

然后我使用 nppiCompareC_32f_C1R 对其进行阈值化以创建二进制图像。

在此之后,我使用 nppiLabelMarkers_8u32u_C1R 为每个区域创建一个唯一标签。

在这一点上,我的结果如我所料。我得到了一个数据集,其中每个“blob”都有唯一的标签值(尽管这些标签数值并不连续,数值之间存在间隔)。

我在网上找了很久,似乎找不到一个能接着在 GPU 上为已标记的连通组件计算边界框的库。

我能够使用 findContours 和 BoundingRects 获得使用 OpenCV 的完整流程,但这是在 CPU 上完成的工作,无法跟上我的数据速率。

是否有我缺少的 cuda 函数可以为我提供每个标记的 blob 的边界框参数?

谢谢!

4

1 回答 1

1

在标签标记(label markers)操作之后,如果我们再对标签做压缩(compress),就可以用一个相当简单的方法来识别边界框:在一个简单的 CUDA 内核中使用 atomicMax 和 atomicMin。

这是一个工作示例:

$ cat t1461.cu
#include <stdio.h>
#include <nppi_filtering_functions.h>
#include <assert.h>
#define WIDTH 16
#define HEIGHT 16
// Print a w x h image stored row-major in host memory.
// Cells equal to 255 are rendered as '*'; every other value is printed
// numerically in a 3-character column.
void my_print(Npp16u *data, int w, int h){
  for (int i = 0; i < h; i++)
    {
    for (int j = 0; j < w; j++)
      {
      if (data[i*w+j] == 255) printf("  *");
      // bug fix: Npp16u is unsigned short, so the correct conversion is
      // %hu, not the signed %hd (matters once labels exceed 32767)
      else printf("%3hu", data[i * w + j]);
      }
    printf("\n");
  }

}

// Fold each labeled pixel's coordinates into per-label bounding-box
// extrema using atomics.  A pixel carrying label k (labels are 1-based;
// 0 is background) updates slot k-1 of each of the four extrema arrays.
// Expects a 2D launch covering at least width x height threads, and the
// extrema arrays pre-initialized so the first update always wins
// (max arrays -> 0, min arrays -> the corresponding dimension).
template <typename T>
__global__ void bb(const T * __restrict__ i, int * __restrict__ maxh, int * __restrict__ minh, int * __restrict__ maxw, int * __restrict__ minw, int height, int width){

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  if (col >= width || row >= height) return;   // guard the grid tail
  T label = i[row * width + col];
  if (label <= 0) return;                      // background pixel: nothing to record
  int slot = (int)label - 1;                   // label k lives in slot k-1
  atomicMax(maxw + slot, col);
  atomicMin(minw + slot, col);
  atomicMax(maxh + slot, row);
  atomicMin(minh + slot, row);
}

// Demo pipeline: label connected blobs with NPP, compress the labels to a
// consecutive range, then compute per-label bounding boxes on the GPU with
// a simple atomic-min/max kernel.
int main(){
// 16x16 test image: 255-valued pixels form several disjoint blobs on a
// zero background.
Npp16u host_src[WIDTH * HEIGHT] =
{
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255,255, 0, 0,255, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0,255,255,255,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0,
0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

  cudaError_t ce;
  Npp16u * device_src;
  ce = cudaMalloc((void**)&device_src, sizeof(Npp16u) * WIDTH * HEIGHT);
  assert(ce == cudaSuccess);
  ce = cudaMemcpy(device_src, host_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
  assert(ce == cudaSuccess);

  // Label connected components in-place: each blob gets a unique (but not
  // necessarily consecutive) label in 1..max.
  int buffer_size;
  NppiSize source_roi = { WIDTH, HEIGHT };
  NppStatus e = nppiLabelMarkersGetBufferSize_16u_C1R(source_roi, &buffer_size);
  assert(e == NPP_NO_ERROR);
  Npp8u * buffer;
  ce = cudaMalloc((void**)&buffer, buffer_size);
  assert(ce == cudaSuccess);

  int max;
  e = nppiLabelMarkers_16u_C1IR(device_src, sizeof(Npp16u) * WIDTH, source_roi, (Npp16u)1, nppiNormInf, &max, buffer);
  assert(e == NPP_NO_ERROR);
  printf("initial max: %d\n", max);

  // Compress labels to the consecutive range 1..max so they can be used
  // directly as array indices.  Bug fix: the buffer-size query takes the
  // largest label ID produced by the labeling step, not 1 — passing 1 can
  // undersize the scratch buffer for real images.
  int bs;
  e = nppiCompressMarkerLabelsGetBufferSize_16u_C1R (max, &bs);
  assert(e == NPP_NO_ERROR);
  if (bs>buffer_size){
    // reuse the labeling scratch buffer; grow it only if needed
    buffer_size = bs;
    cudaFree(buffer);
    ce = cudaMalloc(&buffer, buffer_size);
    assert(ce == cudaSuccess);}
  e = nppiCompressMarkerLabels_16u_C1IR(device_src, sizeof(Npp16u)*WIDTH, source_roi, max, &max, buffer);
  assert(e == NPP_NO_ERROR);

  // Per-label bounding-box extrema, initialized so the first atomic
  // min/max performed by any pixel of that label always wins.
  int *maxw, *maxh, *minw, *minh, *d_maxw, *d_maxh, *d_minw, *d_minh;
  maxw = new int[max];
  maxh = new int[max];
  minw = new int[max];
  minh = new int[max];
  ce = cudaMalloc(&d_maxw, max*sizeof(int));
  assert(ce == cudaSuccess);
  ce = cudaMalloc(&d_maxh, max*sizeof(int));
  assert(ce == cudaSuccess);
  ce = cudaMalloc(&d_minw, max*sizeof(int));
  assert(ce == cudaSuccess);
  ce = cudaMalloc(&d_minh, max*sizeof(int));
  assert(ce == cudaSuccess);
  for (int i = 0; i < max; i++){
    maxw[i] = 0;
    maxh[i] = 0;
    minw[i] = WIDTH;
    minh[i] = HEIGHT;}
  cudaMemcpy(d_maxw, maxw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_maxh, maxh, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minw, minw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minh, minh, max*sizeof(int), cudaMemcpyHostToDevice);
  dim3 block(32,32);
  dim3 grid((WIDTH+block.x-1)/block.x, (HEIGHT+block.y-1)/block.y);
  bb<<<grid, block>>>(device_src, d_maxh, d_minh, d_maxw, d_minw, HEIGHT, WIDTH);
  // catch launch-configuration errors; the blocking copies below surface
  // any asynchronous execution errors
  ce = cudaGetLastError();
  assert(ce == cudaSuccess);
  ce = cudaMemcpy(maxw, d_maxw, max*sizeof(int), cudaMemcpyDeviceToHost);
  assert(ce == cudaSuccess);
  ce = cudaMemcpy(maxh, d_maxh, max*sizeof(int), cudaMemcpyDeviceToHost);
  assert(ce == cudaSuccess);
  ce = cudaMemcpy(minw, d_minw, max*sizeof(int), cudaMemcpyDeviceToHost);
  assert(ce == cudaSuccess);
  ce = cudaMemcpy(minh, d_minh, max*sizeof(int), cudaMemcpyDeviceToHost);
  assert(ce == cudaSuccess);

  Npp16u *dst = new Npp16u[WIDTH * HEIGHT];
  ce = cudaMemcpy(dst, device_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);
  assert(ce == cudaSuccess);

  printf("*******INPUT************\n");
  my_print(host_src, WIDTH, HEIGHT);
  printf("******OUTPUT************\n");
  my_print(dst, WIDTH,HEIGHT);
  printf("compressed max: %d\n", max);
  printf("bounding boxes:\n");
  for (int i = 0; i < max; i++)
    printf("label %d, maxh: %d, minh: %d, maxw: %d, minw: %d\n", i+1, maxh[i], minh[i], maxw[i], minw[i]);

  // release device and host allocations (fixes leaks in the original)
  cudaFree(buffer);
  cudaFree(device_src);
  cudaFree(d_maxw);
  cudaFree(d_maxh);
  cudaFree(d_minw);
  cudaFree(d_minh);
  delete [] maxw;
  delete [] maxh;
  delete [] minw;
  delete [] minh;
  delete [] dst;
  return 0;
}
$ nvcc -o t1461 t1461.cu -lnppif
$ cuda-memcheck ./t1461
========= CUDA-MEMCHECK
initial max: 10
*******INPUT************
  0  0  0  0  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  *  0  0  *  0  0  0
  0  0  0  0  0  0  0  *  *  *  0  0  0  *  *  *
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  0  0  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  *  *  *  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  *  0  0  *  *  *  0  0  0  0  0  0
  0  0  *  *  *  0  *  *  *  *  *  0  0  0  0  0
  0  0  0  *  0  0  0  *  *  *  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  0  0  0
******OUTPUT************
  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  1  0  0  3  0  0  0
  0  0  0  0  0  0  0  1  1  1  0  0  0  3  3  3
  0  0  0  0  0  0  0  0  1  0  0  0  0  3  3  3
  0  0  0  4  0  0  0  0  0  0  0  0  0  3  3  3
  0  4  4  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  4  4  4  0  0  0  0  5  0  0  0  0  0  0  0
  0  4  4  4  4  0  0  5  5  5  0  0  0  0  0  0
  0  0  4  4  4  0  5  5  5  5  5  0  0  0  0  0
  0  0  0  4  0  0  0  5  5  5  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  5  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  0  0  0
compressed max: 7
bounding boxes:
label 1, maxh: 5, minh: 0, maxw: 9, minw: 6
label 2, maxh: 3, minh: 1, maxw: 3, minw: 1
label 3, maxh: 6, minh: 3, maxw: 15, minw: 12
label 4, maxh: 11, minh: 6, maxw: 4, minw: 1
label 5, maxh: 12, minh: 8, maxw: 10, minw: 6
label 6, maxh: 14, minh: 12, maxw: 15, minw: 13
label 7, maxh: 15, minh: 13, maxw: 3, minw: 1
========= ERROR SUMMARY: 0 errors
$

请注意,如果您要重复执行此操作(例如在视频的每一帧上识别边界框),您会希望把大部分 cudaMalloc 分配操作移出性能关键循环。

一种典型的方法是对 buffer 采用我在上面代码中已经展示的分配策略:只有当先前分配的大小不够时,才释放并重新分配缓冲区。对最大/最小值缓冲区也可以这样处理。

于 2019-08-14T01:33:00.417 回答