cuda - CUDA 上的线路检测

Question

我正在尝试使用 CUDA 进行实时线路检测。我已经计算了霍夫变换以及每个 bin 的最小、最大线坐标。为了获取线段，我正在跟踪（使用 Bresenham 的线算法）通过最小到最大点并获取每个 bin 上的线段。当霍夫阈值很低并且图像中有很多行时， trace_lines 需要很多时间才能完成。

在 GTX 660 上，霍夫变换 (hough_line_transform) 计算每帧 (1280x720) 大约需要 5-10 毫秒（观察到比 CPU 实现快 10 倍）。但是从最小、最大点跟踪线段需要 1ms-15ms。

我有两个关于线路检测的问题

是否存在更好的算法来从霍夫箱的最小、最大点获取线段？
是否可以进一步优化hought_line_transform（参见下面的代码）？我正在使用原子操作。是否有可能避免原子。

我附上下面的代码。

类头

#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_

#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>

union Pos;
struct Line;

struct Hough_params
{
    int w;
    int h;
    int r;
};

class Hough_lines
{
public:
    enum Type {INT, SHORT_INT, FLOAT};

    Hough_lines(int _w, int _h);
    ~Hough_lines();

public:
    bool init();
    bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);

protected:
    void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
    void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
    void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
    void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);

    static void compute_trig_funcs();

protected:
    Hough_params params;
    thrust::device_vector<Hough_params> d_param;

    static bool trig_init;
};

#endif

身体

#include <hough_lines.h>

#include <math.h>
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cuda_gl_interop.h>

#include <thrust/host_vector.h>

#include <thrust/copy.h>
#include <thrust/scan.h>

#define ANGLE_SIZE 360
#define MAX_LINE_PER_THREAD 10

union Pos
{
    struct
    {
        uint16_t x;
        uint16_t y;
    };
    uint32_t value;
};

struct Hough_info
{
    Pos end;
    Pos start;
    int count;
};

struct Line
{
    Pos start;
    Pos end;
};

struct Line_info
{
    int line_count;
    Line line[MAX_LINE_PER_THREAD];
};

__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];

texture<uint8_t, 2, cudaReadModeElementType> luma_tex;

bool Hough_lines::trig_init = false;

__global__ void mark_edges(const Hough_params* param, int* edge)
{
    int x = (blockIdx.x*blockDim.x+threadIdx.x);
    int y = (blockIdx.y*blockDim.y+threadIdx.y);
    int pos = x+(param->w*y);   
    edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
}

__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
    int index;
    int x = (blockIdx.x*blockDim.x+threadIdx.x);
    int y = (blockIdx.y*blockDim.y+threadIdx.y);
    int pos = x+(param->w*y);   
    if (255 == tex2D(luma_tex, x, y))
    {
        index = edge[pos];
        coord[index].y = y;
        coord[index].x = x;
    }
}

__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
    int i;
    int angle;
    int rdata;
    __shared__ Hough_info sh_rho_data[1001];

    i = threadIdx.x;
    while (i < param->r)
    {
        sh_rho_data[i].end.value = 0x0;
        sh_rho_data[i].start.value = 0xFFFFFFFF;
        sh_rho_data[i].count = 0;

        i += blockDim.x;
    }
    __syncthreads();

    i = threadIdx.x;
    angle = blockIdx.x;
    const float cos_angle = dev_cost[angle];
    const float sin_angle = dev_sint[angle];
    while (i < size)
    {
        rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
        if (rdata >= 0)
        {
            atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
            atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);

            atomicAdd(&sh_rho_data[rdata].count, 1);
        }
        i += blockDim.x;
    }

    __syncthreads();

    i = threadIdx.x;
    rdata = (angle*param->r);
    while (i < param->r)
    {
        memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
        mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
        i += blockDim.x;
    }
}

__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
    int pos;
    int i = threadIdx.x;
    int offset = (blockIdx.x*param->r);
    while (i < param->r)
    {
        if (hdata[offset+i].count >= threshold)
        {
            pos = mark[offset+i];
            lines[pos].start.value = hdata[offset+i].start.value;
            lines[pos].end.value = hdata[offset+i].end.value;
        }
        i += blockDim.x;
    }
}

__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
    int d = abs(xe-xs)+abs(ye-ys);
    if ((d >= min_len) && (line->line_count < MAX_LINE_PER_THREAD))
    {
        line->line[line->line_count].start.x = xs;
        line->line[line->line_count].start.y = ys;
        line->line[line->line_count].end.x = xe;
        line->line[line->line_count].end.y = ye;

        ++line->line_count;

        //printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, d);
    }
} 

__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
    int d;
    int dsub;
    int dstep;
    int xstep;
    int ystep;

    int xs, ys, xe, ye;
    int i = (blockIdx.x*blockDim.x+threadIdx.x);
    if (i >= inp_size)
    {
        return;
    }

    xs = input[i].start.x;
    ys = input[i].start.y;
    xe = input[i].end.x;
    ye = input[i].end.y;

    line_info[i].line_count = 0;

    int dx = abs(xe-xs);
    int dy = abs(ye-ys);
    int xinc = (xe > xs)?1:-1;
    int yinc = (ye > ys)?1:-1;

    int gap = 0;
    bool sflag;
    int s_x, s_y, e_x, e_y;

    if (dx > dy)
    {
        dsub = (dx<<1);
        dstep = (dy<<1);
        d = dstep-dx;
        xstep = xinc;
        ystep = 0;
        xinc = 0;
    }
    else
    {
        dsub = (dy<<1);
        dstep = (dx<<1);
        d = dstep-dy;
        xstep = 0;
        ystep = yinc;
        yinc = 0;
    }

    sflag = true;
    s_x = xs;
    s_y = ys;
    e_x = xs;
    e_y = ys;

    int x = xs;
    int y = ys;

    while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
    {
        x += xstep;
        y += ystep;
        if (d > 0)
        {
            x += xinc;
            y += yinc;

            d -= dsub;
        }
        d += dstep;

        if (255 == tex2D(luma_tex, x, y))
        {
            e_x = x;
            e_y = y;
            gap = 0;

            if (!sflag)
            {
                s_x = x;
                s_y = y;
                sflag = true;
            }
        }
        else if (sflag)
        {
            ++gap;
            if (gap >= min_gap)
            {
                sflag = false;
                add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
            }
        }
    }

    if (sflag)
    {
        add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
    }
    mark[i] = line_info[i].line_count;
}

__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
    int index = (blockIdx.x*blockDim.x+threadIdx.x);
    if (index >= size)
    {
        return;
    }

    int pos;
    int start = 4*mark[index];
    Line* line_data = &line[index].line[0];
    for (int i = 0; i < line[index].line_count; i++)
    {
        pos = start+(4*i);
        coords[pos] = line_data[i].start.x-(param->w>>1);
        coords[pos+1] = (param->h>>1)-line_data[i].start.y;
        coords[pos+2] = line_data[i].end.x-(param->w>>1);
        coords[pos+3] = (param->h>>1)-line_data[i].end.y;
    }

    if ((index+1) == size)
    {
        *count = mark[index];
    }
}

Hough_lines::Hough_lines(int _w, int _h)
    :d_param(1)
{
    params.w = _w;
    params.h = _h;
    params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
    thrust::copy_n(&params, 1, d_param.begin());
}

Hough_lines::~Hough_lines()
{
}

bool Hough_lines::init()
{
    if (false == trig_init)
    {
        trig_init = true;
        compute_trig_funcs();
    }
    return true;
}

void Hough_lines::compute_trig_funcs()
{
    float theta;
    cudaError_t err = cudaSuccess;
    static float sint[ANGLE_SIZE];
    static float cost[ANGLE_SIZE];

    for (int i = 0; i < ANGLE_SIZE; i++)
    {
        theta = (M_PI*(float)i)/180.0;
        sint[i] = sin(theta);
        cost[i] = cos(theta);
    }

    err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
    err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
    if (cudaSuccess != err)
    {
        printf("\n%s", cudaGetErrorString(cudaGetLastError()));
    } 
}

void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
    dim3 bsize(16, 16);
    dim3 gsize(params.w/bsize.x, params.h/bsize.y);
    thrust::device_vector<int> d_mark(params.w*params.h);

    size = 0;
    mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
        thrust::raw_pointer_cast(d_mark.data()));
    thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
    get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
        thrust::raw_pointer_cast(d_mark.data()),
        thrust::raw_pointer_cast(d_coords.data()));
    thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}

void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
    int edge_count = 0;
    thrust::device_vector<Pos> d_coords(params.w*params.h); 
    get_edges(d_coords, edge_count);

    thrust::device_vector<int> d_mark(params.r*360);
    thrust::device_vector<Hough_info> d_hough_data(params.r*360);
    hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
        edge_count,
        thrust::raw_pointer_cast(d_coords.data()), threshold,
        thrust::raw_pointer_cast(d_mark.data()), 
        thrust::raw_pointer_cast(d_hough_data.data())); 
    thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
    ::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
        threshold,
        thrust::raw_pointer_cast(d_hough_data.data()),
        thrust::raw_pointer_cast(d_mark.data()),
        thrust::raw_pointer_cast(d_lines.data()));
    thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}

void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
    thrust::device_vector<int> d_mark_line(size);
    thrust::device_vector<Line_info> d_nlines(size);

    trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()), 
        size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
        thrust::raw_pointer_cast(d_mark_line.data()));

    thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());

    thrust::device_vector<int> d_count(1);
    copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
        thrust::raw_pointer_cast(d_nlines.data()), size,
        thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
        thrust::raw_pointer_cast(d_count.data()));

    thrust::copy(d_count.begin(), d_count.end(), &count);
    //printf("\nLine count: %d", count);
}

void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
    int* d_line_coord = 0;
    cudaGLRegisterBufferObject(line);
    cudaGLMapBufferObject((void **)&d_line_coord, line);

    int size = 0;
    thrust::device_vector<Line> d_lines(params.r*360); 
    get_hough_lines(threshold, d_lines, size);
    //printf("\nget_hough_lines: %d", size);

    trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);

    cudaGLUnmapBufferObject(line);
    cudaGLUnregisterBufferObject(line);
    }

    bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
    {
    cudaError_t err;
    cudaArray* array_edge;
    cudaGraphicsResource* res_edge;

    err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
    if (err != cudaSuccess)
    {
        printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
        exit(0);
    }

    cudaGraphicsMapResources(1, &res_edge);
    cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
    err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
    if (err != cudaSuccess)
    {
        printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
        exit(0);
    }

    if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
    {
        printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
        exit(0);
    }

    float time = 0.0;
    //static float max = 0.0;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    count = 0;
    get_lines(threshold, min_length, min_gap, line, type, count);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);

    //static int frame = 0;
    //frame++;
    //if (time > max)
    {
        //max = time;
        printf("\nElpased time: %f ms", time);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaUnbindTexture(luma_tex);   
    cudaGraphicsUnmapResources(1, &res_edge);
    cudaGraphicsUnregisterResource(res_edge);

    return true;
}

score 2 · Accepted Answer

在“前缀和及其应用”中，Guy Blelloch 描述了一种使用并行前缀和的并行画线算法。请参阅该论文的第 55 页，它可能会给您一些想法。

关于如何优化hough_line_transfer，我认为关键是消除循环中的共享内存原子。在您使用它们的地方，您实际上是在进行键控缩减。Thrust 提供了一个reduce_by_key函数，但它只能从主机调用。Thrust 对应的设备库是 CUB，但它没有reduce_by_key. 我已经向 CUB 作者询问了这里的想法，如果我们有任何想法，我会更新这个答案。

您可以编写自己的键控归约，但如果可能的话，依赖库会更有效率和更健壮。

cuda - CUDA 上的线路检测

1 回答 1

Related

Reference