0

I am using Nvidia Nsight to debug my code and I noticed that my distance kernel is sometimes not launching. The code attempts to find the distance between two polygons. It does this by going through each segment of each polygon and finding the distance between the two segments. The following is my code:

__device__ double point_segment_distance(double px, double py, double x1, double y1, double x2, double y2)
{
    double dx = x2 - x1;
    double dy = y2 - y1;
    //
    //if (dx < 0.01 && dy < 0.01)
    //{
    //  return hypot(px - x1, py - y1);
    //}
    //double t = ((px - x1) * dx + (py - y1) * dy) / (dx * dx + dy * dy);

    //if (t < 0)
    //{
    //  dx = px - x1;
    //  dy = py - y1;
    //}
    //else if (t > 1)
    //{
    //  dx = px - x2;
    //  dy = py - y2;
    //}
    //else
    //{
    //  double near_x = x1 + t * dx;
    //  double near_y = y1 + t * dy;
    //  dx = px - near_x;
    //  dy = py - near_y;
    //}

    //return hypot(dx, dy);
    return 10.0;
}

__device__ bool segments_intersect(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    double dx1 = x12 - x11;
    double dy1 = y12 - y11;
    double dx2 = x22 - x21;
    double dy2 = y22 - y21;
    double delta = dx2 * dy1 - dy2 * dx1;
    if (delta < 0.01)
    {
        return false;
    }
    double s = (dx1 * (y21 - y11) + dy1 * (x11 - x21)) / delta;
    double t = (dx2 * (y11 - y21) + dy2 * (x21 - x11)) / (-delta);
    return (0 <= s && s <= 1 && 0 <= t && t <= 1);
}

__device__ double segments_distance(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    if (segments_intersect(x11, y11, x12, y12, x21, y21, x22, y22))
    {
        return 0.0;
    }
    double minimumDist = 999999;
    double tempDist = point_segment_distance(x11, y11, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x12, y12, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x21, y21, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x22, y22, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    return minimumDist;
}

__global__ void distance(double *x0, double *y0, double *x1, double *y1, double *dist, int *length0, int *length1, int *numDone)
{
    int numComp = threadIdx.x + blockDim.x*blockIdx.x + *numDone;
    int index = threadIdx.x + blockDim.x*blockIdx.x;
    dist[index] = 99999;
    if  (numComp < ((*length0)*(*length1)))
    {
        int spot0 = numComp%(*length0);
        int spot1 = numComp/(*length0);
        dist[index] = segments_distance(x0[spot0], y0[spot0], x0[(spot0+1)%(*length0)], y0[(spot0+1)%(*length0)], x1[spot1], y1[spot1], x1[(spot1+1)%(*length1)], y1[(spot1+1)%(*length1)]);
    }
}

void gpuDistance(double *x0, double *y0, double *x1, double *y1)
{
    ...
    distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
    ...
}

I commented out much of point_segment_distance in order to help me locate the error. This will not launch the distance kernel. I know this because I am using Nsight Cuda Debugging and it doesn't hit my breakpoints.

However, if I comment the line "double dy = y2 - y1;" in point_segment_distance the distance kernel will launch. How is this possible? Why would creating one more double cause the kernel to not launch. Is there a limit to the number of doubles that may be created on the GPU. I have a Tesla 2075. I am aware of the local memory limit of 512kb. However, looking at my code I can't imagine that I'm anywhere near that limit. Thanks for any help!

4

1 回答 1

2

没有遇到断点并不意味着内核没有被执行,因为编译器可以自由地对代码进行积极的优化。要检查内核启动的正确性,您应该更好地执行 talonmies 帖子意义上的规范 CUDA 错误检查

使用 CUDA 运行时 API 检查错误的规范方法是什么?

要了解编译器可以执行的优化,请考虑以下代码

__global__ void point_segment_distance(double* distance_squared, const double* __restrict__ x1, const double* __restrict__ y1, const double* __restrict__ x2, const double* __restrict__ y2)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    double dx = x2[i] - x1[i];
    double dy = y2[i] - y1[i];

    //distance_squared[i] = dx*dx+dy*dy;
}

请注意注释说明。当说明这样的指令时,内核函数中的所有内容都变成了死代码,因为它不会对全局内存数据做出贡献,并被编译器消除。事实上,反汇编代码变成

/*0000*/        MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */ 
/*0008*/        EXIT ;                 /* 0x8000000000001de7 */ 

当上述指令未注释时,编译器将产生

/*0000*/        MOV R1, c[0x1][0x100];                /* 0x2800440400005de4 */
/*0008*/        S2R R2, SR_CTAID.X;                   /* 0x2c00000094009c04 */
/*0010*/        S2R R3, SR_TID.X;                     /* 0x2c0000008400dc04 */
/*0018*/        MOV32I R0, 0x8;                       /* 0x1800000020001de2 */
/*0020*/        IMAD R18, R2, c[0x0][0x8], R3;        /* 0x2006400020249ca3 */  
/*0028*/        IMAD R8.CC, R18, R0, c[0x0][0x38];    /* 0x20018000e1221ca3 */
/*0030*/        IMAD.HI.X R9, R18, R0, c[0x0][0x3c];  /* 0x20808000f1225ce3 */
/*0038*/        IMAD R16.CC, R18, R0, c[0x0][0x40];   /* 0x2001800101241ca3 */
/*0040*/        LD.E.64 R10, [R8];                    /* 0x8400000000829ca5 */
/*0048*/        IMAD.HI.X R17, R18, R0, c[0x0][0x44]; /* 0x2080800111245ce3 */
/*0050*/        IMAD R12.CC, R18, R0, c[0x0][0x30];   /* 0x20018000c1231ca3 */
/*0058*/        LD.E.64 R4, [R16];                    /* 0x8400000001011ca5 */
/*0060*/        IMAD.HI.X R13, R18, R0, c[0x0][0x34]; /* 0x20808000d1235ce3 */
/*0068*/        IMAD R6.CC, R18, R0, c[0x0][0x28];    /* 0x20018000a1219ca3 */
/*0070*/        LD.E.64 R2, [R12];                    /* 0x8400000000c09ca5 */
/*0078*/        IMAD.HI.X R7, R18, R0, c[0x0][0x2c];  /* 0x20808000b121dce3 */
/*0080*/        LD.E.64 R14, [R6];                    /* 0x8400000000639ca5 */
/*0088*/        DADD R2, R4, -R2;                     /* 0x4800000008409d01 */
/*0090*/        DMUL R6, R2, R2;                      /* 0x5000000008219c01 */
/*0098*/        DADD R4, R10, -R14;                   /* 0x4800000038a11d01 */
/*00a0*/        IMAD R2.CC, R18, R0, c[0x0][0x20];    /* 0x2001800081209ca3 */
/*00a8*/        DFMA R4, R4, R4, R6;                  /* 0x200c000010411c01 */
/*00b0*/        IMAD.HI.X R3, R18, R0, c[0x0][0x24];  /* 0x208080009120dce3 */
/*00b8*/        ST.E.64 [R2], R4;                     /* 0x9400000000211ca5 */
/*00c0*/        EXIT ;                                /* 0x8000000000001de7 */

并且代码不再死了。

于 2013-10-22T20:45:05.900 回答