0

OpenCL新手问题:)

我正在尝试编写一个opencl内核,例如

__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    //Do some stuff
}

在我尝试放入循环之前,它工作正常。即:

__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    for (int i = 0; i < 2; i++)
    {
        //Do some stuff
    }
}

这导致我的计算机崩溃显示器变黑。我认为问题在于我向显卡发送了太多工作。

我的完整代码如下所示

#ifdef cl_khr_fp64
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
    #error "Double precision doubleing point not supported by OpenCL implementation."
#endif

int2 clipPixel(int2 coordinate, int width, int height)
{
    coordinate.x = max(0, coordinate.x);
    coordinate.y = max(0, coordinate.y);
    coordinate.x = min(width, coordinate.x); //1911
    coordinate.y = min(height, coordinate.y); //1071
    return coordinate;
}

int Coord2Index(int X, int Y, int width)
{
    return (width * Y) + X;
}



//2D Gaussian 'bubble' Function
double f(int x, int y, double a, double b, double s)
{
    return a + b*exp(-(x*x+y*y)/(s*s));
}

// (∂f/∂b)
double dfdb(int x, int y, double s)
{
    return exp(-(x*x+y*y)/(s*s));
}

// (∂f/∂σ)
double dfds(int x, int y, double b, double s)
{
    double v = -(x*x + y*y);
    return b * exp(v/(s*s))*-2*v/(s*s*s);
}

//Non-Linear Least Squares 
__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    int index = Coord2Index( x, y, 1912 );
    int jacIndex = 0;
    int dyIndex = 0;
    int indexRslt = Coord2Index( x, y, 1904 );

    double dY[81];
    double J[81][3];
    double JTJ[3][3];
    double3 B = (double3)(0, 1, 1); //initial guess
    double JTdY[3]; 

    //Creates the dY vector
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
            dyIndex = dyIndex + 1;
        }
    }

    //Creates the Jacobian 
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            index = Coord2Index( x + i + 4, y + j + 4, 1912 );

            J[jacIndex][0] = 1;
            J[jacIndex][1] = dfdb(i, j, B.z);
            J[jacIndex][2] = dfds(i, j, B.y, B.z);

            jacIndex = jacIndex + 1;
        }
    }

    //Now to solve (JT * J) * ΔB = JT * ΔY   for ΔB  ....

    JTdY[0] = 0;
    JTdY[1] = 0;
    JTdY[2] = 0;
    //Create JTJ
    for (int i = 0; i < 81; i++)
    {
        JTJ[0][0] = J[i][0] * J[i][0];
        JTJ[0][1] = J[i][0] * J[i][1];
        JTJ[0][2] = J[i][0] * J[i][2];

        JTJ[1][0] = J[i][1] * J[i][0];
        JTJ[1][1] = J[i][1] * J[i][1];
        JTJ[1][2] = J[i][1] * J[i][2];

        JTJ[2][0] = J[i][2] * J[i][0];
        JTJ[2][1] = J[i][2] * J[i][1];
        JTJ[2][2] = J[i][2] * J[i][2];

        //JT * ΔY
        JTdY[0] = J[i][0] * dY[i];
        JTdY[1] = J[i][1] * dY[i];
        JTdY[2] = J[i][2] * dY[i];
    }

    //TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
    // Also not sure what to do when det(A) = 0  (is that even possible?)

    // (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
    // A = (JT * J), ΔB = x, JT * ΔY = b
    //Solve using cramer's rule   http://en.wikipedia.org/wiki/Cramer%27s_rule
    // xi = det(Ai)/det(A)

    //determinant of A
    double detA = 
    JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    double detA1 =
      JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (  JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) + 
    JTJ[0][2] * (  JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2]  ) ;

    double detA2 = 
    JTJ[0][0] * (JTdY[1]   * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) -
    JTdY[0]   * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) ;

    double detA3 = 
    JTJ[0][0] * (JTJ[1][1] * JTdY[2]   - JTdY[1]   * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) + 
    JTdY[0]   * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    // B(k+1) = B(k) + ΔB
    B.x = B.x + (detA1/detA);
    B.y = B.y + (detA2/detA);
    B.z = B.z + (detA3/detA);

    nllsqResult[indexRslt] = B.z;
}

我想这样使用for循环

//Non-Linear Least Squares 
__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    int index = Coord2Index( x, y, 1912 );
    int jacIndex = 0;
    int dyIndex = 0;
    int indexRslt = Coord2Index( x, y, 1904 );

    double dY[81];
    double J[81][3];
    double JTJ[3][3];
    double3 B = (double3)(0, 1, 1); //initial guess
    double JTdY[3]; 

    //Creates the dY vector
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
            dyIndex = dyIndex + 1;
        }
    }

      for (int iters = 0; iters < 10; iters++) //FOR LOOP ADDED HERE
      {
      jacIndex = 0;
    //Creates the Jacobian 
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            index = Coord2Index( x + i + 4, y + j + 4, 1912 );

            J[jacIndex][0] = 1;
            J[jacIndex][1] = dfdb(i, j, B.z);
            J[jacIndex][2] = dfds(i, j, B.y, B.z);

            jacIndex = jacIndex + 1;
        }
    }

    //Now to solve (JT * J) * ΔB = JT * ΔY   for ΔB  ....

    JTdY[0] = 0;
    JTdY[1] = 0;
    JTdY[2] = 0;
    //Create JTJ
    for (int i = 0; i < 81; i++)
    {
        JTJ[0][0] = J[i][0] * J[i][0];
        JTJ[0][1] = J[i][0] * J[i][1];
        JTJ[0][2] = J[i][0] * J[i][2];

        JTJ[1][0] = J[i][1] * J[i][0];
        JTJ[1][1] = J[i][1] * J[i][1];
        JTJ[1][2] = J[i][1] * J[i][2];

        JTJ[2][0] = J[i][2] * J[i][0];
        JTJ[2][1] = J[i][2] * J[i][1];
        JTJ[2][2] = J[i][2] * J[i][2];

        //JT * ΔY
        JTdY[0] = J[i][0] * dY[i];
        JTdY[1] = J[i][1] * dY[i];
        JTdY[2] = J[i][2] * dY[i];
    }

    //TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
    // Also not sure what to do when det(A) = 0  (is that even possible?)

    // (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
    // A = (JT * J), ΔB = x, JT * ΔY = b
    //Solve using cramer's rule   http://en.wikipedia.org/wiki/Cramer%27s_rule
    // xi = det(Ai)/det(A)

    //determinant of A
    double detA = 
    JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    double detA1 =
      JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (  JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) + 
    JTJ[0][2] * (  JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2]  ) ;

    double detA2 = 
    JTJ[0][0] * (JTdY[1]   * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) -
    JTdY[0]   * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) ;

    double detA3 = 
    JTJ[0][0] * (JTJ[1][1] * JTdY[2]   - JTdY[1]   * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) + 
    JTdY[0]   * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    // B(k+1) = B(k) + ΔB
    B.x = B.x + (detA1/detA);
    B.y = B.y + (detA2/detA);
    B.z = B.z + (detA3/detA);
      }

    nllsqResult[indexRslt] = B.z;
}
4

1 回答 1

1

您的内核需要很长时间,并且 Windows 的超时检测和恢复机制会启动。您可以通过更改注册表值来禁用 TDR,如下所述:MSDN但是,如果您可以使用 TDR,您的屏幕可能会挂起,直到计算您的内核完成了。如果您的内核中有一个无限循环,那么没有什么可以阻止它,并且由于您的计算机没有任何响应,因此终止任务将非常困难。很好,有电源和重置按钮。

于 2013-10-18T06:01:17.137 回答