0

如何计算下面这个函数执行所需的周期数?我应该直接统计加法、乘法和除法的数量吗?在哪里可以查到 CUDA 中一次加法需要多少个周期?

/*
 * Renders a black/white Mandelbrot image: each pixel is written as 255 when
 * its point never escapes within c_iterations steps, and as 0 otherwise.
 *
 * image: output buffer indexed as image[row * c_cols + col].
 *
 * c_rows, c_cols and c_iterations are declared outside this block
 * (NOTE(review): presumably __constant__ symbols set by the host -- confirm).
 *
 * Each thread starts at its own 2D global index and advances by the total
 * number of launched threads along each axis (grid-stride in x and y), so any
 * launch configuration covers the whole image.
 */
__global__
void mandelbrotSet_per_element(Grayscale *image){
    // Window of the complex plane; the imaginary extent follows the image
    // aspect ratio.
    const float minR = -2.0f, maxR = 1.0f;
    const float minI = -1.2f;
    const float maxI = minI + (maxR - minR) * c_rows / c_cols;

    // Size of one pixel step along each axis of the complex plane.
    const float realFactor = (maxR - minR) / (c_cols - 1);
    const float imagFactor = (maxI - minI) / (c_rows - 1);

    // First pixel handled by this thread.
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;

    for (int row = firstRow; row < c_rows; row += blockDim.y * gridDim.y) {
        for (int col = firstCol; col < c_cols; col += blockDim.x * gridDim.x) {
            // Complex constant c for this pixel; row 0 maps to maxI.
            const float c_real = minR + col * realFactor;
            const float c_imag = maxI - row * imagFactor;

            // Iterate z <- z^2 + c starting from z = c.
            float z_real = c_real;
            float z_imag = c_imag;
            bool escaped = false;

            for (int k = 0; k < c_iterations; k++) {
                const float z_real2 = z_real * z_real;
                const float z_imag2 = z_imag * z_imag;

                // |z|^2 > 4 means the orbit has left the radius-2 disc.
                if (z_real2 + z_imag2 > 4.0f) {
                    escaped = true;
                    break;
                }

                // The imaginary part must be updated first: it needs the old
                // z_real.
                z_imag = 2.0f * z_real * z_imag + c_imag;
                z_real = z_real2 - z_imag2 + c_real;
            }

            image[row * c_cols + col] = escaped ? 0 : 255;
        }
    }
}
4

1 回答 1

0

各类指令的吞吐量在编程指南中有说明(见此处)。

您还可以尝试使用此处描述的内置函数 clock() 来测量一段指令序列的执行周期。

编译器往往会掩盖源代码层面的实际运算次数(使表面上的算术强度增加,也可能减少),因此如果您想准确了解机器实际执行了什么,可能需要检查 PTX(nvcc -ptx ...),或者检查称为 SASS 的机器汇编级代码;您可以使用 cuobjdump 实用程序从可执行文件中提取 SASS。

于 2013-05-18T17:09:10.527 回答