我有一个用于求和的内核。按照内核中声明的变量数量来估算,我原本以为每个线程总共只需要 5 个寄存器*。然而,对内核进行性能分析时却发现它使用了 34 个寄存器。我需要把寄存器数量减少到 30 个,才能让每个线程块运行 1024 个线程。
有人能看出问题出在哪里吗?
/**
 * Sums one contiguous bucket of `values` per thread block and writes the
 * result to `avgs[blockIdx.x]`.
 *
 * Launch layout: 1D grid, one block per output slot; 1D blocks of at most
 * 1024 threads (the size of the static shared buffer, 4 KB per block).
 *
 * Behaviour per block:
 *   - block 0              : avgs[0] = values[start_idx]
 *   - block resolution - 1 : avgs[resolution - 1] = values[end_idx - 1]
 *   - block >= resolution  : does nothing
 *   - every other block b  : sums the bucket that starts at relative index
 *                            floor(b * bk_size) + 1 and spans about bk_size
 *                            elements, clamped to (end_idx - start_idx).
 *
 * NOTE(review): despite the name `avgs`, the value stored for interior
 * blocks is the plain sum; no division by the element count happens here.
 *
 * @param values     input samples (device pointer)
 * @param bk_size    bucket width in elements; may be fractional
 * @param start_idx  index of the first sample of the processed range
 * @param end_idx    one past the last sample of the processed range
 * @param resolution number of output slots in `avgs`
 * @param avgs       output array (device pointer), one slot per block
 *
 * __launch_bounds__(1024) tells the compiler that blocks of up to 1024
 * threads must be launchable, so it limits per-thread register use
 * accordingly instead of optimising for a smaller block.
 */
__global__ void __launch_bounds__(1024)
sum_kernel(float* values, float bk_size, int start_idx, int end_idx, int resolution, float* avgs){
    // One partial sum per thread (assuming a maximum of 1024 threads).
    __shared__ float sums[1024];

    // Signed copy of the block index so comparisons with `resolution` are
    // not silently converted to unsigned (which broke the guard for
    // resolution < 2).
    const int bucket = static_cast<int>(blockIdx.x);

    // Boundary slots copy a single sample. These branches depend only on the
    // block index, so a whole block leaves together and the barrier further
    // down is never reached by only part of a block.
    if (bucket == 0) {
        if (threadIdx.x == 0) avgs[bucket] = values[start_idx];
        return;
    }
    if (bucket == resolution - 1) {
        if (threadIdx.x == 0) avgs[bucket] = values[end_idx - 1];
        return;
    }
    if (bucket >= resolution) {
        return;
    }

    // Number of samples in the processed range; upper clamp for every slice.
    const unsigned int count = static_cast<unsigned int>(end_idx - start_idx);

    // First relative index of this block's bucket.
    const unsigned int idx_prev =
        static_cast<unsigned int>(floorf(bucket * bk_size)) + 1u;

    // Width of one thread's slice (may be fractional).
    const float step = bk_size / blockDim.x;

    // Both slice ends come from the same expression, so thread t's `to`
    // equals thread t+1's `from`: slices tile the bucket with no gaps or
    // overlap even when `step` is fractional.
    const unsigned int from =
        idx_prev + static_cast<unsigned int>(threadIdx.x * step);
    unsigned int to =
        idx_prev + static_cast<unsigned int>((threadIdx.x + 1u) * step);
    to = (to < count) ? to : count;

    // Accumulate in a register and store to shared memory once.
    float partial = 0.0f;
    for (unsigned int i = from; i < to; ++i) {
        partial += values[i + start_idx];
    }
    sums[threadIdx.x] = partial;
    __syncthreads();

    // Thread 0 adds up the partial sums. Only the blockDim.x entries that
    // were actually written are read, including its own sums[0].
    if (threadIdx.x != 0) return;
    float sum = 0.0f;
    for (unsigned int t = 0; t < blockDim.x; ++t) {
        sum += sums[t];
    }
    avgs[bucket] = sum;
}
* 假设每个指针占用 2 个寄存器,每个 unsigned int 占用 1 个寄存器,并且内核参数存储在常量内存中。