c++ - 2D CUDA 中值滤波器优化

Question

我在 CUDA 中实现了一个 2D 中值滤波器，整个程序如下所示。

#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>  
#include <fstream>   
#include <iomanip>   
#include <windows.h>
#include <io.h>                  
#include <stdio.h>
#include<conio.h>
#include <cstdlib>
#include "cstdlib"
#include <process.h>
#include <stdlib.h>
#include <malloc.h>
#include <ctime>
using namespace std;

#define MEDIAN_DIMENSION  3 // Side length of the square filter window (3 means 3 x 3). Other odd sizes such as 5, 7 or 9 may be used.
#define MEDIAN_LENGTH 9   // Number of samples in the window; must equal MEDIAN_DIMENSION x MEDIAN_DIMENSION (3 x 3 = 9 here).

#define BLOCK_WIDTH 16  // Thread-block width. Should be 8 for windows larger than 5 x 5, otherwise the error "uses too much shared data" is reported for surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH].
#define BLOCK_HEIGHT 16// Thread-block height. Should be 8 for windows larger than 5 x 5, otherwise the error "uses too much shared data" is reported for surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH].

 /*
  * In-place median filter over a row-major image of 16-bit pixels.
  *
  * Each thread handles one pixel (x, y): it gathers the
  * MEDIAN_DIMENSION x MEDIAN_DIMENSION window centred on the pixel into its own
  * row of the shared scratch array, partially sorts that row and writes the
  * median back to the same location of Device_ImageData.
  *
  * Launch layout: 2D grid of 2D blocks covering the image; the block must hold
  * at most BLOCK_WIDTH * BLOCK_HEIGHT threads because every thread owns one row
  * of surround[][].
  * Shared memory: BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH unsigned shorts.
  *
  * Pixels closer than MEDIAN_DIMENSION / 2 to an image edge are left untouched.
  *
  * NOTE(review): the filter reads its neighbours from the same buffer it
  * writes to, so a thread may pick up values that another thread has already
  * replaced. Removing that race requires a separate output buffer, which would
  * change the kernel's interface.
  */
 __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){

    // One private scratch row per thread of the block.
    __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];

    // Number of smallest elements that must be in place to know the median.
    const int Half_Of_MEDIAN_LENGTH = (MEDIAN_LENGTH/2)+1;
    // Window radius: how far the window extends on each side of the centre.
    const int StartPoint = MEDIAN_DIMENSION/2;

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Row-major index of the thread inside its block (row stride is blockDim.x).
    const int tid = threadIdx.y*blockDim.x + threadIdx.x;

    // Threads of partial edge blocks that fall outside the image do nothing.
    if (x >= Image_Width || y >= Image_Height)
        return;

    // Skip every pixel whose window would reach outside the image. The range
    // test (instead of an equality test) keeps windows larger than 3 x 3 in bounds.
    if (x < StartPoint || x >= Image_Width - StartPoint ||
        y < StartPoint || y >= Image_Height - StartPoint)
        return;

    // Gather the window; the sample order is irrelevant for the median.
    int iterator = 0;
    for (int r = x - StartPoint; r <= x + StartPoint; r++) {
        for (int c = y - StartPoint; c <= y + StartPoint; c++) {
            surround[tid][iterator] = Device_ImageData[c*Image_Width + r];
            iterator++;
        }
    }

    // Partial selection sort: only the first Half_Of_MEDIAN_LENGTH positions
    // are put in order, which is enough to place the median.
    for (int i = 0; i < Half_Of_MEDIAN_LENGTH; ++i) {
        // Find the position of the minimum of the unsorted tail.
        int minIdx = i;
        for (int l = i + 1; l < MEDIAN_LENGTH; ++l)
            if (surround[tid][l] < surround[tid][minIdx])
                minIdx = l;
        // Move that minimum to position i.
        const unsigned short temp = surround[tid][i];
        surround[tid][i] = surround[tid][minIdx];
        surround[tid][minIdx] = temp;
    }

    // The median is the middle element (index 4 for a 3 x 3 window).
    // No barrier is needed: no thread reads another thread's scratch row.
    Device_ImageData[y*Image_Width + x] = surround[tid][Half_Of_MEDIAN_LENGTH-1];
}

  // Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        cout << "CUDA error during " << what << ": " << cudaGetErrorString(status) << "\n";
        exit(EXIT_FAILURE);
    }
}

// Reads a raw 1580 x 1050 image of 16-bit pixels, median-filters it on the GPU
// with MedianFilter_gpu, reports timings and writes the filtered image to disk.
// Returns 0 on success and 1 when the input file is missing or too small.
int main( int argc, const char** argv )
{
    const int Image_Width = 1580;
    const int Image_Height = 1050;

    ifstream is; // Read File
    is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );
    if (!is.is_open()) {
        cout << "Unable to open the input image file\n";
        return 1;
    }

    // Get the length of the file in bytes.
    is.seekg (0, ios::end);
    const int dataLength = static_cast<int>(is.tellg());
    is.seekg (0, ios::beg);

    // The kernel addresses Image_Width x Image_Height pixels, so the file must hold at least that many.
    const int requiredBytes = Image_Width * Image_Height * static_cast<int>(sizeof(unsigned short));
    if (dataLength < requiredBytes) {
        cout << "Input image file is smaller than the expected image size\n";
        return 1;
    }

    // Round up so that an odd byte count still fits into the buffer.
    unsigned short* Host_ImageData = new unsigned short[(dataLength + 1) / sizeof(unsigned short)];
    is.read ((char*)Host_ImageData,dataLength);
    is.close();

    unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
    if (Host_ResultData == NULL) {
        cout << "Unable to allocate the host result buffer\n";
        delete[] Host_ImageData;
        return 1;
    }
    unsigned short *Device_ImageData = NULL;

    /////////////////////////////
    // The first cudaMalloc call takes noticeably longer than later ones, so it
    // is issued (and timed) separately to keep it out of the measurements below.
    clock_t begin = clock();
    unsigned short *forFirstCudaMalloc = NULL;
    checkCuda(cudaMalloc( (void**)&forFirstCudaMalloc, dataLength ), "warm-up cudaMalloc");
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"First CudaMelloc time = "<<elapsed_secs<<"  Second\n" ;
    checkCuda(cudaFree( forFirstCudaMalloc ), "warm-up cudaFree");
    ////////////////////////////

    // Actual process starts from here.
    clock_t beginOverAll = clock();
    // dataLength is already a byte count, so it is used as the allocation size directly.
    checkCuda(cudaMalloc( (void**)&Device_ImageData, dataLength ), "cudaMalloc");
    checkCuda(cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice), "host-to-device copy");

    // Ceil-divide the image size by the block size so partial edge blocks are covered.
    const dim3 grid ((Image_Width + BLOCK_WIDTH - 1) / BLOCK_WIDTH,
                     (Image_Height + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT, 1);
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);

    begin = clock();

    MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
    checkCuda(cudaGetLastError(), "kernel launch");          // launch-configuration errors
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // errors raised while the kernel ran

    end = clock();
    elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"Process time = "<<elapsed_secs<<"  Second\n" ;

    // Copy the filtered image back to the host so it can be written to a file.
    checkCuda(cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost), "device-to-host copy");

    clock_t endOverall = clock();
    elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
    cout<<"Complete Time  = "<<elapsed_secs<<"  Second\n" ;

    ofstream of2;   // Write the filtered image into a file
    of2.open("D:\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Host_ResultData,dataLength);
    of2.close();
    cout<<"\nEnd of Writing File.  Press Any Key To Exit..!!";
    checkCuda(cudaFree(Device_ImageData), "cudaFree");
    delete[] Host_ImageData;   // allocated with new[]
    free(Host_ResultData);     // allocated with malloc

    getch();
    return 0;
}

这是我使用的文件的链接。我使用 ImageJ 以“原始”（raw）格式存储图像，读取“原始”图像时也是如此。我的图像每个像素为 16 位，类型为 unsigned short。图像的宽度为 1580，高度为 1050。

我坚信通过使用适当的 CUDA 优化可以使过滤器更加高效和快速。

事实上，我在 GeForce GT 520M 卡上运行，时间如下

1）MEDIAN_DIMENSION对于3 x 3 = 0.027 seconds

2）MEDIAN_DIMENSION对于5 x 5 = 0.206 seconds

3）MEDIAN_DIMENSION对于7 x 7 = 1.11 seconds

4）MEDIAN_DIMENSION对于9 x 9 = 4.931 seconds

正如你所看到的，随着 MEDIAN_DIMENSION 的增大，耗时增加了很多。我有一些应用程序通常使用更大的 MEDIAN_DIMENSION，例如 7 x 7 和 9 x 9。我认为，使用 CUDA 之后，即使是 9 x 9，耗时也应该少于 1 秒。

既然我认为排序部分在这里占用了大部分时间，我们可以让算法的排序部分更快吗？

我们可以更有效地使用 grid 和 block 吗？我可以使用更大的 BLOCK_WIDTH 和 BLOCK_HEIGHT（比如 32 和 32），同时仍然不超过我设备的最大 __shared__ 内存限制（4Kb）吗？

__shared__内存可以更有效地使用吗？

任何帮助将不胜感激。

提前致谢。

score 4 · Accepted Answer

我正在回答你关于使用共享内存的最后一个问题。

正如 Eric 已经注意到的，您对共享内存的使用并没有真正导致线程协作。

针对 3x3 的情况，我将您的解决方案与一个完全不使用共享内存的内核变体，以及《CUDA 中的 2D 中值滤波：如何有效地将全局内存复制到共享内存》一帖中讨论的 Accelereyes 解决方案进行了比较。

这是完整的代码：

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>

using namespace std;

#define BLOCK_WIDTH 16  // Thread-block width; used for the launch configuration and to size the kernels' shared-memory arrays
#define BLOCK_HEIGHT 16 // Thread-block height; used for the launch configuration and to size the kernels' shared-memory arrays

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Divides a by b and adds one to the quotient whenever the division leaves a
// remainder; used to compute how many blocks are needed to cover a dimension.
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) return quotient;
    return quotient + 1;
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the call site.
// The do/while(0) form makes the macro behave as a single statement.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the error string, file and line to stderr when code is not
// cudaSuccess and, if abort is true, terminates the program with that code.
// file is const because it receives the __FILE__ string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
// 3x3 median filter that stages a (BLOCK_HEIGHT+2) x (BLOCK_WIDTH+2) tile of the
// input (the block's pixels plus a one-pixel halo) in shared memory, so that
// neighbouring threads reuse each other's loads.
//
// Launch layout: 2D grid of blocks of exactly BLOCK_WIDTH x BLOCK_HEIGHT
// threads covering the image; partial edge blocks are allowed.
// Shared memory: (BLOCK_WIDTH+2) * (BLOCK_HEIGHT+2) unsigned shorts.
// Pixels outside the image are treated as 0.
// Input_Image and Output_Image are row-major with Image_Width pixels per row
// and must be different buffers.
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
    const int tx_l = threadIdx.x;                           // --- Local thread x index
    const int ty_l = threadIdx.y;                           // --- Local thread y index

    const int tx_g = blockIdx.x * blockDim.x + tx_l;        // --- Global thread x index
    const int ty_g = blockIdx.y * blockDim.y + ty_l;        // --- Global thread y index

    // --- Tile indexed [row][column] so that threads with consecutive x touch consecutive shared-memory elements
    __shared__ unsigned short smem[BLOCK_HEIGHT + 2][BLOCK_WIDTH + 2];

    // --- Image coordinates of the tile's top-left halo element
    const int tile_x0 = blockIdx.x * blockDim.x - 1;
    const int tile_y0 = blockIdx.y * blockDim.y - 1;

    // --- Cooperative tile load: the block's threads stride over all tile elements (centre and halo alike).
    // --- Every element is written exactly once, with 0 for positions outside the image.
    const int tile_elems    = (BLOCK_WIDTH + 2) * (BLOCK_HEIGHT + 2);
    const int block_threads = blockDim.x * blockDim.y;
    for (int i = ty_l * blockDim.x + tx_l; i < tile_elems; i += block_threads) {
        const int sx = i % (BLOCK_WIDTH + 2);               // --- Tile column
        const int sy = i / (BLOCK_WIDTH + 2);               // --- Tile row
        const int gx = tile_x0 + sx;                        // --- Image column
        const int gy = tile_y0 + sy;                        // --- Image row
        const bool inside = (gx >= 0) && (gx < Image_Width) && (gy >= 0) && (gy < Image_Height);
        smem[sy][sx] = inside ? Input_Image[gy * Image_Width + gx] : (unsigned short)0;
    }

    // --- All threads of the block reach this barrier; out-of-image threads leave only afterwards
    __syncthreads();

    if ((tx_g >= Image_Width) || (ty_g >= Image_Height)) return;

    // --- Pull the 3x3 window in a local array (this thread's pixel sits at smem[ty_l+1][tx_l+1])
    unsigned short v[9] = { smem[ty_l][tx_l],   smem[ty_l][tx_l+1],     smem[ty_l][tx_l+2],
                            smem[ty_l+1][tx_l], smem[ty_l+1][tx_l+1],   smem[ty_l+1][tx_l+2],
                            smem[ty_l+2][tx_l], smem[ty_l+2][tx_l+1],   smem[ty_l+2][tx_l+2] };

    // --- Partial exchange sort: only the five smallest values need to be in place
    for (int i = 0; i < 5; i++) {
        for (int j = i + 1; j < 9; j++) {
            if (v[i] > v[j]) { // swap?
                unsigned short tmp = v[i];
                v[i] = v[j];
                v[j] = tmp;
            }
        }
    }

    // --- Pick the middle one
    Output_Image[ty_g*Image_Width + tx_g] = v[4];
}

/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
// 3x3 median filter in which every thread uses its own row of a shared-memory
// array as scratch space for its window (no data is shared between threads).
//
// Launch layout: 2D grid of 2D blocks covering the image; a block may hold at
// most BLOCK_WIDTH * BLOCK_HEIGHT threads.
// Shared memory: BLOCK_WIDTH * BLOCK_HEIGHT * 9 unsigned shorts.
// Pixels on the image border, where the window does not fit, are copied from
// Input_Image unchanged so that every pixel of Output_Image is defined.
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;

    // --- Threads of partial edge blocks that fall outside the image do nothing
    if ((x >= Image_Width) || (y >= Image_Height)) return;

    // --- Border pixels: pass the input through
    if ((x == 0) || (y == 0) || (x == Image_Width - 1) || (y == Image_Height - 1)) {
        Output_Image[(y*Image_Width)+x] = Input_Image[(y*Image_Width)+x];
        return;
    }

    // --- Fill this thread's row of shared memory with the 3x3 window
    int iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[tid][iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Partial selection sort: place the five smallest values to find the median
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[tid][i];
        surround[tid][i]=surround[tid][minval];
        surround[tid][minval]=temp;
    }

    // --- Pick the middle one. No barrier is needed: no thread reads another thread's row.
    Output_Image[(y*Image_Width)+x]=surround[tid][4];

}

/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
// 3x3 median filter in which every thread keeps its window in a private local
// array; no shared memory is used.
//
// Launch layout: 2D grid of 2D blocks covering the image.
// Pixels on the image border, where the window does not fit, are copied from
// Input_Image unchanged so that every pixel of Output_Image is defined.
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    unsigned short surround[9];

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;

    // --- Threads of partial edge blocks that fall outside the image do nothing
    if ((x >= Image_Width) || (y >= Image_Height)) return;

    // --- Border pixels: pass the input through
    if ((x == 0) || (y == 0) || (x == Image_Width - 1) || (y == Image_Height - 1)) {
        Output_Image[(y*Image_Width)+x] = Input_Image[(y*Image_Width)+x];
        return;
    }

    // --- Fill array private to the threads
    int iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Partial selection sort: place the five smallest values to find the median
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[i];
        surround[i]=surround[minval];
        surround[minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[4];

}

/********/
/* MAIN */
/********/
// Signature shared by the three median-filter kernels timed below.
typedef void (*MedianKernel)(unsigned short *, unsigned short *, int, int);

// Launches one kernel with the given cache preference and returns the elapsed
// time in milliseconds, measured with CUDA events around the cache
// configuration call, the launch and the device synchronization.
static float timeKernel(MedianKernel kernel, cudaFuncCache cacheConfig, dim3 grid, dim3 block,
                        unsigned short *Input_Image, unsigned short *Output_Image,
                        int Image_Width, int Image_Height)
{
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));

    gpuErrchk(cudaFuncSetCacheConfig(kernel, cacheConfig));
    kernel<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());       // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());     // errors raised while the kernel ran

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    return elapsed_ms;
}

// Reads a raw 1580 x 1050 image of 16-bit pixels, runs and times the three
// kernel variants on it, and writes the output of the last one to disk.
// Returns 0 on success and 1 when the input file is missing or too small.
int main()
{
    const int Image_Width = 1580;
    const int Image_Height = 1050;

    // --- Open data file
    ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
    if (!is.is_open()) {
        fprintf(stderr, "Unable to open the input image file\n");
        return 1;
    }

    // --- Get file length (in bytes)
    is.seekg(0, ios::end);
    const int dataLength = static_cast<int>(is.tellg());
    is.seekg(0, ios::beg);

    // --- The kernels address Image_Width x Image_Height pixels, so the file must hold at least that many
    const int requiredBytes = Image_Width * Image_Height * static_cast<int>(sizeof(unsigned short));
    if (dataLength < requiredBytes) {
        fprintf(stderr, "Input image file is smaller than the expected image size\n");
        return 1;
    }

    // --- Read data from file and close file (rounded up so an odd byte count still fits)
    unsigned short* Input_Image_Host = new unsigned short[(dataLength + 1) / sizeof(unsigned short)];
    is.read((char*)Input_Image_Host,dataLength);
    is.close();

    // --- CUDA warm up
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength));
    gpuErrchk(cudaFree(forFirstCudaMalloc));

    // --- Allocate host and device memory spaces (dataLength is already a byte count)
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
    if (Output_Image_Host == NULL) {
        fprintf(stderr, "Unable to allocate the host output buffer\n");
        delete[] Input_Image_Host;
        return 1;
    }
    unsigned short *Input_Image; gpuErrchk(cudaMalloc((void**)&Input_Image, dataLength));
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength));

    // --- Copy data from host to device
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));

    // --- Grid and block sizes
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);

    /****************************/
    /* ORIGINAL KERNEL FUNCTION */
    /****************************/
    float elapsed_ms = timeKernel(Original_Kernel_Function, cudaFuncCachePreferShared, grid, block,
                                  Input_Image, Output_Image, Image_Width, Image_Height);
    printf("Original kernel function - elapsed time:  %3.3f ms \n", elapsed_ms);

    /***********************************************/
    /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
    /***********************************************/
    elapsed_ms = timeKernel(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1, grid, block,
                            Input_Image, Output_Image, Image_Width, Image_Height);
    printf("Original kernel function - no shared - elapsed time:  %3.3f ms \n", elapsed_ms);

    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/
    elapsed_ms = timeKernel(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared, grid, block,
                            Input_Image, Output_Image, Image_Width, Image_Height);
    printf("Optimized kernel function - shared - elapsed time:  %3.3f ms \n", elapsed_ms);

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));

    // --- Open results file, write results and close the file
    ofstream of2;     of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Output_Image_Host, dataLength);
    of2.close();

    cout << "\n Press Any Key To Exit..!!";

    // --- Release device and host memory with the deallocator matching each allocation
    gpuErrchk(cudaFree(Input_Image));
    gpuErrchk(cudaFree(Output_Image));
    delete[] Input_Image_Host;      // allocated with new[]
    free(Output_Image_Host);        // allocated with malloc

    return 0;
}

以下是在 Kepler K20c 上的计时结果：

1580 x 1050
Original_Kernel_Function             = 1.588ms
Original_Kernel_Function_no_shared   = 1.278ms
Optimized_Kernel_Function_shared     = 1.455ms

2048 x 2048
Original_Kernel_Function             = 3.94ms
Original_Kernel_Function_no_shared   = 3.118ms
Optimized_Kernel_Function_shared     = 3.709ms

4096 x 4096
Original_Kernel_Function             = 16.003ms
Original_Kernel_Function_no_shared   = 13.735ms
Optimized_Kernel_Function_shared     = 14.526ms

8192 x 8192
Original_Kernel_Function             = 62.278ms
Original_Kernel_Function_no_shared   = 47.484ms
Optimized_Kernel_Function_shared     = 57.474ms

这是GT540M上的计时结果，它与您的卡更相似：

1580 x 1050
Original_Kernel_Function             = 10.332 ms
Original_Kernel_Function_no_shared   =  9.294 ms
Optimized_Kernel_Function_shared     = 10.301 ms

2048 x 2048
Original_Kernel_Function             = 25.256 ms
Original_Kernel_Function_no_shared   = 23.567 ms
Optimized_Kernel_Function_shared     = 23.876 ms

4096 x 4096
Original_Kernel_Function             = 99.791 ms
Original_Kernel_Function_no_shared   = 93.919 ms
Optimized_Kernel_Function_shared     = 95.464 ms

8192 x 8192
Original_Kernel_Function             = 399.259 ms
Original_Kernel_Function_no_shared   = 375.634 ms
Optimized_Kernel_Function_shared     = 383.121 ms

可以看出，在所有情况下，不使用共享内存的版本似乎都（略微）更快。

score 1 · Accepted Answer

似乎您在使用共享内存的线程之间没有共享任何内容，即对于 3x3 过滤器，您从全局内存中读取每个像素 9 次，这是没有必要的。本白皮书可能会提供一些关于如何在卷积核中使用共享内存的想法。希望它有所帮助。

http://docs.nvidia.com/cuda/samples/3_Imaging/convolutionSeparable/doc/convolutionSeparable.pdf

score 1 · Accepted Answer

快速选择中位数是最佳情况下最快的线性时间算法；但是，由于内存开销，很难在 CUDA 中实现。高度并行算法的最简单方法是最小化内存开销。代替像 Quickselect 这样的部分排序算法，使用 Torben 中值算法完全消除内存开销。Torben 算法可能比其他算法慢得多，但它不会修改输入数据。因此，无需分配共享内存。

最后，为了获得最大速度，可以将输入绑定到纹理，这还有一个额外的好处：由纹理硬件来处理边界扩展。为了最大限度地减少缓存未命中，对于行主序存储的数组，嵌套 for 循环应按行主序遍历（外层循环遍历行，内层循环遍历列）。

score 0 · Accepted Answer

两个提示：

更改双循环迭代的顺序：首先在 y（外循环）中迭代，然后在 x（内循环）中迭代。这是最重要的修复，因为它适用于您将实现的任何双循环。您想确保连续读取尽可能接近（在不同的设备上有不同的原因，例如，对于单线程 CPU 代码，它对缓存最重要，在 GPU 上，它对合并内存访问最重要，也许缓存）。由于您首先迭代行，因此您现在有 0 个合并，有效地为每个像素发送一个 2 字节（像素宽度）请求。请务必阅读有关此事的这个帖子，即使它只解释了 CPU 方面的事情。
确保您的读取是合并的。在您的示例中，即使修复了循环顺序，每次也只会读取 block_width * pixel_width，即 16 x 2 = 32 个连续字节。与一次读取 128 个字节相比，这需要更高的占用率才能隐藏延迟。您可以通过使用更宽的块来改进这一点（出于这个原因，通常更宽的块更好）。还要确保您的读取是对齐的。这与前一点有关，CUDA C 编程指南的相关章节对此有详细说明。

编辑：我把剩下的答案移到这里。

c++ - 2D CUDA 中值滤波器优化

4 回答 4

Related

Reference