我刚开始学习 CUDA,被一个基本概念难住了:网格(grid)。我读到过,网格只是线程块(block)的逻辑集合,但我无法在脑海中形成清晰的图景。我对线程和线程块已有清晰的认识,也了解它们与物理 GPU 的对应关系:按我的理解,线程块被分配到核心上,线程被分配到流处理器上。那么,网格在这幅图景中处于什么位置?
如果能给出一些类比就更好了,这会让我更容易理解。
附:我正在通过 Udacity 的课程学习。
#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>
/**
 * Converts an RGBA image to greyscale, one thread per pixel.
 *
 * Launch layout: a 2D grid of 2D blocks. The thread with global coordinates
 * (x, y) handles the pixel in column x, row y. The launch must provide at
 * least numCols threads in x and numRows threads in y; surplus threads past
 * the image edge exit without touching memory.
 *
 * Pixels are addressed row-major: pixel (x, y) is element y * numCols + x of
 * both rgbaImage and greyImage.
 *
 * @param rgbaImage  input pixels (numRows * numCols elements); expected to be
 *                   device-accessible memory
 * @param greyImage  output intensities (numRows * numCols elements); expected
 *                   to be device-accessible memory
 * @param numRows    image height in pixels
 * @param numCols    image width in pixels
 */
__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
                                  unsigned char* const greyImage,
                                  int numRows, int numCols)
{
    // Global 2D pixel coordinates of this thread.
    const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // The grid rarely divides the image evenly; drop threads outside it.
    if (x >= numCols || y >= numRows)
    {
        return;
    }

    // Row-major linear offset, shared by the input and output buffers.
    // Widened to size_t so large images cannot overflow the multiplication.
    const size_t i = static_cast<size_t>(y) * static_cast<size_t>(numCols)
                   + static_cast<size_t>(x);

    const uchar4 rgba = rgbaImage[i];

    // Weighted channel sum; the alpha channel (w) is ignored.
    const float grey = (0.299f * rgba.x) + (0.587f * rgba.y) + (0.114f * rgba.z);

    // Float -> unsigned char conversion truncates toward zero.
    greyImage[i] = static_cast<unsigned char>(grey);
}
/**
 * Launches rgba_to_greyscale over a numRows x numCols image and waits for it
 * to finish.
 *
 * @param h_rgbaImage  not used by this function; kept so the signature stays
 *                     unchanged for existing callers
 * @param d_rgbaImage  input pixels, passed straight to the kernel
 * @param d_greyImage  output buffer, passed straight to the kernel
 * @param numRows      image height in pixels
 * @param numCols      image width in pixels
 *
 * Launch and execution errors are reported through checkCudaErrors.
 */
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
    (void)h_rgbaImage;

    // Nothing to convert. A grid with a zero dimension would also be rejected
    // as an invalid launch configuration.
    if (numRows == 0 || numCols == 0)
    {
        return;
    }

    // 16 x 16 = 256 threads per block, a multiple of the 32-thread warp size.
    const unsigned int kBlockDim = 16;
    const dim3 blockSize(kBlockDim, kBlockDim, 1);

    // The grid is measured in BLOCKS, not threads: ceil-divide each image
    // dimension by the block edge so the blocks just cover the image.
    const unsigned int gridSizeX =
        static_cast<unsigned int>((numCols + kBlockDim - 1) / kBlockDim);
    const unsigned int gridSizeY =
        static_cast<unsigned int>((numRows + kBlockDim - 1) / kBlockDim);
    const dim3 gridSize(gridSizeX, gridSizeY, 1);

    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage,
                                               static_cast<int>(numRows),
                                               static_cast<int>(numCols));

    // Launch-configuration errors surface through cudaGetLastError();
    // faults during execution surface at the synchronizing call.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}