我对生成直方图的简单 CUDA 代码有疑问:
__#include <math.h>
#include <numeric>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {
__shared__ int temp[BLOCK_SIZE + 1];
int thread_id, thread_value;
thread_id = threadIdx.x + blockIdx.x * blockDim.x;
if (thread_id >= size) {
return;
}
temp[threadIdx.x + 1] = 0;
__syncthreads();
thread_value = dev_values_arr[thread_id];
atomicAdd(&temp[thread_value], 1);
__syncthreads();
atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}
int* histogram_cuda(int* values_arr, int size) {
int num_blocks = size / BLOCK_SIZE;
int* dev_histogram = 0;
int* dev_values_arr = 0;
int* histogram = (int*)malloc((BLOCK_SIZE + 1) * sizeof(int));
cudaError_t cudaStatus;
if (size % BLOCK_SIZE != 0) {
num_blocks = num_blocks + 1;
}
// allocate histogram and values_arr device memories
cudaStatus = cudaMalloc((void**)&dev_histogram,
(BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudaMemset(dev_histogram, 0, (BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemset() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudaMalloc((void**)&dev_values_arr, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
// copy values_arr memory in host to device
cudaStatus = cudaMemcpy(dev_values_arr, values_arr, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
printf("the number of blocks is %d\n\n", num_blocks);
// calculate histogram on the gpu
kernel_histogram << <num_blocks, BLOCK_SIZE >> > (dev_histogram, dev_values_arr,
size);
// copy histogram memory in device to host
cudaStatus = cudaMemcpy(histogram, dev_histogram,
(BLOCK_SIZE + 1) * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
// free device memory
cudaFree(dev_histogram);
cudaFree(dev_values_arr);
return histogram;
}
int main(int argc, char* argv[]) {
unsigned int size = 21;
int* histogram;
int values_arr[] = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 };
histogram = histogram_cuda(values_arr, size);
for (int i = 1; i < BLOCK_SIZE + 1; i++) {
if (histogram[i] > 0) {
printf("%d : %d\n", i, histogram[i]);
}
}
}
直方图用于记录输入中存在的值的数量,允许的值为 1 到 256。每个块最多有 256 个线程。我试图限制跨块的总线程数,以便每个线程记录直方图中一个值的出现。
如果我使用“values_arr = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 }”这意味着大小是 21,我得到:
2:7 4:1 5:4 7:4 19:1 20:1 21:1
我正在尝试使每个值都由一个线程记录并处理所有无用的线程。此外,您发现的任何其他问题以及以最佳方式解决此问题的任何建议将不胜感激。谢谢!