using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int threadsPerBlock = 256;
const int N = 40000;
void generateArray(double *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() / ((rand() + rand()) / 2.0 + 1);
}
double maxCPU(double *arr, int count) {
int max = arr[0];
for (int i = 0; i < count; i++)
if (arr[i] > max)
max = arr[i];
return max;
}
__global__ void MaxGPU(double *a, int count, double *result){
__shared__ double cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
int temp = a[tid];
tid+= blockDim.x * gridDim.x;
while(tid < count){
if(a[tid] > temp)
temp = a[tid];
tid+= blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i!=0){
if(cacheIndex < i)
if(cache[cacheIndex + i] > cache[cacheIndex])
cache[cacheIndex] = cache[cacheIndex + i];
__syncthreads();
i/=2;
}
if(cacheIndex == 0)
result[blockIdx.x] = cache[0];
}
int main(void) {
double *arr = new double[N], resultGPU;
generateArray(arr, N);
double *devA, *dev_partial_result;
double resultCPU = maxCPU(arr, N);
cudaMalloc((void**)&devA, N * sizeof(double));
cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double));
cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice);
MaxGPU<<<1, 256>>>(devA, N, dev_partial_result);
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost);
cout << "Max CPU: " << resultCPU << endl;
cout << "Max GPU: " << resultGPU << endl;
cudaFree(devA);
cudaFree(dev_partial_result);
delete [] arr;
return 0;
}
我写了上面的代码。我不知道为什么,但它只适用于一个块。它不适用于 256 或 512 块。为什么?怎么了?