我可能正在做一些非常愚蠢的事情,但我似乎无法使这种减少工作(可能已经有一个库可以做到这一点,但这是为了自学,所以请多多包涵)。我试图通过采用中位数方法来找到整数条目数组的中位数,我在下面进行了编码:
__global__ void gpuMedOdd(int *entries, int *med) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = entries[i];
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
我将此内核函数调用为:
gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);
然后我将 d_med 中的值复制到 med 中并打印出该值。不幸的是,无论输入如何,该值始终为 0。我究竟做错了什么?
编辑:我忘了提,swapGpu(a, b) 定义如下:
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
Edit2:如下所示,这是完整的代码。
#include <iostream>
#include <fstream>
#include <cstdlib>
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
cudaError_t err = cudaGetLastError();
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
int cpuMin(int *entries, int numEntries) {
int minVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] < minVal)
minVal = entries[i];
return minVal;
}
int cpuMax(int *entries, int numEntries) {
int maxVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] > maxVal)
maxVal = entries[i];
return maxVal;
}
inline void swap(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;
if(i + 2 * blockDim.x < numEntries) {
int list[3];
list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s && tid + 2 * s < blockDim.x) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
__global__ void gpuMin(int *entries, int *min, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*min = sdata[0];
}
__global__ void gpuMax(int *entries, int *max, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*max = sdata[0];
}
int partition(int *entries, int left, int right, int pivotIdx) {
int i, storeIdx = left, pivot = entries[pivotIdx];
swap(entries[pivotIdx], entries[right]);
for(i = left; i < right; i++)
if(entries[i] < pivot) {
swap(entries[i], entries[storeIdx]);
storeIdx++;
}
return storeIdx;
}
int cpuSelect(int *entries, int left, int right, int k) {
if(left == right)
return entries[left];
int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;
pivotNewIdx = partition(entries, left, right, pivotIdx);
pivotDist = pivotNewIdx - left + 1;
if(pivotDist == k)
return entries[pivotNewIdx];
else if(k < pivotDist)
return cpuSelect(entries, left, pivotNewIdx - 1, k);
else
return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}
int main(int argc, char *argv[]) {
if(argc != 3) {
std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
exit(1);
}
std::ifstream inp(argv[1]);
if(!inp.is_open()) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Could not find file " << argv[1] << std::endl;
exit(2);
}
int numEntries = atoi(argv[2]), i = 0;
int *entries = new int[numEntries];
while(inp >> entries[i] && i < numEntries)
i++;
if(i < numEntries) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
exit(2);
}
if(inp >> i) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
exit(2);
}
int min, max;
int *d_entries, *d_min, *d_max;
checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));
gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
getLastCudaError("kernel launch failure");
gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
if(numEntries % 2) {
int med, *d_med;
checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The median value is: " << med << std::endl;
}
else {
int *d_med;
cudaMalloc(&d_med, sizeof(int));
gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
}
min = cpuMin(entries, numEntries);
max = cpuMax(entries, numEntries);
if(numEntries % 2) {
int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
std::cout << "The median value is: " << median << std::endl;
}
else {
int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);
float median = 0.5 * (med1 + med2);
std::cout << "The median value is: " << median << std::endl;
}
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
exit(0);
}