I'm trying to implement concurrent kernel launches for a very complex CUDA kernel, so I thought I'd start with a simple example. It just launches one kernel that does a sum reduction. Simple enough. Here it is:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cuda.h>

extern __shared__ char dsmem[];

__device__ double *scratch_space;

/* Tree reduction that first stages the input in shared-memory scratch space. */
__device__ double NDreduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;
    double *b;

    b = scratch_space;

    /* Copy the input into shared memory. */
    for (int i = tid; i < length; i += blockDim.x)
        b[i] = a[i];

    __syncthreads();

    /* Pairwise reduction: halve the active range until one element remains. */
    do {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            b[tid] += b[tid + k];

        length = k;
        __syncthreads();
    } while (k > 1);

    return b[0];
}
/* Same pairwise reduction, but performed in place on the input array. */
__device__ double reduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;

    do {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            a[tid] += a[tid + k];

        length = k;
        __syncthreads();
    } while (k > 1);

    return a[0];
}
__global__ void kernel_thing(double *ad, int size)
{
    double sum_1, sum_2, sum_3;
    time_t begin, end, t1, t2, t3;

    scratch_space = (double *) &dsmem[0];

    /* Repeat the shared-memory reduction many times so the kernel runs long
       enough to time. */
    for (int j = 0; j < 1000000; j++) {
        begin = clock();
        sum_1 = NDreduceSum(ad, size);
        end = clock();
    }

    __syncthreads();

    t1 = end - begin;

    /* Serial reference sum, computed by thread 0 only. */
    begin = clock();
    sum_2 = 0;
    if (threadIdx.x == 0) {
        for (int i = 0; i < size; i++) {
            sum_2 += ad[i];
        }
    }
    __syncthreads();
    end = clock();

    t2 = end - begin;

    __syncthreads();

    /* In-place reduction (modifies ad). */
    begin = clock();
    sum_3 = reduceSum(ad, size);
    end = clock();

    __syncthreads();

    t3 = end - begin;

    if (threadIdx.x == 0) {
        printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n",
               sum_1, sum_2, sum_3, t1, t2, t3);
    }
}
int main(int argc, char **argv)
{
    int i;
    const int size = 512;
    double *a, *ad, *b, *bd;
    double sum_a, sum_b;
    cudaStream_t stream_a, stream_b;
    cudaError_t result;
    cudaEvent_t a_start, a_stop, b_start, b_stop;

    a = (double *) malloc(sizeof(double) * size);
    b = (double *) malloc(sizeof(double) * size);

    srand48(time(0));

    for (i = 0; i < size; i++) {
        a[i] = drand48();
    }

    for (i = 0; i < size; i++) {
        b[i] = drand48();
    }

    sum_a = 0;
    for (i = 0; i < size; i++) {
        sum_a += a[i];
    }

    sum_b = 0;
    for (i = 0; i < size; i++) {
        sum_b += b[i];
    }

    printf("Looking for sum_a %lf\n", sum_a);
    printf("Looking for sum_b %lf\n", sum_b);

    cudaEventCreate(&a_start);
    cudaEventCreate(&b_start);
    cudaEventCreate(&a_stop);
    cudaEventCreate(&b_stop);

    cudaMalloc((void **) &ad, sizeof(double) * size);
    cudaMalloc((void **) &bd, sizeof(double) * size);

    result = cudaStreamCreate(&stream_a);
    result = cudaStreamCreate(&stream_b);

    result = cudaMemcpyAsync(ad, a, sizeof(double) * size, cudaMemcpyHostToDevice, stream_a);
    result = cudaMemcpyAsync(bd, b, sizeof(double) * size, cudaMemcpyHostToDevice, stream_b);

    cudaEventRecord(a_start);
    kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
    cudaEventRecord(a_stop);

    cudaEventRecord(b_start);
    kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
    cudaEventRecord(b_stop);

    result = cudaMemcpyAsync(a, ad, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_a);
    result = cudaMemcpyAsync(b, bd, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_b);

    cudaEventSynchronize(a_stop);
    cudaEventSynchronize(b_stop);

    float a_ms = 0;
    float b_ms = 0;
    cudaEventElapsedTime(&a_ms, a_start, a_stop);
    cudaEventElapsedTime(&b_ms, b_start, b_stop);

    printf("%lf ms for A.\n", a_ms);
    printf("%lf ms for B.\n", b_ms);

    result = cudaStreamDestroy(stream_a);
    result = cudaStreamDestroy(stream_b);

    if (result != cudaSuccess) {
        printf("I should probably do this after each important operation.\n");
    }

    /*
    printf("Matrix after:\n");
    for (i = 0; i < size; i++) {
        printf("%lf ", a[i]);
    }
    printf("\n");
    */

    free(a);
    free(b);

    cudaFree(ad);
    cudaFree(bd);

    return 0;
}
It is compiled like so:
CFLAGS = -arch sm_35
CC = nvcc

all: parallel

parallel: parallel.cu
	$(LINK.c) $^ -o $@

clean:
	rm -f *.o core parallel
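With CC and CFLAGS set like that, make's built-in LINK.c rule should expand to roughly the following, if I'm reading the implicit rules correctly:

nvcc -arch sm_35 parallel.cu -o parallel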
I'm using a single Tesla K20X.
When I run this simple example, I get the following output:
Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967
So, as you can see, each kernel gets the correct result and takes around 4.5 s, which is what I got in the earlier, single-kernel version. Great! However, as you can see from the aprun output, the wall-clock time is actually around 10 s, which is much more than the single-kernel version took. So, it looks like the kernels are either not being launched in parallel, or I'm getting hardly any of the speedup (2x) I was expecting from concurrent kernel launches.
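For reference, a minimal standalone check of whether the device reports concurrent-kernel support at all would look something like this (separate from the program above; it just prints the relevant cudaDeviceProp fields):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    cudaDeviceProp prop;

    /* Query device 0 (the only GPU here) and print the concurrency-related fields. */
    cudaGetDeviceProperties(&prop, 0);
    printf("Device: %s\n", prop.name);
    printf("concurrentKernels: %d\n", prop.concurrentKernels);  /* non-zero if kernels can overlap */
    printf("asyncEngineCount:  %d\n", prop.asyncEngineCount);   /* copy engines for copy/kernel overlap */

    return 0;
}

If concurrentKernels comes back as 1, then presumably the serialization is coming from how I'm launching things rather than from the hardware.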
tl;dr questions:
- Am I missing anything in my example code? Are the kernels actually being launched in parallel?
- What kind of speedup should I expect on a Tesla K20X? Shouldn't the kernels run exactly in parallel, completing twice the work in the same amount of time? How many kernels can I expect to run efficiently in parallel?
Thanks for your help.