我写了几个内核函数,想知道处理这些函数需要多少毫秒。
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define N 8000
void fillArray(int *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() % 100;
}
__global__ void add(int* a, int *b) {
int add = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
add = a[tid] + b[tid];
}
}
__global__ void subtract(int* a, int *b) {
int subtract = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
subtract = a[tid] - b[tid];
}
}
__global__ void multiply(int* a, int *b) {
int multiply = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
multiply = a[tid] * b[tid];
}
}
__global__ void divide(int* a, int *b) {
int divide = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
divide = a[tid] / b[tid];
}
}
__global__ void modu(int* a, int *b) {
int modulus = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
modulus = a[tid] % b[tid];
}
}
__global__ void neg(int *data) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
data[tid] = -data[tid];
}
}
float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
neg<<<blocksPerGrid, threadsPerBlock>>>(devB);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return elapsedTime;
}
int main(void) {
int a[N], b[N];
float dur = 0;
int *devA, *devB;
cudaMalloc((void**) &devA, N * sizeof(int));
cudaMalloc((void**) &devB, N * sizeof(int));
fillArray(a, N);
fillArray(b, N);
cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(devA, b, N * sizeof(int), cudaMemcpyHostToDevice);
dur = duration(a, b, N, 1);
cout << "Global memory version:\n";
cout << "Process completed in " << dur;
cout << " for a data set of " << N << " integers.";
return 0;
}
毫秒总是返回零。为什么?我在这里缺少什么?如果 ai 从持续时间函数中删除否定函数。它返回 0.15687 毫秒。我认为处理这些功能的数量很少。那个程序有什么问题?
编辑后,我这样做了:
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int N = 8000;
void fillArray(int *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() % 100;
}
__global__ void add(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
__global__ void subtract(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] - b[tid];
}
}
__global__ void multiply(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] * b[tid];
}
}
__global__ void divide(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] / b[tid];
}
}
__global__ void modu(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] % b[tid];
}
}
__global__ void neg(int *data, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = -data[tid];
}
}
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
double hArrayC[N];
add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return elapsedTime;
}
int main(void) {
int a[N], b[N],c[N];
float dur = 0;
int *devA, *devB,*devC;
cudaMalloc((void**) &devA, N * sizeof(int));
cudaMalloc((void**) &devB, N * sizeof(int));
cudaMalloc((void**) &devC, N * sizeof(int));
fillArray(a, N);
fillArray(b, N);
cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(devC, c, N * sizeof(int), cudaMemcpyHostToDevice);
dur = duration(devA, devB, devC,N, 1);
cout << "Global memory version:\n";
cout << "Process completed in " << dur;
cout << " for a data set of " << N << " integers.";
cudaFree(devA);
cudaFree(devB);
return 0;
}