我想将两个数组相乘,但 Cuda RunTime API 有问题。出现错误:cuda 运行时 API 错误 11:无效参数。论据是什么?请帮帮我 代码:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <fstream>
// includes, project
#include <cufft.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <shrQATest.h>
#define SIGNAL_SIZE 8192
#define SIGNAL_COUNT 384
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __device__ __host__ inline Complex ComplexAdd(Complex );
static __global__ void ComplexPointwiseMA(Complex* , Complex* , Complex* );
// Complex addition
static __device__ __host__ inline Complex ComplexAdd(Complex a,Complex b)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex Mul 'n Add function
static __global__ void ComplexPointwiseMA(Complex* mas1, Complex* mas2, Complex*mas_tmp)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
int j=0, k=0;
Complex c;
int size=SIGNAL_COUNT*SIGNAL_SIZE;
for (int i = threadID; i <size; i += numThreads)
{
if((i%SIGNAL_SIZE==0)&&(j<SIGNAL_COUNT))
j++;
c=ComplexMul(mas1[i], mas2[j]);
mas_tmp[k]=ComplexAdd(mas_tmp[k],c);
if(k>=SIGNAL_SIZE-1)
k=0;
else
k++;
}
}
int main(int argc, char** argv)
{
int threads_per_block=256;
int blocks_per_grid=16;
//dim3 dimBlock(SIGNAL_COUNT, SIGNAL_SIZE); // threads
//dim3 dimGrid(SIGNAL_COUNT); // 384 blocks in a grid
cudaEvent_t start, stop;
float elapsedTime;
//shrQAStart(argc, argv);
cudaEventCreate(&start);
cudaEventCreate(&stop);
// For random
srand(1001);
// Allocate host memory for the mass
Complex* mas_tmp=NULL;
mas_tmp =new Complex [SIGNAL_COUNT*SIGNAL_SIZE];
Complex* mas1=NULL;
mas1 =new Complex [SIGNAL_COUNT*SIGNAL_SIZE];
for(unsigned int i = 0; i <SIGNAL_COUNT*SIGNAL_SIZE; ++i)
{
mas1[i].x = rand() / (float)RAND_MAX;
mas1[i].y = rand() / (float)RAND_MAX;
}
Complex* mas2=NULL;
mas2 =new Complex [SIGNAL_COUNT];
for(unsigned int i = 0; i < SIGNAL_COUNT; ++i)
{
mas2[i].x = rand() / (float)RAND_MAX;
mas2[i].y = rand() / (float)RAND_MAX;
}
// Timer
cudaEventRecord(start,0);
// Allocate device memory for mass
Complex* mastmp_=NULL; // temporary
cutilSafeCall(cudaMalloc(&mastmp_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex)));
Complex* mas1_=NULL; // signal
cutilSafeCall(cudaMalloc(&mas1_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex)));
Complex* mas2_=NULL; // coefficient
cutilSafeCall(cudaMalloc(&mas2_, SIGNAL_COUNT*sizeof(Complex)));
// Copy host data to device
cutilSafeCall(cudaMemcpy(mas1_, mas1, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(mas2_, mas2, SIGNAL_COUNT*sizeof(Complex), cudaMemcpyHostToDevice));
// Calling Mul 'n Add function
ComplexPointwiseMA<<<blocks_per_grid, threads_per_block>>>(mas1_, mas2_, mastmp_);
// Check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed [ ComplexPointwiseMA ]");
// Copy device memory to host
cutilSafeCall(cudaMemcpy(mastmp_, mas_tmp, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start,stop);
printf("Time %3.10f ms\n", elapsedTime);
// Free memory
free(mas1);
free(mas2);
free(mas_tmp);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cutilSafeCall(cudaFree(mas1_));
cutilSafeCall(cudaFree(mas2_));
cutilSafeCall(cudaFree(mastmp_));
cutilDeviceReset();
system ("pause");
return 0;
}