根据我对 CUDA C 的理解,每个线程都会执行等式的一个实例。但是我如何打印出所有的值。该代码确实有效,但确实需要有人帮我检查它,以确认我的结果实际上与我打算设计的内容一致。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <conio.h>
#include <cuda.h>
#include <cutil.h>
__global__ void my_compute(float *y_d,float *theta_d,float *u_d)
{
int idx=threadIdx.x+blockIdx.x*gridDim.x;
for (idx=7;idx<1000;idx++)
{
y_d[idx]=theta_d[0]*y_d[idx-1]+theta_d[1]*y_d[idx-3]+
theta_d[2]*u_d[idx-5]*u_d[idx-4]+theta_d[3]+
theta_d[4]*u_d[idx-6]+theta_d[5]*u_d[idx-4]*y_d[idx-6]+
theta_d[6]*u_d[idx-7]+theta_d[7]*u_d[idx-7]*u_d[idx-6]+
theta_d[8]*y_d[idx-4]+theta_d[9]*y_d[idx-5]+
theta_d[10]*u_d[idx-4]*y_d[idx-5]+theta_d[11]*u_d[idx-4]*y_d[idx-2]+
theta_d[12]*u_d[idx-7]*u_d[idx-3]+theta_d[13]*u_d[idx-5]+
theta_d[14]*u_d[idx-4];
}
}
int main(void)
{
float y[1000];
FILE *fpoo;
FILE *u;
float theta[15];
float u_data[1000];
float *y_d;
float *theta_d;
float *u_d;
cudaEvent_t start,stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// memory allocation
cudaMalloc((void**)&y_d,1000*sizeof(float));
cudaMalloc((void**)&theta_d,15*sizeof(float));
cudaMalloc((void**)&u_d,1000*sizeof(float));
cudaEventRecord( start, 0 );
// importing data for theta and input of model//
fpoo= fopen("c:\\Fly_theta.txt","r");
u= fopen("c:\\Fly_u.txt","r");
for (int k=0;k<15;k++)
{
fscanf(fpoo,"%f\n",&theta[k]);
}
for (int k=0;k<1000;k++)
{
fscanf(u,"%f\n",&u_data[k]);
}
//NB: pls does this for loop below make my equation run 1000000
// instances as oppose to the 1000 instances i desire?
for (int i=0;i<1000;i++)
{
//i initialised the first 7 values of y because the equation output
//starts form y(8)
for (int k=0;k<8;k++)
{
y[k]=0;
cudaMemcpy(y_d,y,1000*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(theta_d,theta,15*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(u_d,u_data,1000*sizeof(float),cudaMemcpyHostToDevice);
//calling kernel function//
my_compute<<<200,5>>>(y_d,theta_d,u_d);
cudaMemcpy(y,y_d,1000*sizeof(float),cudaMemcpyDeviceToHost);
}
printf("\n\n*******Iteration %i*******\n", i);
//does this actually print all the values from the threads?
for(int i=0;i<1000;i++)
{
printf("%f",y[i]);
}
}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
printf("Time to generate: %3.1f ms \n", time);
cudaFree(y_d);
cudaFree(theta_d);
cudaFree(u_d);
fclose(u);
fclose(fpoo);
//fclose();
_getche();
return (0);
}