我对以下程序(下面的代码)感到困惑。当内核中的两行定义specsin
和时,它工作正常并给出正确的结果speccos
(注意第二项,即sin(t)
):
specsin+=sin(pi*t/my_tau)*sin(t)*sin(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
speccos+=sin(pi*t/my_tau)*sin(t)*cos(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
一旦我将第二sin(t)
项更改为sin(t+0.0*my_a0*my_a0)
,这不应该改变结果,我得到全零而不是正确答案。
难道是我的内核内存用完了?
#include <stdio.h>
__global__ void Calculate_Spectrum(float * d_Detector_Data, int numCols, int numRows,
const float omega_min, float dOmega,
const float a0_min, float da0,
const float tau_min, float dtau, float dt)
{
int Global_x = blockIdx.x * blockDim.x + threadIdx.x;
int Global_y = blockIdx.y * blockDim.y + threadIdx.y;
int Position1D = Global_y * numCols + Global_x;
float my_omega=omega_min + Global_x * dOmega;
float my_a0=a0_min + Global_y*da0;
float my_tau=tau_min;
int total_time_steps=int(my_tau/dt);
float specsin=0.0;
float speccos=0.0;
float t=0.0;
float pi=3.14159265359;
for(int n=0; n<total_time_steps; n++)
{
t=n*dt;
specsin+=sin(pi*t/my_tau)*sin(t+0.0*my_a0*my_a0)*sin(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
speccos+=sin(pi*t/my_tau)*sin(t+0.0*my_a0*my_a0)*cos(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
}
d_Detector_Data[Position1D]=(specsin*specsin+speccos*speccos)*dt*dt*my_a0*my_a0*my_omega*my_omega/4.0/pi/pi;
}
int main(int argc, char ** argv)
{
const int omega_bins = 1024;
const int a0_bins = 512;
const int tau_bins = 1;
const float omega_min = 0.5;
const float omega_max = 1.1;
const float a0_min = 0.05;
const float a0_max = 1.0;
const float tau_min = 1200;
const float tau_max = 600;
const int steps_per_period=20; // for integrating
float dt=1.0/steps_per_period;
int TotalSize = omega_bins * a0_bins * tau_bins;
float dOmega=(omega_max-omega_min)/(omega_bins-1);
float da0=(a0_max-a0_min)/(a0_bins-1);
float dtau=0.;
float * d_Detector_Data;
int * d_Global_x;
int * d_Global_y;
float h_Detector_Data[TotalSize];
// allocate GPU memory
cudaMalloc((void **) &d_Detector_Data, TotalSize*sizeof(float));
Calculate_Spectrum<<<dim3(1,a0_bins,1), dim3(omega_bins,1,1)>>>(d_Detector_Data, omega_bins, a0_bins, omega_min, dOmega, a0_min, da0, tau_min, dtau, dt);
cudaMemcpy(h_Detector_Data, d_Detector_Data, TotalSize*sizeof(float), cudaMemcpyDeviceToHost);
FILE * SaveFile;
char TempStr[255];
sprintf(TempStr, "result.dat");
SaveFile = fopen(TempStr, "w");
int counter=0;
for(int j=0; j<a0_bins;j++)
{
for(int i=0; i<omega_bins; i++)
{
fprintf(SaveFile,"%e\t", h_Detector_Data[counter]);
counter++;
}
fprintf(SaveFile, "\n");
}
fclose(SaveFile);
// free GPU memory
return 0;
}