当我在 cuda 上计算时,这是真的,但是当我使用更大的数字进行计算时,我必须将 TdrLevel 设置为
这个链接
。但是在设置 tdrlevel 之后,我得到了一个错误的结果。(-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080-431602080- 431602080-431602080-431602080-431602080-431602080-431602080-4316080-431602080-431602080-431602080-431602080-431602080-431602080208020802080208021616020802002020802161602161616161616161616020802020802020202020200202161602020200202602026020202026020260转
我不知道哪里有问题。我的块数是 512 ,每个块的线程数是 1024 。我希望找到我的意思。
that's my program code for fibunatchi program it work without tdrlevel but have above result by tdrlevel :
#include <stdio.h>
#include <cuda.h>
#include <dos.h>
__global__ void fibunat_array(float *a,int N )
{
for (int x=0; x< N; x += 1)
{
a[x]=0;
}
a[0]=1;a[1]=1;
for (int i=0; i< N; i += 1)
{
a[i+2]=a[i]+a[i+1];
}
}
int main( void )
{
time_t start,end;
double dif;
time ( &start );
float *a_h,*a_d;
const int N = 100;
size_t size = N * sizeof( float );
a_h = (float *)malloc( size );
cudaMalloc( (void **)&a_d, size );
cudaMemcpy( a_d, a_h, size, cudaMemcpyHostToDevice );
int block_size = 9<<1;
int n_blocks = (N+ block_size-1) /block_size;
square_array <<< n_blocks , block_size >>> ( a_d, N );
cudaMemcpy( a_h, a_d, sizeof( float ) * N, cudaMemcpyDeviceToHost );
for (int i = 0; i<N/3+10 ; i++)
printf( "%d ",(int)a_h[i] );
free( a_h );
cudaFree( a_d );
time ( &end );
dif=difftime(end,start);
printf ( "\n\n");
printf ( "total time for this calculate is : %d second\n\n",(int)dif);
}