我在 Visual Studio 2010 Professional 环境中的 Windows 7 64 位上运行 CUDA 4.2
首先,我运行以下代码:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid==0)
d_bStopPtr[tid]=0;
else if(tid<count)
{
d_bPtr[tid]=tid;
// only if the arrary cell before it is 0, then change it to 0 too
if (d_bStopPtr[tid-1]==0 )
d_bStopPtr[tid]=0;
}
}
int main()
{
int count=100000;
// define the vectors
thrust::host_vector <int> h_a(count);
thrust::device_vector <int> d_b(count,0);
int* d_bPtr=thrust::raw_pointer_cast(&d_b[0]);
thrust::device_vector <int> d_bStop(count,1);
int* d_bStopPtr=thrust::raw_pointer_cast(&d_bStop[0]);
// get the device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int threadsPerBlock = prop.maxThreadsDim[0];
int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
//copy device to host
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//run the kernel
while(d_bStop[count-1])
{
dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr,count,d_bStopPtr);
}
//copy device back to host again
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//wait to see the console output
int x;
cin>>x;
return 0;
}
但是,每次我都需要检查while条件,但速度很慢。所以我正在考虑检查内核中这个设备向量的条件,并像这样更改代码:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid==0)
d_bStopPtr[tid]=0;
else if(tid<count)
{
// if the last cell of the arrary is still not 0 yet, repeat
while(d_bStopPtr[count-1])
{
d_bPtr[tid]=tid;
// only if the arrary cell before it is 0, then change it to 0 too
if (d_bStopPtr[tid-1]==0 )
d_bStopPtr[tid]=0;
}
}
}
int main()
{
int count=100000;
// define the vectors
thrust::host_vector <int> h_a(count);
thrust::device_vector <int> d_b(count,0);
int* d_bPtr=thrust::raw_pointer_cast(&d_b[0]);
thrust::device_vector <int> d_bStop(count,1);
int* d_bStopPtr=thrust::raw_pointer_cast(&d_bStop[0]);
// get the device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int threadsPerBlock = prop.maxThreadsDim[0];
int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
//copy device to host
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//run the kernel
dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr,count,d_bStopPtr);
//copy device back to host again
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//wait to see the console output
int x;
cin>>x;
return 0;
}
但是,第二个版本总是会导致显卡和电脑死机。你能帮我加快第一个版本的速度吗?如何检查内核内部的条件然后跳出并停止内核?