我想知道是否有人可以通过内核中的 new 运算符阐明这种行为。以下是代码
#include <stdio.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
using namespace std;
__global__ void test()
{
cuComplex *store;
store= new cuComplex[30000];
if (store==NULL) printf("Unable to allocate %i\n",blockIdx.y);
delete store;
if (threadIdx.x==10000) store->x=0.0;
}
int main(int argc, char *argv[])
{
float timestamp;
cudaEvent_t event_start,event_stop;
// Initialise
cudaEventCreate(&event_start);
cudaEventCreate(&event_stop);
cudaEventRecord(event_start, 0);
dim3 threadsPerBlock;
dim3 blocks;
threadsPerBlock.x=1;
threadsPerBlock.y=1;
threadsPerBlock.z=1;
blocks.x=1;
blocks.y=500;
blocks.z=1;
cudaEventRecord(event_start);
test<<<blocks,threadsPerBlock,0>>>();
cudaEventRecord(event_stop, 0);
cudaEventSynchronize(event_stop);
cudaEventElapsedTime(×tamp, event_start, event_stop);
printf("test took %fms \n", timestamp);
}
在 GTX680 Cuda 5 上运行此程序并调查输出会注意到未分配随机内存 :( 我在想可能是因为所有全局内存都已完成但我有 2GB 内存并且因为活动块的最大数量是16 使用这种方法分配的内存量最大应该是 16*30000*8=38.4x10e6.. 即大约 38Mb。那我还应该考虑什么?