c++ - inexplicable cuda behavior related to memory

Question

so basically i took my c++ code (which is working correctly) and rewrite it to cuda (i have no experience with cuda). The one part of the code (solve() method) is not working correctly and i really dont know why.

So my question is what exactly means "unspecified launch failure" error during cudaMemcpy and why is it happening in my code.

My second question is why variables backup_ans and ans differs when they compute the same thing?

#include "stdio.h"
#include <algorithm>

__device__ unsigned int primes[1024];
__device__ long long n = 1ll<<32; // #unsigned_integers




__device__ int hashh(long long x) {
      return (x>>1)%1024;
}

// compute (x^e)%n
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
    unsigned long long ans = 1;
    while(e>0) {
        if(e&1) ans = (ans*x)%n;
        x = (x*x)%n;
        e>>=1;
    }
    return ans;
}

// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
  int d=0;
  unsigned long long t = n-1;
  while(t%2==0) {
      ++d;
      t>>=1;
  }
  unsigned long long x = mulmod(a,t,n);
  if(x==1) return 1; 
  for(int i=0;i<d;++i) {
      if(x==n-1) return 1;
      x=(x*x)%n;
  }
  return 0;
}
__device__ int prime(long long x) {
        return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}

// copy all unsigned COMPOSITE ingeters which are not congruent to zero modulo 2,3,5,7 and their hashh value = 0; 
// count of those elements store in c
// 335545 is just magic constant to distribute all integers equally on all 400*32 threads
__global__ void find(unsigned int *out,unsigned int *c) {
    unsigned int buff[4096];
    int local_c = 0;
    long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
    long long e = b+335545;
    if(b%2==0) ++b;
    for(long long i=b;i<e && i<n;i+=2) {
        if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
        if(hashh(i)==0) {
            buff[local_c++]=(unsigned int)i;
            if(local_c==4096) {
                int start = atomicAdd(c,local_c);
                for(int i=0;i<local_c;++i) out[i+start]=buff[i];
                local_c=0;
            }
        }
    }
    int start = atomicAdd(c,local_c);
    for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}

// find base for which all elements in input are NOT SPRP. base is from {2,..,34} stored in 32bit uint
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
      __shared__ unsigned int s[32];
    unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
    unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
    unsigned int e = b+dif>(*count)?(*count):b+dif;
    unsigned int mysol = 0;
    for(long long i = 2; i<33; ++i) {
          int sol = 1;
          // each thread doing its part
          for(unsigned int j = b; j<e ; ++j) {
              //is some element is sprp base i break
              if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
              sol=0;
              break;
              }
          }
          // if all elements passed store base to mysol
          if(sol==1) mysol|=1<<(i-2);
    }
    s[threadIdx.x] = mysol;
    // save thread_result
    backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
    __syncthreads();
    // compute global resulte and store it to ans
    if(threadIdx.x==0) {
          unsigned int global_sol = ~0;
          for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
          atomicAnd(ans,global_sol);
    }
}


int main(void) {
// number of blocks & thread for solve
const int blocks = 400;
const int threads = 32;

unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };

printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));

/*-----*/

// allocate buffers
unsigned int *dev_input,*dev_count;
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23))));
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,4)));
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,4)));
find<<<400,32>>>(dev_input,dev_count);
cudaDeviceSynchronize();

unsigned int count;
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost)));

// sort found elements just to make debbug easier, it is not necessary
unsigned int *backup_numbers = new unsigned int[1000000];
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;

printf("\nsize: %u\n",count);

// allocate buffers
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads)));
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4)));
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4)));

solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
cudaDeviceSynchronize();

unsigned int ans,*backup;
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost)));
backup = new unsigned int[400*32];
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;

// compute global result using backuped thread_results
// notice backup_ans and ans MUST be the same, but they are NOT (WHY!)
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans);
printf("%u\n",backup[48]);

delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}

All code except solve() method works as intend. solve() method just computes bullshit (because backup_ans and ans differ) and it is also giving me the "unspecified launch failure" error on last two cudaMemcpy. When i run solve<<<1,1>>>(...) i got

ans: 134816642 backup_ans 432501552

but when i run solve<<<400,32>>>(...) it gives me

ans: 134816642 backup_ans 0 (correct answer should be 0)

In all situations it should compute backup_ans=ans=0

Any advice what i am doing wrong would be helpful.

Code for generating primes.bin

#include <cstdlib>
#include <stdio.h>
using namespace std;

const unsigned long long n = 1ll<<32;
const int buffer_size = 2000000;

typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;

uch *primes;

int prime(long long x) {
if(x==2) return 1;
if(x%2==0) return 0;
long long pos = x/16;
long long index = (x&15)>>1;
return (1<<index)&(~(primes[pos]));
}
void eratosten_sieve(void) {
  long long pos;
  long long index;
  for(long long i=3;i*i<n;++i) {
      if(!prime(i)) continue;
      for(long long j=i*i;j<n;j+=(i<<1)) {
      pos = j/16;
      index = ((j&15)>>1);
      primes[pos]|=(1<<index);
      }
  }

}

int main(void) {
primes = new uch[(n/16)+1];
for(long long i=0;i<(n/16)+1;++i) primes[i]=0;
printf("generating\n");
eratosten_sieve(); 
int l = n/16 +1;
printf("writing\n");
FILE *f = fopen("primes.bin","wb");
fwrite(primes,1,l,f);
fclose(f);
printf("done\n");
delete[] primes;
}

PS: i am compiling it by nvcc -arch compute_11

CUDA Driver Version / Runtime Version          5.5 / 5.5
CUDA Capability Major/Minor version number:    1.1
Total amount of global memory:                 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, (  8) CUDA Cores/MP:     112 CUDA Cores
GPU Clock rate:                                1500 MHz (1.50 GHz)
Memory Clock rate:                             900 Mhz
Memory Bus Width:                              256-bit
Maximum Texture Dimension Size (x,y,z)         1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers  1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers  2D=(8192, 8192), 512 layers
Total amount of constant memory:               65536 bytes
Total amount of shared memory per block:       16384 bytes
Total number of registers available per block: 8192
Warp size:                                     32
Maximum number of threads per multiprocessor:  768
Maximum number of threads per block:           512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size    (x,y,z): (65535, 65535, 1)
Maximum memory pitch:                          2147483647 bytes
Texture alignment:                             256 bytes
Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
Run time limit on kernels:                     Yes
Integrated GPU sharing Host Memory:            No
Support host page-locked memory mapping:       Yes
Alignment requirement for Surfaces:            Yes
Device has ECC support:                        Disabled
Device supports Unified Addressing (UVA):      No
Device PCI Bus ID / PCI location ID:           1 / 0
Compute Mode:
  < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS

score 1 · Accepted Answer

好的，你的内存不足了。我花了一段时间才弄清楚，因为我没有考虑大型静态分配：

__device__ unsigned char primes[(1<<28)+1];

通常，当人们内存不足时，他们会在cudaMalloc操作中发现它。就你而言，你的 GPU 有 1GB 内存，我猜你也在上面托管一个显示器（你没有回答那个问题）。看看nvidia-smi -a输出中有多少可用内存，它看起来像这样：

FB Memory Usage
    Total                       : 1535 MiB
    Used                        : 3 MiB
    Free                        : 1532 MiB

您的数字会更小 - 免费线是我们关心的。

您的动态分配（即 from cudaMalloc）分配了大约 350MB。但是内核启动带来了静态分配，然后您的总占用空间上升到超过 700MB（2^28 超过 250MB）。如果您在该 GPU 上运行显示器，它将消耗 1GB 内存中的一部分，使您无法运行需要 700MB 的内核。

如果你想在那个 GPU 上运行，看看你是否能以某种方式减少你的问题大小。

进行适当的 cuda 错误检查总是好的，但除了这个问题之外，您的代码似乎在具有更多内存的设备上运行没有错误。

c++ - inexplicable cuda behavior related to memory

1 回答 1

Related

Reference