random - CUDA - 多线程

Question

我正在尝试使用 CUDA 和 GPU 使 LCG 随机数生成器并行运行。但是，我实际上无法同时运行多个线程。这是代码的副本：

#include <iostream>
#include <math.h>

__global__ void rng(long *cont)
{

    int a=9, c=3, F, X=1; 
    long M=524288, Y;     
    printf("\nKernel X is %d\n", X[0]);     
    F=X;
    Y=X;
    printf("Kernel F is %d\nKernel Y is %d\n", F, Y);
    Y=(a*Y+c)%M;
    printf("%ld\t", Y);
    while(Y!=F)
    {
        Y=(a*Y+c)%M;
        printf("%ld\t", Y);
    cont[0]++;
    }
}
int main()
{
    long cont[1]={1};
    int X[1];
    long *dev_cont;
    int *dev_X;
    cudaEvent_t beginEvent;
    cudaEvent_t endEvent;
    cudaEventCreate( &beginEvent );
    cudaEventCreate( &endEvent );
    printf("Please give the value of the seed X ");
    scanf("%d", &X[0]);
    printf("Host X is: %d", *X);
    cudaEventRecord( beginEvent, 0);
    cudaMalloc( (void**)&dev_cont, sizeof(long) );
    cudaMalloc( (void**)&dev_X, sizeof(int) );
    cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_X, X, 1 * sizeof(int), cudaMemcpyHostToDevice);
    rng<<<1,1>>>(dev_cont);
    cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
    cudaEventRecord( endEvent, 0);
    cudaEventSynchronize (endEvent );
    float timevalue;
    cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
    printf("\n\nYou generated a total of %ld numbers", cont[0]);
    printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
    cudaFree(dev_cont);
    cudaFree(dev_X);
    cudaEventDestroy( endEvent );
    cudaEventDestroy( beginEvent );
    return 0;
}

现在我只用一个线程发送一个块。但是，如果我发送 100 个线程，唯一会发生的事情是它会产生相同的数字 100 次，然后继续下一个数字。从理论上讲，这是意料之中的，但当一个数字重复时，它会自动忽略“随机数”的目的。

我想要实现的想法是拥有多个线程。一个线程将使用该公式：Y=(a*Y+c)%M 但使用初始值 Y=1，然后另一个线程将使用相同的公式但初始值为 Y=1000 等。但是，一旦第一个线程产生了 1000 个数字，它就需要停止进行更多的计算，因为如果它继续下去，它将干扰第二个线程产生值 Y=1000 的数字。

如果有人能指出正确的方向，至少在创建具有不同功能或指令的多个线程以并行运行的方式上，我将尝试找出其余的。

谢谢！

更新：美国东部标准时间 7 月 31 日晚上 8:14

我将我的代码更新为以下内容。基本上我正在尝试产生 256 个随机数。我创建了将存储这 256 个数字的数组。我还为线程中的 Y 值创建了一个包含 10 个不同种子值的数组。我还更改了代码以请求设备中的 10 个线程。我还保存了在数组中生成的数字。代码无法正常工作。请告知如何修复它或如何使其达到我想要的效果。

谢谢！

#include <iostream>
#include <math.h>

__global__ void rng(long *cont, int *L, int *N)
{

    int Y=threadIdx.x;
    Y=N[threadIdx.x];
    int a=9, c=3, i;
    long M=256;
    for(i=0;i<256;i++)
    {
        Y=(a*Y+c)%M;
        N[i]=Y;
        cont[0]++;
    }
}
int main()
{
    long cont[1]={1};
    int i;
    int L[10]={1,25,50,75,100,125,150,175,200,225}, N[256];
    long *dev_cont;
    int *dev_L, *dev_N;
    cudaEvent_t beginEvent;
    cudaEvent_t endEvent;
    cudaEventCreate( &beginEvent );
    cudaEventCreate( &endEvent );
    cudaEventRecord( beginEvent, 0);
    cudaMalloc( (void**)&dev_cont, sizeof(long) );
    cudaMalloc( (void**)&dev_L, sizeof(int) );
    cudaMalloc( (void**)&dev_N, sizeof(int) );
    cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_L, L, 10 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_N, N, 256 * sizeof(int), cudaMemcpyHostToDevice);
    rng<<<1,10>>>(dev_cont, dev_L, dev_N);
    cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
    cudaMemcpy(N, dev_N, 256 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord( endEvent, 0);
    cudaEventSynchronize (endEvent );
    float timevalue;
    cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
    printf("\n\nYou generated a total of %ld numbers", cont[0]);
    printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
    printf("Your numbers are:");
    for(i=0;i<256;i++)
    {
        printf("%d\t", N[i]);
    }
    cudaFree(dev_cont);
    cudaFree(dev_L);
    cudaFree(dev_N);
    cudaEventDestroy( endEvent );
    cudaEventDestroy( beginEvent );
    return 0;
}

@Bardia - 请让我知道如何更改代码以满足我的需求。

更新：美国东部标准时间 8 月 1 日下午 5:39

我编辑了我的代码以适应@Bardia 对内核代码的修改。然而，数字生成过程中出现了一些错误。首先，我在内核中创建的用于计算正在创建的数字数量的计数器不起作用。最后它只显示生成了“1”号。我创建的用于测量内核执行指令所需时间的计时器也无法正常工作，因为它一直显示 0.00 毫秒。根据我为公式设置的参数，正在生成并复制到数组中然后打印在屏幕上的数字并不反映要出现（甚至接近）的数字。这些都是以前工作的。

这是新代码：

#include <iostream>
#include <math.h>

__global__ void rng(long *cont, int *L, int *N)
{

    int Y=threadIdx.x;
    Y=L[threadIdx.x];
    int a=9, c=3, i;
    long M=256;
    int length=ceil((float)M/10); //256 divided by the number of threads.
    for(i=(threadIdx.x*length);i<length;i++)
    {
        Y=(a*Y+c)%M;
        N[i]=Y;
        cont[0]++;
    }
}
int main()
{
    long cont[1]={1};
    int i;
    int L[10]={1,25,50,75,100,125,150,175,200,225}, N[256];
    long *dev_cont;
    int *dev_L, *dev_N;
    cudaEvent_t beginEvent;
    cudaEvent_t endEvent;
    cudaEventCreate( &beginEvent );
    cudaEventCreate( &endEvent );
    cudaEventRecord( beginEvent, 0);
    cudaMalloc( (void**)&dev_cont, sizeof(long) );
    cudaMalloc( (void**)&dev_L, sizeof(int) );
    cudaMalloc( (void**)&dev_N, sizeof(int) );
    cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_L, L, 10 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_N, N, 256 * sizeof(int), cudaMemcpyHostToDevice);
    rng<<<1,10>>>(dev_cont, dev_L, dev_N);
    cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
    cudaMemcpy(N, dev_N, 256 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord( endEvent, 0);
    cudaEventSynchronize (endEvent );
    float timevalue;
    cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
    printf("\n\nYou generated a total of %ld numbers", cont[0]);
    printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
    printf("Your numbers are:");
    for(i=0;i<256;i++)
    {
        printf("%d\t", N[i]);
    }
    cudaFree(dev_cont);
    cudaFree(dev_L);
    cudaFree(dev_N);
    cudaEventDestroy( endEvent );
    cudaEventDestroy( beginEvent );
    return 0;
}

这是我收到的输出：

[wigberto@client2 CUDA]$ ./RNG8


You generated a total of 1 numbers
CUDA Kernel Time: 0.00 ms
Your numbers are:614350480      32767   1132936976      11079   2       0       10      0       1293351837      0       -161443660      48      0       0       614350336       32767    1293351836      0       -161444681      48      614350760       32767   1132936976      11079   2       0       10      0       1057178751      0       -161443660      48       155289096       49      614350416       32767   1057178750      0       614350816       32767   614350840       32767   155210544       49      0       0       1132937352       11079   1130370784      11079   1130382061      11079   155289096       49      1130376992      11079   0       1       1610    1       1       1       1130370408      11079    614350896       32767   614350816       32767   1057178751      0       614350840       32767   0       0       -161443150      48      0       0       1132937352      11079    1       11079   0       0       1       0       614351008       32767   614351032       32767   0       0       0       0       0       0       1130369536      1       1132937352       11079   1130370400      11079   614350944       32767   1130369536      11079   1130382061      11079   1130370784      11079   1130365792      11079   6143510880       614351008       32767   -920274837      0       614351032       32767   0       0       -161443150      48      0       0       0       0       1       0       128     0-153802168      48      614350896       32767   1132839104      11079   97      0       88      0       1       0       155249184       49      1130370784      11079   0       0-1      0       1130364928      11079   2464624 0       4198536 0       4198536 0       4197546 0       372297808       0       1130373120      11079   -161427611      48      111079   0       0       1       0       -153802272      48      155249184       49      372297840       0       -1      0       -161404446      48      0       0       0       0372298000       0       372297896       0       372297984       0       0       0       0       0       1130369536      11079   84      0       1130471067      11079   6303744 0614351656       32767   0       0       -1      0       4198536 0       4198536 0       4197546 0       1130397880      11079   0       0       0       0       0       0       00       0       0       -161404446      48      0       0       4198536 0       4198536 0       6303744 0       614351280       32767   6303744 0       614351656       32767   614351640        32767   1       0       4197371 0       0       0       0       0       [wigberto@client2 CUDA]$

@Bardia - 请告知这里最好的做法。

谢谢！

score 2 · Accepted Answer

threadIdx您可以按变量寻址块内的线程。即，在您的情况下，您可能应该设置

Y = threadIdx.x然后使用Y=(a*Y+c)%M

但总的来说，在 CUDA 上实现一个好的 RNG 可能真的很困难。所以我不知道你是否想实现自己的生成器只是为了练习..

否则，有一个可用的 CURAND 库，它提供了许多伪和准随机生成器，即。XORWOW、MersenneTwister、Sobol 等

score 0 · Accepted Answer

它应该在所有线程中执行相同的工作，因为您希望它们执行相同的工作。您应该始终通过寻址来区分线程。

例如，你应该说线程#1 你做这个工作并保存你在这里的工作，线程#2 你做那个工作并在那里保存你的工作，然后转到主机并使用该数据。

对于每个块中具有二维线程的二维块网格，我使用此代码进行寻址：

int X = blockIdx.x*blockDim.x+threadIdx.x;
int Y = blockIdx.y*blockDim.y+threadIdx.y;

上面代码中的XandY是你的线程的全局地址（我认为你的一维网格和线程就足够了）。

另请记住，您不能printf在内核上使用该功能。GPU 不能进行任何中断。为此，您可以使用cuPrintf作为 CUDA SDK 示例之一的函数，但请阅读它的说明以正确使用它。

score 0 · Accepted Answer

该答案与问题的已编辑部分有关。

我没有注意到它是一种递归算法，不幸的是我不知道如何并行化递归算法。

我生成这 256 个数字的唯一想法是分别生成它们。即在第一个线程中生成 26 个，在第二个线程中生成 26 个，依此类推。此代码将执行此操作（这只是内核部分）：

#include <iostream>
#include <math.h>

__global__ void rng(long *cont, int *L, int *N)
{

    int Y=threadIdx.x;
    Y=L[threadIdx.x];
    int a=9, c=3, i;
    long M=256;
    int length=ceil((float)M/10); //256 divided by the number of threads.
    for(i=(threadIdx.x*length);i<length;i++)
    {
        Y=(a*Y+c)%M;
        N[i]=Y;
        cont[0]++;
    }
}

random - CUDA - 多线程

更新：美国东部标准时间 7 月 31 日晚上 8:14

更新：美国东部标准时间 8 月 1 日下午 5:39

3 回答 3

Related

Reference