-1

我写了一个关于字符串比较的简单代码。代码如下所示。这很简单。只需比较字符串a和字符串b,如果对应的元素相同,则将5赋给新的矩阵s;如果对应的元素不同,则将-3赋给新的矩阵s。不存在编译错误。但结果不是我所期望的。请给我一些有用的建议。谢谢!

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "book.h"
#define M 6
#define BLOCK_SIZE 30 // maximum 1024 threads per block
#define GRID_SIZE 30 // 900 blocks per grid
#define P (900 * 900)

void Init();


char *gpu_a;
char *gpu_b;
float *gpu_s;
float *cpu_s;


char cpu_a[6] = {'A', 'T', 'G', 'C', 'G', 'T'};
char cpu_b[6] = {'G', 'T', 'G', 'A', 'T', 'G'};

void cpu_Allocate1dArray()
{
   //cpu_a = (char*) malloc( M * sizeof( char) );
   //cpu_b = (char*) malloc( M * sizeof(char) );
   cpu_s = (float*) malloc( M * sizeof( float) );
}

void gpu_Allocate1dArray()
{
    cudaMalloc( (void**)&gpu_a, M * sizeof(char) );
    cudaMalloc( (void**)&gpu_b, M * sizeof(char) );
cudaMalloc( (void**)&gpu_s, M * sizeof(float));
}


__global__ void mykernel( char *gpu_a, char *gpu_b, float *gpu_s)
{
    int i , j , tid;
    i = threadIdx.x + blockIdx.x * blockDim.x;
j = threadIdx.y + blockIdx.y * blockDim.y;
    tid = i + j * blockDim.x * gridDim.x;
    if ( tid  < P)
{
       if( gpu_a[i] == gpu_b[j])
        {
      gpu_s[tid] = 5;
    }
       else
    gpu_s[tid] = -3;
    }         
}


int main()
{
   int q;
   cpu_Allocate1dArray();
   gpu_Allocate1dArray();

   Init();

   dim3 gridDim;
   dim3 blockDim;

   blockDim.x = blockDim.y = BLOCK_SIZE;
   gridDim.x = gridDim.y = GRID_SIZE;

   cudaMemcpy( gpu_a, cpu_a, sizeof(char) * M, cudaMemcpyHostToDevice);
   cudaMemcpy( gpu_b, cpu_b, sizeof(char) * M, cudaMemcpyHostToDevice);
   mykernel<<<gridDim, blockDim>>>(gpu_a, gpu_b, gpu_s);
   cudaMemcpy( cpu_s, gpu_s, sizeof(float)* M, cudaMemcpyDeviceToHost);

   for (q = 0; q < M; q++)
       printf("%f ", cpu_s[q]);


   printf("\n");    

    //Free device memory

    free(cpu_s);

    cudaFree(gpu_s);

    cudaFree(gpu_a);
    cudaFree(gpu_b); 

    return 0;

}        

void Init()
{
   int i;
   for (i = 0; i < M; i++)

   cpu_s[i] = 0;
}

结果是: [Smith@server]$ ./test88.exe 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

4

1 回答 1

2

您的代码试图超出数组长度 - gpu_s 长度为 6 * sizeof(float) 而 tid 最高可达 900*900。

将 P 设置为 6 打印输出:-3.000000 -3.000000 5.000000 -3.000000 5.000000 -3.000000

注意 - 您可以通过使用cuda-memcheck运行应用程序来轻松检测此类问题。

于 2012-12-06T23:33:11.383 回答