我想编写一个 CUDA 内核,它将乘以 2 个矩阵 NxN 大小。我确实设法做到了,但是没有线程合作......现在我想用线程合作来做,我按照SDK中提供的代码进行操作。但由于某种原因,内核返回不同的结果。所以这里是 .cu 文件:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<device_functions.h>
static void HandleError(cudaError_t err, const char *file, int line)
{
if(err!=cudaSuccess){
printf("%s in %s file at line %s\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#define ORDER 4
__global__ void matrixMul( int* A, int* B, int* C, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int aBegin = wA * ORDER * by;
int aEnd = aBegin + wA - 1;
int aStep = ORDER;
int bBegin = ORDER * bx;
int bStep = ORDER * wB;
int Csub=0;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ int As[ORDER][ORDER];
__shared__ int Bs[ORDER][ORDER];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads();
#pragma unroll
for (int k = 0; k < ORDER; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
}
int c = wB * ORDER * by + ORDER * bx;
C[c + wB * ty + tx] = Csub;
}
#endif
int main()
{
int *a=(int*)malloc(ORDER*ORDER*sizeof(int));
int *b=(int*)malloc(ORDER*ORDER*sizeof(int));
int *c=(int*)malloc(ORDER*ORDER*sizeof(int));
int *dev_a, *dev_b, *dev_c;
HANDLE_ERROR(cudaMalloc((void**)&dev_a, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c, ORDER*ORDER*sizeof(int*)));
for(int i=0; i<ORDER*ORDER; i++)
{
a[i]=1;
b[i]=2;
}
HANDLE_ERROR(cudaMemcpy(dev_a, a, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
matrixMul<<<ORDER, ORDER>>>(dev_a, dev_b, dev_c, ORDER, ORDER);
HANDLE_ERROR(cudaMemcpy(c, dev_c, ORDER*ORDER*sizeof(int), cudaMemcpyDeviceToHost));
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", a[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", b[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", c[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
是的,我知道没有“真正的”问题......但如果有人能指出我正确的方向,我将不胜感激。谢谢!
如果您需要更多代码示例,请告诉我,我将编辑问题。
编辑#1:我忘了提...我无法在 Visual Studi 2010 中实现 nvcc,所以我无法使用调试器。对此有何建议?
编辑#2:更新了问题,因此它同时显示了 CUDA 内核和 main。