我正在尝试编译一个基本的 CUDA 矩阵乘法程序,但我遇到了这个错误:
nvcc -I. -I/usr/local/cuda/include -c matrixMult1.cu -o matrixMult1.o
make: nvcc: Command not found
make: *** [matrixMult1.o] Error 127
我最初遇到了另一个错误,建议我使用 nvcc,唯一的问题是我对 nvcc 一无所知。有人有想法吗?提前致谢!
生成文件:
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.cu
$(GCC) $(INCLUDES) -c matrixMult1.cu -o $@
matrixMult1: matrixMult1.o
$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)
clean:
$(RM) *.o *~
核心:
//********************************************************************
// matrixMul_kernel.cu
//
// Kernel for a basic matrix multiplication program.
//********************************************************************
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
/* Thread block size */
#define BLOCK_SIZE 3
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
/* CUDA Kernel */
__global__ void matrixMul (float * C, float * A, float * B, int wA,
int wB) {
/* Two dimensional thread ID */
int tx = threadIdx.x;
int ty = threadIdx.y;
/* Computation holder variable */
float value = 0;
/* Loop through row of A and column of B to compute cell of C */
for (int i = 0; i < wA; ++i) {
float elementA = A[ty * wA + i];
float elementB = B[i * wB + tx];
value += elementA * elementB;
}
/* Write the result to C */
C[ty * wA + tx] = value;
}
#endif
主程序:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMul_kernel.cu>
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Set random seed */
srand(2013);
/* Compute memory sizes for matrices A, B, and C */
unsigned int sizeA = WA * HA;
unsigned int sizeB = WB * HB;
unsigned int sizeC = WC * HC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/* Allocate memory for matrices A, B, and C */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(matrixA, sizeA);
initMatrix(matrixB, sizeB);
/* Print matrix A */
printf("\nMatrix A:\n");
for (int i = 0; i < sizeA; i++) {
printf("%f ", matrixA[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Print matrix B */
printf("\nMatrix B:\n");
for (int i = 0; i < sizeB; i++) {
printf("%f ", matrixB[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Allocate device memory */
float* deviceMemA;
float* deviceMemB;
float* deviceMemC;
cudaMalloc((void**) &deviceMemA, memoryA);
cudaMalloc((void**) &deviceMemB, memoryB);
cudaMalloc((void**) &deviceMemC, memoryC);
/* Copy host memory to device */
cudaMemcpy(deviceMemA, matrixA, memoryA,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemB, matrixB, memoryB,
cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
/* Execute kernel */
matrixMul<<< grid, threads >>>(deviceMemC, deviceMemA,
deviceMemB, WA, WB);
cudaMemcpy(deviceMemC, matrixC, memoryC,
cudaMemcpyHostToDevice);
/* Print matrix C */
printf("\nMatrix C:\n");
for (int i = 0; i < sizeC; i++) {
printf("%f ", matrixC[i]);
if (((i + 1) % WC) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
printf("\n");
/* Free up memory */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceMemA);
cudaFree(deviceMemB);
cudaFree(deviceMemC);
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; ++i) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float)RAND_MAX;
}
}