尽管我遵循了“CUDA Programming Guide”中的附录 C“Compiling Dynamic Parallelism”以及此处给出的解决方案,但我无法解决我遇到的问题。编译和链接(制作 DivideParalelo)后,出现以下错误:
./build/metodos.o: In function `__sti____cudaRegisterAll_42_tmpxft_00002599_00000000_6_metodos_cpp1_ii_32c9141e()':
tmpxft_00002599_00000000-3_metodos.cudafe1.cpp:(.text.startup+0x15): undefined reference to `__cudaRegisterLinkedBinary_42_tmpxft_00002599_00000000_6_metodos_cpp1_ii_32c9141e'
./build/GPUutil.o: In function `__sti____cudaRegisterAll_42_tmpxft_000025c0_00000000_6_GPUutil_cpp1_ii_f81fb8b5()':
tmpxft_000025c0_00000000-3_GPUutil.cudafe1.cpp:(.text.startup+0x15): undefined reference to `__cudaRegisterLinkedBinary_42_tmpxft_000025c0_00000000_6_GPUutil_cpp1_ii_f81fb8b5'
./build/PCA_Kernels.o: In function `__sti____cudaRegisterAll_46_tmpxft_000025e6_00000000_6_PCA_Kernels_cpp1_ii_8a59b72a()':
tmpxft_000025e6_00000000-3_PCA_Kernels.cudafe1.cpp:(.text.startup+0x15): undefined reference to `__cudaRegisterLinkedBinary_46_tmpxft_000025e6_00000000_6_PCA_Kernels_cpp1_ii_8a59b72a'
./build/DivideParalelo.o: In function `__sti____cudaRegisterAll_49_tmpxft_0000260c_00000000_6_DivideParalelo_cpp1_ii_16d0a16f()':
tmpxft_0000260c_00000000-3_DivideParalelo.cudafe1.cpp:(.text.startup+0x385): undefined reference to `__cudaRegisterLinkedBinary_49_tmpxft_0000260c_00000000_6_DivideParalelo_cpp1_ii_16d0a16f'
make: *** [DivideParalelo] Error 1
下面列出了我的代码的简化版本。
DivideParalelo.cu:
#include <stdio.h> #include <string.h>
/*C includes*/
extern"C" {
#include"io.h"
#include"util.h"
}
/* CUDA includes*/
#include"cuda.h"
#include"cublas.h"
#include"metodos.h"
#define CUDA_CHECK_RETURN(value) {
/...
}
#define DIM 100
/*
* image
* num_bands
* columns initially is lines_samples, later the number of endmembers
*/
__global__ void Divide(double *image, int num_bands, int columns, int DIM_MIN, int numColsLastPiece, double *out, double *piece) {
int tid=threadIdx.x; //col
int bid=blockIdx.x; //row
for (int tile=0;tile<(columns -1)/ DIM_MIN +1;tile++) {
__shared__ double sh_piece[DIM];
//some code here...
__syncthreads();
}
int mat=HYSIME(piece,columns,num_bands);
}
}
int main(int argc,
char** argv) {
//load file (argv[1]) with the image into dMt
//...
//Allocate GPU memory:
double *devicedM, *deviceOut;
CUDA_CHECK_RETURN(cudaMalloc((void**)&devicedM, num_bands*lines_samples*sizeof(double)));
CUDA_CHECK_RETURN(cudaMalloc((void**)&deviceOut, num_bands*lines_samples*sizeof(double)));
//here the call to the kernel
}
方法.cu:
extern "C"{
#include "util.h"
#include "io.h"
}
#include "cuda.h"
#include "cublas.h"
#include "PCA_Kernels.h"
#include "GPUutil.h"
#include <stdio.h>
__device__ __host__ int HYSIME(double *M, int lines_samples, int num_bands){
int N_END =0;
double *y;
double *w;
double *Rw;
y = (double*) malloc(lines_samples * num_bands * sizeof(double));
//changed to implement calloc in the device:
w = (double*) malloc(lines_samples * num_bands*sizeof(double));
memset (w,0,lines_samples * num_bands);
Rw = (double*) malloc(num_bands * num_bands* sizeof(double));
memset (Rw,0,num_bands * num_bands);
//some additional code here
estNoise(y, w, Rw, num_bands, lines_samples);//GPUutil.cu
return(N_END);
}
GPUutil.cu:
#include "cublas.h"
#include "cuda.h"
#include "cuda_runtime.h"
__device__ __host__ int destAdditiveNoise(double *r, double *w, double *Rw, int L, int N){
//the code
return (0);
}
__device__ __host__ int estNoise(double *y, double *w, double *Rw, int L, int N){
//the code
return (0);
}
__device__ __host__ int hysime(double *y, double *w, double *Rw, int L, int N){ //L is num_bands N is lines_samples
//the code
return(0);
}
生成文件:
MKL =1
#initial definitions (library paths et al.)
CUDA_PATH=/usr/local/cuda-6.5
MKLROOT=/home/emartel/intel/composer_xe_2015.0.090/mkl
BUILD_DIR=./build
####################
#includes
####################
#Cuda includes
CUDA_INCLUDE_DIR=-I. -I$(CUDA_PATH)/include
#-I$(SDK)/C/common/inc
#BLAS includes
BLAS_INCLUDE_DIR=-I. -I$(MKLROOT)/include
####################
#library search paths
####################
CUDA_LIB_DIR=-L$(CUDA_PATH)/lib64
#-L$(SDK)/C/lib -L$(SDK)/C/common/lib/linux
BLAS_LIB_DIR=-L$(MKLROOT)/lib/intel64 -L$(MKLROOT)/../compiler/lib/intel64
####################
#libraries
####################
CUDALIBS=-lcublas -lcudart
#-lcutil
#-lGL -lGLU
utilS= -lpthread -lm
####################
#other compilation flags
####################
CFLAGS= -Wwrite-strings
#-Wall
#-g
MKLFLAGS=-D __MKL
#sergio CUDAFLAGS= --gpu-architecture sm_30
#changed with sm_35
CUDAFLAGS= -arch=sm_35
LINKERFLAGS= -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a $(MKLROOT)/../compiler/lib/intel64/libiomp5.a -Wl,--end-group
####################
#utilities
####################
io.o : io.c
icc $(CFLAGS) -c -O3 io.c -o $(BUILD_DIR)/io.o
#BLAS and LAPACK wrapper
util.o : util.c
icc $(CFLAGS) $(MKLFLAGS) $(BLAS_INCLUDE_DIR) -c -O3 util.c -o $(BUILD_DIR)/util.o
#changed with rdec and -lcudadevrt:
metodos.o : metodos.cu
nvcc $(CUDAFLAGS) $(CUDA_INCLUDE_DIR) -c -O3 -rdc=true metodos.cu -lcudadevrt -o $(BUILD_DIR)/metodos.o
##################################
# PCA files
##################################
#changed with rdec and -lcudadevrt:
GPUutil.o: GPUutil.cu
nvcc $(CUDAFLAGS) $(CUDA_INCLUDE_DIR) -c -O3 -rdc=true GPUutil.cu -lcudadevrt -o $(BUILD_DIR)/GPUutil.o
#changed with rdec and -lcudadevrt:
PCA_Kernels.o: PCA_Kernels.cu
nvcc $(CUDAFLAGS) $(CUDA_INCLUDE_DIR) -c -O3 -rdc=true PCA_Kernels.cu -lcudadevrt -o $(BUILD_DIR)/PCA_Kernels.o
#changed with rdec and -lcudadevrt:
DivideParalelo.o: DivideParalelo.cu
nvcc $(CUDAFLAGS) $(CUDA_INCLUDE_DIR) -c -O3 -rdc=true DivideParalelo.cu -lcudadevrt -o $(BUILD_DIR)/DivideParalelo.o
#everything is already compiled, this is just a call to the linker
DivideParalelo: io.o util.o metodos.o GPUutil.o PCA_Kernels.o DivideParalelo.o
icc $(CFLAGS) $(BUILD_DIR)/io.o $(BUILD_DIR)/util.o $(BUILD_DIR)/metodos.o $(BUILD_DIR)/GPUutil.o $(BUILD_DIR)/PCA_Kernels.o $(BUILD_DIR)/DivideParalelo.o $(CUDA_LIB_DIR) $(BLAS_LIB_DIR) $(LINKERFLAGS) $(utilS) $(CUDALIBS) -o DivideParalelo
####################
#misc
####################
clean:
rm -rf $(BUILD_DIR)/*.o ./DivideParalelo
任何建议将不胜感激。也许我误解了动态并行的单独编译。