I have source code that parallelizes a model using a GPU library, and I am trying to compile it on a UNIX system. However, the computation time does not decrease when I use more than one GPU node (5, 10, 20, 30, ...), so I suspect that I am not using the right compiler or compiler settings. 'solve.cu' is the source file that uses the CUDA library. The makefile I wrote is as follows:
# Shell override, left disabled as in the original:
# SHELL=/bin/ksh

# --- Locations and program name ---------------------------------------------
SOURCELOC  :=
UTILITYLOC :=
NEWMOD     :=
PROGRAM    := mf2k
INCDIR     := .

# --- Fortran compilers and their flags --------------------------------------
F77      := mpif77
F90      := mpif90
F77FLAGS :=
F90FLAGS :=

# --- C compiler and its flags -----------------------------------------------
# -D_UF selects UNIX naming conventions for mixed-language compilation.
CC     := mpicc
CFLAGS := -D_UF -O3
# GMG object files, in the order they are handed to the linker.
GMG := r_vector.o solvers.o ccfd.o mf2kgmg.o gmg1.o
#
# CUDA / MPI settings: compiler, header search paths and library flags.
#
CUDACC    := nvcc
# Extra nvcc options (empty by default).
CUDAFLAGS :=

# Installation roots, kept in internal helpers so that each versioned path is
# spelled only once.
cuda_root := /opt/apps/cuda/4.1.28
mpi_root  := /opt/apps/intel13_1/openmpi/1.6.4

# Header search paths.
CUDA_INC   := -I $(cuda_root)/include
VT_MPI_INC := -I $(mpi_root)/include
CUSPINCDIR := -I /home/zhangmj/MF2K_JIXIAOHUI_MAKE

# Library directories and the library flags derived from them.
CUDA_LIB64 := $(cuda_root)/lib64
VT_MPI_LIB := $(mpi_root)/lib
LFLAGS     := -L$(VT_MPI_LIB) -lmpi -L$(CUDA_LIB64) -lcuda -lcudart
# CUSP objects: the object built from the CUDA source.
CUSP := solve.o

# Libraries placed at the end of the link command.
# Earlier setting, kept for reference: SYSLIBS= -lmisalign -ldgc -lm
SYSLIBS := -lc
USRLIB  :=
# All object files that make up the program (called "Modtools" in the original
# comments). The list keeps the original order; it is only regrouped so that
# several names share a line.
OBJECTS := outputA_b.o mf2k.o mhc1.o ctime.o daf1.o de45.o glo1bas6.o gutsdaf.o

# gwf1* objects
OBJECTS += gwf1bas6.o gwf1bcf6.o gwf1chd6.o gwf1drn6.o gwf1drt1.o gwf1ets1.o
OBJECTS += gwf1evt6.o gwf1fhb1.o gwf1gag5.o gwf1ghb6.o gwf1hfb6.o gwf1huf2.o
OBJECTS += gwf1ibs6.o gwf1lak3.o gwf1lpf1.o gwf1mnw1.o gwf1mnw2.o gwf1mnwi.o
OBJECTS += gwf1rch6.o gwf1res1.o gwf1riv6.o gwf1sfr2.o gwf1str6.o gwf1sub1.o
OBJECTS += gwf1swt1.o gwf1wel6.o

OBJECTS += hufutl2.o hydmod.o lmg1.o lmt6.o memchk.o

# obs1* objects
OBJECTS += obs1adv2.o obs1bas6.o obs1drn6.o obs1drt1.o obs1ghb6.o obs1riv6.o
OBJECTS += obs1str6.o

OBJECTS += parutl1.o pcg2.o pes1bas6.o pes1gau1.o rtedaf.o

# sen1* objects
OBJECTS += sen1bas6.o sen1chd6.o sen1drn6.o sen1drt1.o sen1ets1.o sen1evt6.o
OBJECTS += sen1ghb6.o sen1hfb6.o sen1huf2.o sen1lpf1.o sen1rch6.o sen1riv6.o
OBJECTS += sen1str6.o sen1wel6.o

OBJECTS += sip5.o sor5.o utl6.o para-non.o
# Top-level goals. `install` and `all` are names, not files, so they are
# declared phony. `install` stays first so that it remains the default goal.
.PHONY: install all

install: mf2k

# Define Task Function Program Modtools
all: mf2k

# Link the mf2k executable from the program objects, the GMG objects and the
# CUSP (CUDA) object.
#
# The link command must sit on its own TAB-indented line. When it is written
# on the rule line itself, make reads every word of the command as a
# prerequisite and the link is never run.
#
# The CUDA runtime library is taken from $(CUDA_LIB64) instead of repeating the
# versioned path. The leading '-' (ignore errors) has been dropped so that a
# failed link makes the build fail instead of being reported as success.
# NOTE(review): solve.o is compiled by nvcc; if the link reports unresolved
# C++ runtime symbols, -lstdc++ may have to be added here -- confirm on the
# target system.
mf2k: $(OBJECTS) $(GMG) $(CUSP)
	$(F77) $(F77FLAGS) -o $@ -L$(CUDA_LIB64) -lcudart $(OBJECTS) $(GMG) $(CUSP) $(USRLIB) $(SYSLIBS)
# ---------------------------------------------------------------------------
# Compile rules.
# Every command line below starts with a TAB character; make rejects recipe
# lines that start in column 0 ("missing separator").
# ---------------------------------------------------------------------------

# GPU architecture handed to nvcc through -arch. It can be overridden from the
# command line, e.g. `make CUDA_ARCH=sm_20`.
# NOTE(review): sm_13 is carried over unchanged; confirm that it matches the
# GPUs installed on the compute nodes.
CUDA_ARCH = sm_13

# Modtools object codes
mf2k.o: mf2k.f
	$(F77) $(F77FLAGS) -c $<

# The source lives in serial/; $(INCDIR) is added to the include search path.
para-non.o: serial/para-non.f
	$(F77) $(F77FLAGS) -I$(INCDIR) -c $<

# Generic rule: build any .o from the .f file of the same name.
.f.o:
	$(F77) $(F77FLAGS) -c $<

# Fortran 90 source, compiled with $(F90). mhc1.f90 is listed as a
# prerequisite so that editing it triggers a rebuild; without a prerequisite
# an existing mhc1.o would never be refreshed.
mhc1.o: mhc1.f90
	$(F90) $(F90FLAGS) -c $<

# Generic rule: build any .o from the .c file of the same name.
.c.o:
	$(CC) $(CFLAGS) -c $<

# CUDA source, compiled with nvcc. Only compile-time options are passed:
# $(CUDAFLAGS), the target architecture and the include paths. The linker
# flags in $(LFLAGS) are left out because -c stops before linking.
solve.o: solve.cu
	$(CUDACC) $(CUDAFLAGS) -c -arch $(CUDA_ARCH) $(VT_MPI_INC) $(CUDA_INC) $(CUSPINCDIR) $<
# end
I would greatly appreciate any help. Thank you so much.