我正在编写一个程序来比较 JBlas 和 JCublas 的速度。当我第一次调用以下函数时,一切正常,并且 v 包含正确的特征向量。当我第二次调用它时,计算时间要少得多,但只返回输入的对称矩阵a
,就好像d_A
的值从未改变一样。
该函数似乎只在奇数调用上按预期工作。我有一种预感,这个错误是由于 GPU 内存中的某些东西没有被正确清除,但我找不到它。
public static void getSymEigenGPU(cusolverDnHandle handle,
DoubleMatrix a) {
int n2 = a.length;
int n = a.rows;
double[] a1d = to1d(a);
double[] v = new double[n2];
double[] w = new double[n];
Pointer h_A = Pointer.to(a1d);
Pointer h_V = Pointer.to(v);
Pointer h_W = Pointer.to(w);
Pointer d_A = new Pointer();
Pointer d_V = new Pointer();
Pointer d_W = new Pointer();
Pointer d_work = new Pointer();
JCuda.cudaMalloc(d_A, (long) n2 * Sizeof.DOUBLE);
JCuda.cudaMalloc(d_V, (long) n2 * Sizeof.DOUBLE);
JCuda.cudaMalloc(d_W, n * Sizeof.DOUBLE);
int jobz = CUSOLVER_EIG_MODE_VECTOR;
int uplo = CUBLAS_FILL_MODE_UPPER;
JCuda.cudaMemcpy(d_A, h_A, (long) n2 * Sizeof.DOUBLE,
cudaMemcpyHostToDevice);
int[] lworkl = new int[1];
JCusolverDn.cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, d_A, n,
d_W, lworkl);
int lwork = lworkl[0];
JCuda.cudaMalloc(d_work, (long) lwork * Sizeof.DOUBLE);
NanoStopWatch sw = NanoStopWatch.sw();
JCusolverDn.cusolverDnDsyevd(handle, jobz, uplo, n, d_A, n,
d_W, d_work, n2, new Pointer());
System.out.println("sw.stop() = " + sw.stop());
JCuda.cudaMemcpy(h_W, d_W, Sizeof.DOUBLE * n, cudaMemcpyDeviceToHost);
JCuda.cudaMemcpy(h_V, d_A, (long) Sizeof.DOUBLE * n2,
cudaMemcpyDeviceToHost);
pp(from1d(v));
JCuda.cudaFree(d_A);
JCuda.cudaFree(d_V);
JCuda.cudaFree(d_W);
JCuda.cudaFree(d_work);
}