我在 matlab central 上发布了这个,但没有得到任何回复,所以我想我会在这里重新发布。
我最近在 Matlab 中编写了一个在 for 循环中使用 FFT 的简单例程;FFT 在计算中占主导地位。出于实验目的,我在 mex 中编写了相同的例程,它调用了 FFTW 3.3 库。事实证明,对于非常大的数组,matlab 例程比 mex 例程运行得更快(大约快两倍)。mex 例程使用智慧并执行相同的 FFT 计算。我也知道 matlab 使用 FFTW,但他们的版本是否可能稍微优化一些?我什至使用了 FFTW_EXHAUSTIVE 标志,对于大型数组,它的速度仍然比 MATLAB 对应的慢两倍。此外,我确保我使用的 matlab 是带有“-singleCompThread”标志的单线程,并且我使用的 mex 文件未处于调试模式。只是好奇是否是这种情况-或者是否有一些我不知道的 matlab 正在使用的优化。谢谢。
这是墨西哥的部分:
void class_cg_toeplitz::analysis() {
// This method computes CG iterations using FFTs
// Check for wisdom
if(fftw_import_wisdom_from_filename("cd.wis") == 0) {
mexPrintf("wisdom not loaded.\n");
} else {
mexPrintf("wisdom loaded.\n");
}
// Set FFTW Plan - use interleaved FFTW
fftw_plan plan_forward_d_buffer;
fftw_plan plan_forward_A_vec;
fftw_plan plan_backward_Ad_buffer;
fftw_complex *A_vec_fft;
fftw_complex *d_buffer_fft;
A_vec_fft = fftw_alloc_complex(n);
d_buffer_fft = fftw_alloc_complex(n);
// CREATE MASTER PLAN - Do this on an empty vector as creating a plane
// with FFTW_MEASURE will erase the contents;
// Use d_buffer
// This is somewhat dangerous because Ad_buffer is a vector; but it does not
// get resized so &Ad_buffer[0] should work
plan_forward_d_buffer = fftw_plan_dft_r2c_1d(d_buffer.size(),&d_buffer[0],d_buffer_fft,FFTW_EXHAUSTIVE);
plan_forward_A_vec = fftw_plan_dft_r2c_1d(A_vec.height,A_vec.value,A_vec_fft,FFTW_WISDOM_ONLY);
// A_vec_fft.*d_buffer_fft will overwrite d_buffer_fft
plan_backward_Ad_buffer = fftw_plan_dft_c2r_1d(Ad_buffer.size(),d_buffer_fft,&Ad_buffer[0],FFTW_EXHAUSTIVE);
// Get A_vec_fft
fftw_execute(plan_forward_A_vec);
// Find initial direction - this is the initial residual
for (int i=0;i<n;i++) {
d_buffer[i] = b.value[i];
r_buffer[i] = b.value[i];
}
// Start CG iterations
norm_ro = norm(r_buffer);
double fft_reduction = (double)Ad_buffer.size(); // Must divide by size of vector because inverse FFT does not do this
while (norm(r_buffer)/norm_ro > relativeresidual_cutoff) {
// Find Ad - use fft
fftw_execute(plan_forward_d_buffer);
// Get A_vec_fft.*fft(d) - A_vec_fft is only real, but d_buffer_fft
// has complex elements; Overwrite d_buffer_fft
for (int i=0;i<n;i++) {
d_buffer_fft[i][0] = d_buffer_fft[i][0]*A_vec_fft[i][0]/fft_reduction;
d_buffer_fft[i][1] = d_buffer_fft[i][1]*A_vec_fft[i][0]/fft_reduction;
}
fftw_execute(plan_backward_Ad_buffer);
// Calculate r'*r
rtr_buffer = 0;
for (int i=0;i<n;i++) {
rtr_buffer = rtr_buffer + r_buffer[i]*r_buffer[i];
}
// Calculate alpha
alpha = 0;
for (int i=0;i<n;i++) {
alpha = alpha + d_buffer[i]*Ad_buffer[i];
}
alpha = rtr_buffer/alpha;
// Calculate new x
for (int i=0;i<n;i++) {
x[i] = x[i] + alpha*d_buffer[i];
}
// Calculate new residual
for (int i=0;i<n;i++) {
r_buffer[i] = r_buffer[i] - alpha*Ad_buffer[i];
}
// Calculate beta
beta = 0;
for (int i=0;i<n;i++) {
beta = beta + r_buffer[i]*r_buffer[i];
}
beta = beta/rtr_buffer;
// Calculate new direction vector
for (int i=0;i<n;i++) {
d_buffer[i] = r_buffer[i] + beta*d_buffer[i];
}
*total_counter = *total_counter+1;
if(*total_counter >= iteration_cutoff) {
// Set total_counter to -1, this indicates failure
*total_counter = -1;
break;
}
}
// Store Wisdom
fftw_export_wisdom_to_filename("cd.wis");
// Free fft alloc'd memory and plans
fftw_destroy_plan(plan_forward_d_buffer);
fftw_destroy_plan(plan_forward_A_vec);
fftw_destroy_plan(plan_backward_Ad_buffer);
fftw_free(A_vec_fft);
fftw_free(d_buffer_fft);
};
这是matlab部分:
% Take FFT of A_vec.
A_vec_fft = fft(A_vec); % Take fft once
% Find initial direction - this is the initial residual
x = zeros(n,1); % search direction
r = zeros(n,1); % residual
d = zeros(n+(n-2),1); % search direction; pad to allow FFT
for i = 1:n
d(i) = b(i);
r(i) = b(i);
end
% Enter CG iterations
total_counter = 0;
rtr_buffer = 0;
alpha = 0;
beta = 0;
Ad_buffer = zeros(n+(n-2),1); % This holds the product of A*d - calculate this once per iteration and using FFT; only 1:n is used
norm_ro = norm(r);
while(norm(r)/norm_ro > 10^-6)
% Find Ad - use fft
Ad_buffer = ifft(A_vec_fft.*fft(d));
% Calculate rtr_buffer
rtr_buffer = r'*r;
% Calculate alpha
alpha = rtr_buffer/(d(1:n)'*Ad_buffer(1:n));
% Calculate new x
x = x + alpha*d(1:n);
% Calculate new residual
r = r - alpha*Ad_buffer(1:n);
% Calculate beta
beta = r'*r/(rtr_buffer);
% Calculate new direction vector
d(1:n) = r + beta*d(1:n);
% Update counter
total_counter = total_counter+1;
end
在时间方面,对于 N = 50000 和 b = 1:n,mex 大约需要 10.5 秒,matlab 大约需要 4.4 秒。我正在使用 R2011b。谢谢