我正在尝试将 c++ 中的 for 循环与 openMP 并行化。我有许多需要用 zheevr 取幂的矩阵(Matrix 类)。该实现提供了数据竞赛。
并行化的 for 循环是
/* in main */
#pragma omp parallel for shared(Aexp_omp, A, Z, W) private(j) firstprivate(idt)
for (j=0; j< nMat; ++j) {
Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
}
这里, Aexp_omp
, A
,Z
都是类的数组Matrix
。确切的行似乎是对 zheevr in 的第二次调用EigenMethod
。当 #pragma omp critical 没有被注释掉时,代码会正确执行(但是在这里放置 pragma omp critical 似乎违背了并行化此代码的目的)。
我不明白数据竞争来自哪里,如果我没记错的话,循环中清除的所有变量都是线程私有的?
关于为什么会发生这种情况的任何见解?我是否错误地清除了pragma omp parallel for
代码的一部分?
非常感谢您的帮助。
编辑:完整代码
为了提供所有信息,这里是完整的代码(有点长,我很抱歉)。它使用makefile在我的机器上编译
omp_matrix: omp_matrix.cpp
g++ -fopenmp -lgsl -lgslcblas -llapack -lblas -lm -o omp_matrix omp_matrix.cpp
由于我怀疑数据竞争,它在运行时会出现随机行为。代码必须是文件,一个带有 main(和一些额外的命名空间)和 Matrix 类的文件。
/* omp_matrix.cpp */
#include <iostream>
#include <omp.h>
#include <cstdio>
#include <ctime>
#include <string>
#include <cmath>
#include <vector>
#include <complex>
#include <limits>
#include <cstdlib>
const char Trans[] = {'N','T','C'};
const char UpLo[] = {'U','L'};
const char Jobz[] = {'V','N'};
const char Range[] = {'A','V','I'};
#ifdef __cplusplus
extern "C" {
#endif
void zgemm_(const char* TransA, const char* TransB, const size_t* M, const size_t* N, const size_t* K, const std::complex<double>* alpha, const std::complex<double>* A, const size_t* lda, const std::complex<double>* B, const size_t* ldb, const std::complex<double>* beta, std::complex<double>* C, size_t* ldc);
int zheevr_(const char* jobz, const char* range, const char* uplo, const size_t* n, std::complex<double>* a, size_t* lda, double* vl, double* vu, size_t* il, size_t* iu, double* abstol, size_t* m, double* w, std::complex<double>* z, size_t* ldz, int* isuppz, std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);
#ifdef __cplusplus
}
#endif
#include "Matrix.hpp"
namespace MOs {
//Identity Matrix
template <class T>
inline void Identity(Matrix<T>& I){
size_t dim = I.GetRows();
if (dim != I.GetColumns())
exit(1);
for(size_t i=0; i<dim; i++)
for(size_t j=0; j<dim; j++)
I(i,j) = i == j ? T(1) : T(0);
return;
}
// The H.O. Annilation operator
template <class T>
inline void Destroy(Matrix<T>& a){
size_t dim = a.GetRows();
if (dim != a.GetColumns())
exit(1);
for(size_t i=0; i<dim; i++)
for(size_t j=0; j<dim; j++)
a(i,j) = j == i+1 ? std::sqrt(T(j)) : T(0);
return;
}
//Take the Hermitian conjugate of a complex Matrix
inline Matrix<std::complex<double> > Dagger(const Matrix<std::complex<double> >& A){
size_t cols = A.GetColumns(), rows= A.GetRows();
Matrix<std::complex<double> > temp(cols,rows);
for (size_t i=0; i < rows; i++)
for(size_t j=0; j < cols; j++)
temp(j,i) = conj(A(i,j));
return temp;
}
template <class T>
inline Matrix<T> TensorProduct(const Matrix<T>& A, const Matrix<T>& B){
size_t rows1 = A.GetRows(), rows2 = B.GetRows(), cols1 = A.GetColumns(), cols2 = B.GetColumns();
size_t rows_new = rows1*rows2, cols_new = cols1*cols2, n, m;
Matrix<T> temp(rows_new,cols_new);
for(size_t i=0; i<rows1; i++)
for(size_t j=0; j<cols1; j++)
for(size_t p=0; p<rows2; p++)
for(size_t q=0; q<cols2; q++){
n = i*rows2+p;
m = j*cols2+q; // 0 (0 + 1) + 1*dimb=2 + (0 + 1 ) (j*dimb+q)
temp(n,m) = A(i,j)*B(p,q);
}
return temp;
}
}
Matrix<std::complex<double> > EigenMethod(const Matrix<std::complex<double> >& Ain, std::complex<double> x, Matrix<std::complex<double> >* Z=NULL, double *W=NULL );
using namespace std;
int main(int argc, char const *argv[]) {
// dim is the dimension of the system
size_t dimQ = 2;
size_t dim = dimQ*dimQ*dimQ;
// Define the matrices used to build the matrix to be exponentiated
Matrix<complex<double> > o(dimQ,dimQ), od(dimQ,dimQ), nq(dimQ,dimQ);
MOs::Destroy(o);
od = MOs::Dagger(o); nq=od*o;
Matrix<complex<double> > IdentQ(dimQ, dimQ), Hd(dim, dim);
MOs::Identity(IdentQ);
Hd = 0.170*MOs::TensorProduct(MOs::TensorProduct(od,IdentQ),o) + 0.124* MOs::TensorProduct(MOs::TensorProduct(IdentQ,od),o);
complex<double> idt = complex<double>(0.0,0.2); // time unit multiplied by i
size_t nMat = 2;
double* c = new double[nMat];
c[0] = 0.134;
c[1] = -0.326;
// Decalre matrices and allocate memory for them
Matrix<complex<double> >* A = new Matrix<complex<double> >[nMat];
Matrix<complex<double> >* Aexp = new Matrix<complex<double> >[nMat];
Matrix<complex<double> >* Aexp_omp = new Matrix<complex<double> >[nMat];
for (size_t k = 0; k < nMat; ++k)
A[k] = c[k]*Hd;
// METHOD 1: Serial. Exponentiate one matrix after another
double start_s = omp_get_wtime();
for (size_t k = 0; k < nMat; ++k)
Aexp[k] = EigenMethod(A[k],-idt);
double end_s = omp_get_wtime();
cout << "Aexp[0] = \n" << Aexp[0] << endl;
cout << "Aexp[1] = \n" << Aexp[1] << endl;
// METHOD 2: Parallel. Exponentiate all matrices at the same time
// THIS DOES NOT WORK. Data race condition in EigenMetod?
Matrix<complex<double> >* Z = new Matrix<complex<double> >[nMat];
double** W = new double*[nMat];
for (size_t k = 0; k < nMat; ++k) {
Z[k].initialize(dim,dim);
W[k] = new double[dim];
}
size_t j;
double start_p1 = omp_get_wtime( );
#pragma omp parallel for shared(Aexp_omp,A,Z,W) private(j) firstprivate(idt)
for (j=0; j< nMat; ++j) {
Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
}
double end_p1 = omp_get_wtime( );
cout << "Done" << endl;
cout << "Aexp_omp[0] = \n" << Aexp_omp[0] << endl;
cout << "Aexp_omp[1] = \n" << Aexp_omp[1] << endl;
cout << "Serial time: " << end_s - start_s << ", parallel time method 2: " << end_p1-start_p1 << endl;
delete [] c;
delete [] A;
delete [] Aexp;
delete [] Aexp_omp;
} /* end of int main */
Matrix<complex<double> > EigenMethod(const Matrix<complex<double> >& A, complex<double> x, Matrix<complex<double> >* Z, double *W){
bool returnZ(Z!=NULL), returnW(W!=NULL); // flags to see if the function received valid Z and W adresses
//if (MOs::IsHermitian(A)==0) UFs::MyError("Routine ExpM::EigenMethod: The input Matrix is not Hermitian.");
size_t N = A.GetRows(), LDA=A.GetLD(), IL, IU, M, LDZ=N;
double VL, VU, ABSTOL=numeric_limits<double>::min();
complex<double>* WORK;
double* RWORK;
int* IWORK;
//finding the optimal work sizes
int LWORK=-1, LRWORK=-1, LIWORK=-1, ok=0;
WORK = new std::complex<double>[1];
RWORK = new double[1];
IWORK = new int[1];
// ZHEEV computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian Matrix A.
// Jobz[0] = V i.e. compute eigenvalues and eigen-vectors, Range[0] = A i.e. all eigen values will be found, UpLo[1] = L i.e. lower triangle of matrix is stored
zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,NULL,&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,NULL,NULL,&LDZ,NULL,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);
//Setting the optimal workspace
LWORK = (int)real(WORK[0]);
LRWORK = (int)RWORK[0];
LIWORK = IWORK[0];
delete [] WORK;
delete [] RWORK;
delete [] IWORK;
//Running the algorithim
if (!returnZ) Z = new Matrix<std::complex<double> >(LDZ,LDZ);
if (!returnW) W = new double[LDA];
Matrix<std::complex<double> > temp(A);
int *ISUPPZ;
ISUPPZ = new int[2*N];
WORK = new std::complex<double>[LWORK];
RWORK = new double[LRWORK];
IWORK = new int[LIWORK];
//#pragma omp critical
//{
// THIS LINE IS DOING SOMETHING WRONG
zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,temp.GetMat(),&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,W,Z->GetMat(),&LDZ,ISUPPZ,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);
//}
if (ok!=0) {
cerr << "Routine EigenMethod: An error occured in geting the diagonalization of A" << endl;
exit(1);
}
//Getting the Diag*Dagger(Z), this is where the exponentiation is done using the eigenvalues
for (size_t i=0; i < N; i++)
for(size_t j=0; j < N; j++)
temp(i,j) = exp(x*W[i])*conj((*Z)(j,i));
temp = (*Z)*temp;
if(!returnW) delete [] W;
if(!returnZ) delete Z;
delete [] ISUPPZ;
delete [] WORK;
delete [] RWORK;
delete [] IWORK;
return temp;
}
接下来是类矩阵
#ifndef Matrix_h
#define Matrix_h
enum OutputStyle {Column, List, Array};
template <class T>
class Matrix {
//friend functions get to use the private varibles of the class as well as have different classes as inputs
template <class S>
friend std::ostream& operator << (std::ostream& output, const Matrix<S>& A){
for (size_t i=0; i<A.rows_; i++){
for (size_t j=0; j<A.cols_; j++){
output << A.mat_[j*A.rows_ +i] << "\t";
}
output << std::endl;
}
return output;
}
// overloads beta*A
template <class S1, class S2>
friend Matrix<S1> operator*(const S2& beta, const Matrix<S1>& A){
size_t rows= A.rows_, cols = A.cols_;
Matrix<S1> temp(rows,cols);
for (size_t i=0; i < rows; i++)
for(size_t j=0; j < cols; j++)
temp(i,j) = beta*A(i,j);
return temp;
}
friend Matrix<std::complex<double> > operator*(const Matrix<std::complex<double> >& A, const Matrix<std::complex<double> >& B) {
static Matrix<std::complex<double> > C;
C.initialize(A.rows_,B.cols_);
std::complex<double> alpha =1.0, beta =0.0;
zgemm_(&Trans[0], &Trans[0], &A.rows_, &B.cols_, &A.cols_, &alpha, A.mat_, &A.LD_, B.mat_, &B.LD_, &beta, C.mat_, &C.LD_);
return C;
}
public:
//Construtors
Matrix() : rows_(0), cols_(0), size_(0), LD_(0), outputstyle_(Array){ mat_=0;}
Matrix(size_t rows, size_t cols) : rows_(rows), cols_(cols), size_(rows*cols), LD_(rows), outputstyle_(Array), mat_(new T [size_]){}
Matrix(const Matrix<T>& rhs) : rows_(rhs.rows_), cols_(rhs.cols_), size_(rhs.size_), LD_(rows_), outputstyle_(rhs.outputstyle_), mat_(new T [size_]) {
for(size_t p = 0; p < size_; p++)
mat_[p] = rhs.mat_[p];
}
//Initialize an empty Matrix() to Matrix(size_t rows, size_t cols)
inline void initialize(size_t rows, size_t cols) {
if (rows_ != rows || cols_ != cols ) {
if (mat_ != 0) delete [] (mat_);
rows_ = rows;
cols_ = cols;
size_ = rows_*cols_;
LD_ = rows;
mat_ = new T[size_];
}
}
//Destructor
virtual ~Matrix() {if (mat_ != 0) delete[] (mat_);}
//Assignment operator
inline Matrix<T>& operator=(const Matrix<T>& rhs){
if (rows_ != rhs.rows_ || cols_ != rhs.cols_ ) {
if (mat_ != 0) delete [] (mat_);
rows_=rhs.rows_;
cols_=rhs.cols_;
size_=rows_*cols_;
LD_=rhs.LD_;
mat_= new T[size_];
}
for (size_t p=0; p < size_; p++)
mat_[p] = T(rhs.mat_[p]);
return *this;
}
inline T& operator()(size_t i, size_t j){
if (i >= rows_ || j >= cols_) exit(1);
return mat_[j*rows_+i];
}
inline T operator()(size_t i, size_t j) const{
if (i >= rows_ || j >= cols_) exit(1);
return mat_[j*rows_+i];
}
//overloading functions.
inline Matrix<T> operator+(const Matrix<T>& A){
if(rows_ != A.rows_ || cols_ != A.cols_)
exit(1);
Matrix<T> temp(rows_,cols_);
for (unsigned int p=0; p < size_; p++)
temp.mat_[p] = mat_[p] + A.mat_[p];
return temp;
}
//Member Functions
inline size_t GetColumns() const { return cols_; }
inline size_t GetRows() const { return rows_; }
inline size_t GetLD() const { return LD_; }
inline size_t size() const { return size_; }
T* GetMat() const { return mat_; }
protected:
size_t rows_, cols_, size_, LD_;
//rows_ and cols_ are the rows and columns of the Matrix
//size_ = rows*colums dimensions of the vector representation
//LD is the leading dimeonsion and for Column major order is in general eqaul to rows
enum OutputStyle outputstyle_;
T * mat_;
};
#endif /* Matrix_h */