我已经为矩阵向量乘法编写了代码。矩阵根据线程数被划分为行块,每个块乘以向量,向量存储在线程私有的数组中。但是我的加速很差。对于大小为 16 X 16 的矩阵,它小于 1。
这可能是因为我将我的矩阵和向量在外部声明为共享变量,并且当每个线程尝试从矩阵和向量中读取值时可能会导致竞争条件/错误共享?
我对虚假共享和竞争条件有点困惑。
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>
#define SIZE 128 // The size should be divisible by thenumber of threads
int main(int argc, char *argv[]) {
int thread_count = strtol(argv[1],NULL,10);
// Declare the variables
int i,j;
long A[SIZE][SIZE], b[SIZE],V[SIZE]={0};
//long Vect[SIZE]={0};
double start, end;
// Generate a matrix of size mxm
for (i=0; i<SIZE; i++)
{ for (j=0; j<SIZE; j++)
A[i][j] = i+j;
}
printf("The Matrix is:\n");
// Print the Matrix
for (i=0; i<SIZE; i++)
{ for (j=0; j<SIZE; j++)
{
printf("%12ld", A[i][j]);
}
printf("\n");
}
// Generate a vector of size m
for (i=0; i<SIZE; i++)
b[i] = i;
printf("The vector is: \n");
// Print a vector
for (i=0; i<SIZE; i++)
printf("%12ld\n", b[i]);
start = omp_get_wtime();
//omp_set_num_threads(NUM_THREADS);
#pragma omp parallel num_threads(thread_count)
{
int i,j,k, id, nthrds;
long Vect[SIZE]={0};
id = omp_get_thread_num();
nthrds = omp_get_num_threads();
for (i=id*SIZE/nthrds; i<(id*SIZE/nthrds + SIZE/nthrds); i++)
{ Vect[i] = 0;
{
for (j=0; j<SIZE; j++)
Vect[i] += A[i][j]*b[j];
}
}
#pragma omp critical
{
for (k=0; k<SIZE; k++)
V[k] += Vect[k];
}
}
end = omp_get_wtime();
printf("The vector obtained after multiplication is:\n");
for (i=0; i<SIZE; i++)
printf("%12ld\n", V[i]);
printf("The time taken for calculation is: %lf\n", end - start);
return 0;
}