基于 Floris 的快速功能,我尝试查看是否可以找到一种方法来使用 OpenMP 找到更快的解决方案。我想出了两个函数foo_v2
,foo_v3
它们对于较大的数组foo_v2
更快,与密度无关foo_v3
更快,并且对于更稀疏的数组更快。该函数foo_v2
本质上创建了一个具有宽度的二维数组N*nthreads
以及一个countsa
包含每个线程的计数的数组。用代码更好地解释这一点。下面的代码将遍历所有写入 T 的元素。
for(int ithread=0; ithread<nthreads; ithread++) {
for(int i=0; i<counta[ithread]; i++) {
T[ithread*N/nthread + i]
}
}
该函数foo_v3
根据要求创建一维数组。在所有情况下N
都必须非常大才能克服 OpenMP 开销。下面的代码默认为 256MB,密度M
约为 10%。在我的 4 核 Sandy Bridge 系统上,OpenMP 函数的速度都快了 2 倍以上。如果将密度设置为 50% foo_v2
,则速度仍会提高大约 2 倍,但foo_v3
不再更快。
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int foo_v1(int *M, int *T, const int N) {
int count = 0;
for(int i = 0; i<N; i++) {
T[count] = i;
count += M[i];
}
return count;
}
int foo_v2(int *M, int *T, int *&counta, const int N) {
int nthreads;
#pragma omp parallel
{
nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
counta = new int[nthreads];
int count_private = 0;
#pragma omp for
for(int i = 0; i<N; i++) {
T[ithread*N/nthreads + count_private] = i;
count_private += M[i];
}
counta[ithread] = count_private;
}
return nthreads;
}
int foo_v3(int *M, int *T, const int N) {
int count = 0;
int *counta = 0;
#pragma omp parallel reduction(+:count)
{
const int nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
{
counta = new int[nthreads+1];
counta[0] = 0;
}
int *Tprivate = new int[N/nthreads];
int count_private = 0;
#pragma omp for nowait
for(int i = 0; i<N; i++) {
Tprivate[count_private] = i;
count_private += M[i];
}
counta[ithread+1] = count_private;
count += count_private;
#pragma omp barrier
int offset = 0;
for(int i=0; i<(ithread+1); i++) {
offset += counta[i];
}
for(int i=0; i<count_private; i++) {
T[offset + i] = Tprivate[i];
}
delete[] Tprivate;
}
delete[] counta;
return count;
}
void compare(const int *T1, const int *T2, const int N, const int count, const int *counta, const int nthreads) {
int diff = 0;
int n = 0;
for(int ithread=0; ithread<nthreads; ithread++) {
for(int i=0; i<counta[ithread]; i++) {
int i2 = N*ithread/nthreads+i;
//printf("%d %d\n", T1[n], T2[i2]);
int tmp = T1[n++] - T2[i2];
if(tmp<0) tmp*=-1;
diff += tmp;
}
}
printf("diff %d\n", diff);
}
void compare_v2(const int *T1, const int *T2, const int count) {
int diff = 0;
int n = 0;
for(int i=0; i<count; i++) {
int tmp = T1[i] - T2[i];
//if(tmp!=0) printf("%i %d %d\n", i, T1[i], T2[i]);
if(tmp<0) tmp*=-1;
diff += tmp;
}
printf("diff %d\n", diff);
}
int main() {
const int N = 1 << 26;
printf("%f MB\n", 4.0*N/1024/1024);
int *M = new int[N];
int *T1 = new int[N];
int *T2 = new int[N];
int *T3 = new int[N];
int *counta;
double dtime;
for(int i=0; i<N; i++) {
M[i] = ((rand()%10)==0);
}
//int repeat = 10000;
int repeat = 1;
int count1, count2;
int nthreads;
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count1 = foo_v1(M, T1, N);
dtime = omp_get_wtime() - dtime;
printf("time v1 %f\n", dtime);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) nthreads = foo_v2(M, T2, counta, N);
dtime = omp_get_wtime() - dtime;
printf("time v2 %f\n", dtime);
compare(T1, T2, N, count1, counta, nthreads);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count2 = foo_v3(M, T3, N);
dtime = omp_get_wtime() - dtime;
printf("time v2 %f\n", dtime);
printf("count1 %d, count2 %d\n", count1, count2);
compare_v2(T1, T3, count1);
}