有三个不同的事情,正弦四非,去omp parallel
:
A ) - 算法必须是正确的
B ) - 算法必须有效地使用资源
C ) - 算法在附加间接成本上的花费要少于从运行中获得的费用omp
修复A
) 并在B
) 和C
) 上进行一些小实验后:
人们可能很快就会意识到,在B ) 和C ) 下展示的rand()
处理成本要高得多,任何从天真或更智能的矩阵覆盖映射到资源(这里是单一引擎,因为任何类型的并发都有)的任何好处为了在其所有并发使用中重新传播rand()
随机源的新状态,其成本远远超过它在并发操作的矩阵覆盖中所能提供的成本(加上天真的缓存线不知道矩阵的交叉不会帮助要么)。
最好的结果(没有优化myOMP_SCHEDULE_CHUNKS
):
/*
-O3 private( ..., i, j ) omp single
MATRIX.RAND time: 3 191 [us] 3 446 [us] 3 444 [us] 3 384 [us] 3 173 [us]
MATRIX.SORT time: 96 270 [us] 98 401 [us] 98 423 [us] 95 911 [us] 101 019 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
全球视野:
/* COMPILE:: -fopenmp
*
* MAY SHELL:: $ export OMP_NUM_THREADS = 3
* $ export OMP_DISPLAY_ENV = 1
* https://stackoverflow.com/questions/47495916/how-to-parallelize-matrix-sorting-for-loop
*/
#include <omp.h>
#define myOMP_SCHEDULE_CHUNKS 5 // OMP schedule( static, chunk ) ~ better cache-line depletion
#define myOMP_THREADS 4
/*
$ ./OMP_matrix_SORT
MATRIX.RAND time: 187 744 [us] 234 729 [us] 174 535 [us] 254 273 [us] 122 983 [us]
MATRIX.SORT time: 1 911 310 [us] 1 898 494 [us] 2 026 455 [us] 1 978 631 [us] 1 911 231 [us] @( 3 ) = OMP_THREADS
MATRIX.RAND time: 6 166 [us] 6 977 [us] 6 722 [us]
MATRIX.SORT time: 2 448 608 [us] 2 264 572 [us] 2 355 366 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 918 [us] 17 551 [us] 7 194 [us]
MATRIX.SORT time: 1 774 883 [us] 1 809 002 [us] 1 786 494 [us] @( 1 ) = OMP_THREADS
MATRIX.RAND time: 7 321 [us] 7 337 [us] 6 698 [us]
MATRIX.SORT time: 2 152 945 [us] 1 900 149 [us] 1 883 638 [us] @( 1 ) = OMP_THREADS
MATRIX.RAND time: 54 198 [us] 67 290 [us] 52 123 [us]
MATRIX.SORT time: 759 248 [us] 769 580 [us] 760 759 [us] 812 875 [us] @( 3 ) = OMP_THREADS
MATRIX.RAND time: 7 054 [us] 6 414 [us] 6 435 [us] 6 426 [us]
MATRIX.SORT time: 687 021 [us] 760 917 [us] 674 496 [us] 705 629 [us] @( 3 ) = OMP_THREADS
-O3
MATRIX.RAND time: 5 890 [us] 6 147 [us] 6 081 [us] 5 796 [us] 6 143 [us]
MATRIX.SORT time: 148 381 [us] 152 664 [us] 184 922 [us] 155 236 [us] 169 442 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
-O3 private( ..., i, j )
MATRIX.RAND time: 6 410 [us] 6 111 [us] 6 903 [us] 5 831 [us] 6 224 [us]
MATRIX.SORT time: 129 787 [us] 129 836 [us] 195 299 [us] 136 111 [us] 161 117 [us] @( 4 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 349 [us] 6 532 [us] 6 104 [us] 6 213 [us]
MATRIX.SORT time: 151 202 [us] 152 542 [us] 160 403 [us] 180 375 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 745 [us] 5 834 [us] 5 791 [us] 7 164 [us] 6 535 [us]
MATRIX.SORT time: 214 590 [us] 214 563 [us] 209 610 [us] 205 940 [us] 230 787 [us] @( 2 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
long GetTime() { // double GetTime()
struct timeval clock;
gettimeofday( &clock, NULL );
return (long)clock.tv_sec * 1000000 // in [us] ( go (long) instead of float )
+ (long)clock.tv_usec; //
/* double rez = (double)clock.tv_sec
* + (double)clock.tv_usec / 1000000;
* // + (double)clock.tv_usec * 0.000001; // NEVER DIV
return rez;
*/
}
void genMatrix( int *A, int N, int M ) { // Generate matrix
register int i, iM, j;
#pragma omp parallel
for ( i = 0; i < N; i++ ) {
iM = i * M;
/* for ( register int i = 0; i < N; i++ ) {
register int iM = i * M;
*/
// #pragma omp parallel // 234 729 [us]
// for ( register int j = 0; j < M; j++ )
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) // 122 983 [us] @( 3 ) = OMP_THREADS ~~~ v/s 6 698 [us] @( 1 ) = OMP_THREADS
// // v/s 5 796 [us] @ NON-OMP
#pragma omp single // ~~ 3 191 [us]
for ( int j = 0; j < M; j++ )
A[iM +j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
// A[i*M+j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
}
}
int main() {
srand( time( NULL ) );
int N = 480; // 4800; ~ 100x faster outputs
int M = 600; // 6000;
int Mb1 = M - 1;
int *A = new int[N*M];
omp_set_num_threads( myOMP_THREADS );
long long int t_start = GetTime();
genMatrix( A, N, M );
long long int t_load = GetTime();
printf( "MATRIX.RAND time: %lld [us]\n", t_load - t_start );
register int thisB,
this1,
next1,
t, i, j;
t_start = GetTime(); // double t_start = GetTime();
// for ( register int k = 0; k < N; k++ ) {
// #pragma omp parallel
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) // schedule( type, chunk ):
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t ) // schedule( type, chunk ):
#pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t, i, j ) // schedule( type, chunk ):
for ( int k = 0; k < N; k++ ) {
thisB = k*M;
if ( omp_get_num_threads() != myOMP_THREADS ) {
printf( "INF: myOMP_THREADS ( == %d ) do not match the number of executed ones ( == %d ) ", myOMP_THREADS, omp_get_num_threads() );
}
//--------------------------------------------------// -------------SORT ROW-k
// for ( register int i = 0; i < Mb1; i++ ) { // < M-1; i++ ) {
// for ( register int j = 0; j < Mb1; j++ ) { // < M-1; j++ ) {
for ( i = 0; i < Mb1; i++ ) {
for ( j = 0; j < Mb1; j++ ) {
this1 = thisB + j,
next1 = this1 + 1;
if ( A[this1] > A[next1] ){ // A[k*M+j ] > A[k*M+j+1] ) {
t = A[this1]; // t = A[k*M+j ];
A[this1] = A[next1]; // A[k*M+j ] = A[k*M+j+1];
A[next1] = t; // A[k*M+j+1] = t;
}
}
}
//--------------------------------------------------// -------------SORT ROW-k
}
t_load = GetTime(); // double t_load = GetTime();
/* Print matrix
//
for ( int i = 0; i < N; i++ ) {
for ( int j = 0; j < M; j++ ) {
printf( "%3d", A[i*M+j] );
}
printf("\n");
}
//
*/
printf( "MATRIX.SORT time: %lld [us] @( %d ) = OMP_THREADS in [ %d ] OMP_SCHEDULE_CHUNKS\n",
t_load - t_start,
myOMP_THREADS,
myOMP_SCHEDULE_CHUNKS
);
// system( "pause" );
}