c++ - 如何并行化循环的矩阵排序？

Question

我正在尝试for(){...}使用 OpenMP 并行化一个循环，该循环采用N“表”的许多“行”N*M并按升序对每一行进行排序。

我添加了#pragma omp parallel,#pragma omp for schedule 指令，但没有看到任何变化，就好像它什么也没做一样。

这是完整的程序：

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>

double GetTime() {
   struct timeval clock;
   gettimeofday(&clock, NULL);
   double rez = (double)clock.tv_sec+(double)clock.tv_usec/1000000;
   return rez;
}

void genMatrix(int *A, int N, int M) {
   // Generate matrix
   for (int i=0; i<N; i++) {
      for (int j=0; j<M; j++) A[i*M+j] = (int)((double)rand()/RAND_MAX*99) + 1;
   }
}

int main() {
   srand(time(NULL));
   int N = 4800;
   int M = 6000;
   int *A = new int[N*M];
   int t, n;
   genMatrix(A, N, M);

      double t_start = GetTime();

       #pragma omp parallel
{
    #pragma omp for schedule     
   for (int k=0; k<N; k++) {
      for (int i=0; i<M-1; i++) {
         for (int j=0; j<M-1; j++) {
            if (A[k*M+j] > A[k*M+j+1]) {
               t = A[k*M+j];
               A[k*M+j] = A[k*M+j+1];
               A[k*M+j+1] = t;
}}}}}
             double t_load = GetTime();
    //    Print matrix

  //  for (int i=0; i<N; i++) {
    //   for (int j=0; j<M; j++) {
      //    printf("%3d", A[i*M+j]);
     //  }
      // printf("\n");
   // }
       printf("Load time: %.2f\n", t_load - t_start);
    system("pause");
}

出了什么问题，在这种情况下我应该如何使用 OpenMP 添加并行化？

另外，不知道为什么，但是当尝试A使用大数字
（如int N = 480;int M = 600;）打印矩阵时，某些值未排序。

是印刷问题吗？

score 1 · Accepted Answer

有三个不同的事情，正弦四非，去`omp parallel`：

A ) - 算法必须是正确的
B ) - 算法必须有效地使用资源
C ) - 算法在附加间接成本上的花费要少于从运行中获得的费用omp

修复`A`) 并在`B`) 和`C`) 上进行一些小实验后：

人们可能很快就会意识到，在B ) 和C ) 下展示的rand()处理成本要高得多，任何从天真或更智能的矩阵覆盖映射到资源（这里是单一引擎，因为任何类型的并发都有）的任何好处为了在其所有并发使用中重新传播rand()随机源的新状态，其成本远远超过它在并发操作的矩阵覆盖中所能提供的成本（加上天真的缓存线不知道矩阵的交叉不会帮助要么）。

最好的结果（没有优化myOMP_SCHEDULE_CHUNKS）：

/*
-O3 private( ..., i, j ) omp single
MATRIX.RAND time:     3 191 [us]     3 446 [us]    3 444 [us]     3 384 [us]      3 173 [us] 
MATRIX.SORT time:    96 270 [us]    98 401 [us]   98 423 [us]    95 911 [us]    101 019 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/

全球视野：

/* COMPILE::    -fopenmp
 * 
 * MAY SHELL::  $ export OMP_NUM_THREADS = 3
 *              $ export OMP_DISPLAY_ENV = 1
 *                                                                       https://stackoverflow.com/questions/47495916/how-to-parallelize-matrix-sorting-for-loop
 */

#include <omp.h>
#define myOMP_SCHEDULE_CHUNKS   5               // OMP schedule( static, chunk ) ~ better cache-line depletion
#define myOMP_THREADS           4

/*
$ ./OMP_matrix_SORT

MATRIX.RAND time:   187 744 [us]   234 729 [us]   174 535 [us]   254 273 [us]   122 983 [us]
MATRIX.SORT time: 1 911 310 [us] 1 898 494 [us] 2 026 455 [us] 1 978 631 [us] 1 911 231 [us] @( 3 ) = OMP_THREADS

MATRIX.RAND time:                                   6 166 [us]     6 977 [us]     6 722 [us]  
MATRIX.SORT time:                               2 448 608 [us] 2 264 572 [us] 2 355 366 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS

MATRIX.RAND time:                                   6 918 [us]    17 551 [us]     7 194 [us]
MATRIX.SORT time:                               1 774 883 [us] 1 809 002 [us] 1 786 494 [us] @( 1 ) = OMP_THREADS

MATRIX.RAND time:                                   7 321 [us]     7 337 [us]     6 698 [us]
MATRIX.SORT time:                               2 152 945 [us] 1 900 149 [us] 1 883 638 [us] @( 1 ) = OMP_THREADS

MATRIX.RAND time:                                  54 198 [us]    67 290 [us]    52 123 [us]
MATRIX.SORT time:                  759 248 [us]   769 580 [us]   760 759 [us]   812 875 [us] @( 3 ) = OMP_THREADS

MATRIX.RAND time:                    7 054 [us]     6 414 [us]     6 435 [us]     6 426 [us]
MATRIX.SORT time:                  687 021 [us]   760 917 [us]   674 496 [us]   705 629 [us] @( 3 ) = OMP_THREADS

-O3                              
MATRIX.RAND time:     5 890 [us]     6 147 [us]     6 081 [us]     5 796 [us]     6 143 [us] 
MATRIX.SORT time:   148 381 [us]   152 664 [us]   184 922 [us]   155 236 [us]   169 442 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS

-O3 private( ..., i, j )
MATRIX.RAND time:     6 410 [us]     6 111 [us]     6 903 [us]     5 831 [us]     6 224 [us]
MATRIX.SORT time:   129 787 [us]   129 836 [us]   195 299 [us]   136 111 [us]   161 117 [us] @( 4 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS

MATRIX.RAND time:                    6 349 [us]     6 532 [us]     6 104 [us]     6 213 [us]
MATRIX.SORT time:                  151 202 [us]   152 542 [us]   160 403 [us]   180 375 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS

MATRIX.RAND time:     6 745 [us]     5 834 [us]     5 791 [us]     7 164 [us]     6 535 [us] 
MATRIX.SORT time:   214 590 [us]   214 563 [us]   209 610 [us]   205 940 [us]   230 787 [us] @( 2 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS

 */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>

long GetTime() {                                    // double GetTime()
   struct timeval clock;
   gettimeofday( &clock, NULL );
   return    (long)clock.tv_sec  * 1000000          // in [us] ( go (long) instead of float )
         +   (long)clock.tv_usec;                   // 
/* double rez = (double)clock.tv_sec
 *            + (double)clock.tv_usec / 1000000;
 *         // + (double)clock.tv_usec * 0.000001;   // NEVER DIV
   return rez;
   */
}

void genMatrix( int *A, int N, int M ) {            // Generate matrix
   register int i, iM, j;

   #pragma omp parallel
   for (              i  = 0; i < N; i++ ) {
                      iM = i * M;

/* for ( register int i  = 0; i < N; i++ ) {
         register int iM = i * M;
         */
      // #pragma omp parallel                                               //  234 729 [us]
      // for ( register int j = 0; j < M; j++ )

      // #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) //  122 983 [us] @( 3 ) = OMP_THREADS ~~~  v/s   6 698 [us] @( 1 ) = OMP_THREADS
      //                                                                    //                                         v/s   5 796 [us] @    NON-OMP
         #pragma omp single                                                 // ~~ 3 191 [us]
         for (          int j = 0; j < M; j++ )
                      A[iM +j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
                   // A[i*M+j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
   }
}

int main() {
    srand( time( NULL ) );
    int  N   = 480;                                 // 4800; ~ 100x faster outputs
    int  M   = 600;                                 // 6000;
    int  Mb1 = M - 1;
    int *A   = new int[N*M];

    omp_set_num_threads( myOMP_THREADS );

    long long int  t_start = GetTime();

    genMatrix( A, N, M );

    long long int  t_load = GetTime();
    printf( "MATRIX.RAND time: %lld [us]\n", t_load - t_start );

    register int thisB,
                 this1,
                 next1,
                 t, i, j;

    t_start = GetTime();                        // double t_start = GetTime();

// for ( register int k = 0; k <  N;   k++ ) {

// #pragma omp parallel
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS )                                           // schedule( type, chunk ):
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t )         // schedule( type, chunk ):
   #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t, i, j )   // schedule( type, chunk ):
   for ( int k = 0; k <  N;   k++ ) {
       thisB =  k*M;

       if ( omp_get_num_threads() != myOMP_THREADS ) {
            printf( "INF: myOMP_THREADS ( == %d ) do not match the number of executed ones ( == %d ) ", myOMP_THREADS, omp_get_num_threads() );
       }
    //--------------------------------------------------// -------------SORT ROW-k
    // for (      register int i = 0; i <  Mb1; i++ ) { //    < M-1; i++ ) {
    //     for (  register int j = 0; j <  Mb1; j++ ) { //    < M-1; j++ ) {
       for (                   i = 0; i <  Mb1; i++ ) {
           for (               j = 0; j <  Mb1; j++ ) {      

               this1 = thisB + j,
               next1 = this1 + 1;

               if ( A[this1] >  A[next1] ){             // A[k*M+j  ] >  A[k*M+j+1] ) {
                    t         = A[this1];               // t           = A[k*M+j  ];
                    A[this1]  = A[next1];               // A[k*M+j  ]  = A[k*M+j+1];
                    A[next1]  = t;                      // A[k*M+j+1]  = t;
               }
           }
       }
    //--------------------------------------------------// -------------SORT ROW-k
   }

   t_load = GetTime();                                  // double t_load = GetTime();
/* Print matrix
  //
   for (    int i = 0; i < N; i++ ) {
      for ( int j = 0; j < M; j++ ) {
            printf( "%3d", A[i*M+j] );
      }
      printf("\n");
   }
 //
   */
   printf( "MATRIX.SORT time: %lld [us] @( %d ) = OMP_THREADS in [ %d ] OMP_SCHEDULE_CHUNKS\n",
            t_load - t_start,
            myOMP_THREADS,
            myOMP_SCHEDULE_CHUNKS
            );
// system( "pause" );
}

c++ - 如何并行化循环的矩阵排序？

1 回答 1

有三个不同的事情，正弦四非，去omp parallel：

修复A) 并在B) 和C) 上进行一些小实验后：

Related

Reference

有三个不同的事情，正弦四非，去`omp parallel`：

修复`A`) 并在`B`) 和`C`) 上进行一些小实验后：