0

实际上,我有一个用于矩阵乘法的可视 c# 程序,它使用 i3 处理器中的所有逻辑内核,但我想知道如何在 c 中实现及其解释。程序是,

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace MatrixMultiplication
{
    internal class Program
    {
        #region Sequential_Loop

        private static void MultiplyMatricesSequential(double[,] matA, double[,] matB,
                                                       double[,] result)
        {
            int matACols = matA.GetLength(1);
            int matBCols = matB.GetLength(1);
            int matARows = matA.GetLength(0);

            for (int i = 0; i < matARows; i++)
            {
                for (int j = 0; j < matBCols; j++)
                {
                    for (int k = 0; k < matACols; k++)
                    {
                        result[i, j] += matA[i, k]*matB[k, j];
                    }
                }
            }
        }

        #endregion

        #region Parallel_Loop

        private static void MultiplyMatricesParallel(double[,] matA, double[,] matB, double[,] result)
        {
            int matACols = matA.GetLength(1);
            int matBCols = matB.GetLength(1);
            int matARows = matA.GetLength(0);

            // A basic matrix multiplication.
            // Parallelize the outer loop to partition the source array by rows.
            Parallel.For(0, matARows, i =>
                                          {
                                              for (int j = 0; j < matBCols; j++)
                                              {
                                                  // Use a temporary to improve parallel performance.
                                                  double temp = 0;
                                                  for (int k = 0; k < matACols; k++)
                                                  {
                                                      temp += matA[i, k]*matB[k, j];
                                                  }
                                                  result[i, j] = temp;
                                              }
                                          }); // Parallel.For
        }

        #endregion

        #region Main

        private static void Main(string[] args)
        {
            // Set up matrices. Use small values to better view 
            // result matrix. Increase the counts to see greater 
            // speedup in the parallel loop vs. the sequential loop.
            int colCount = 800;
            int rowCount = 800;
            int colCount2 = 800;
            double[,] m1 = InitializeMatrix(rowCount, colCount);
            double[,] m2 = InitializeMatrix(colCount, colCount2);
            var result = new double[rowCount,colCount2];

            // First do the sequential version.
            Console.WriteLine("Executing sequential loop...");
            var stopwatch = new Stopwatch();
            stopwatch.Start();

            MultiplyMatricesSequential(m1, m2, result);
            stopwatch.Stop();
            Console.WriteLine("Sequential loop time in milliseconds: {0}", stopwatch.ElapsedMilliseconds);

            // For the skeptics.
            OfferToPrint(rowCount, colCount2, result);

            // Reset timer and results matrix. 
            stopwatch.Reset();
            result = new double[rowCount,colCount2];

            // Do the parallel loop.
            Console.WriteLine("Executing parallel loop...");
            stopwatch.Start();
            MultiplyMatricesParallel(m1, m2, result);
            stopwatch.Stop();
            Console.WriteLine("Parallel loop time in milliseconds: {0}", stopwatch.ElapsedMilliseconds);
            OfferToPrint(rowCount, colCount2, result);

            // Keep the console window open in debug mode.
            Console.WriteLine("Press any key to exit.");
            Console.ReadKey();
        }

        #endregion

        #region Helper_Methods

        private static double[,] InitializeMatrix(int rows, int cols)
        {
            var matrix = new double[rows,cols];

            var r = new Random();
            for (int i = 0; i < rows; i++)
            {
                for (int j = 0; j < cols; j++)
                {
                    matrix[i, j] = r.Next(100);
                }
            }
            return matrix;
        }

        private static void OfferToPrint(int rowCount, int colCount, double[,] matrix)
        {
            Console.WriteLine("Computation complete. Print results? y/n");
            char c = Console.ReadKey().KeyChar;
            if (c == 'y' || c == 'Y')
            {
                Console.WindowWidth = 180;
                Console.WriteLine();
                for (int x = 0; x < rowCount; x++)
                {
                    Console.WriteLine("ROW {0}: ", x);
                    for (int y = 0; y < colCount; y++)
                    {
                        Console.Write("{0:#.##} ", matrix[x, y]);
                    }
                    Console.WriteLine();
                }
            }
        }

        #endregion
    }
}
4

1 回答 1

1

矩阵乘法是一个“令人愉快的并行”问题,即它可以简单地并行化,因为结果数组中的每个单元格都独立于任何其他单元格的值。

您的代码中的并行解决方案可以通过对不在行上的单元格进行分区来进一步并行化,例如通过将结果矩阵中的每个单元格编号为 j + i*matBCols (注意我没有检查此代码,我可能已经切换了一些索引,如果发现错误,请发表评论):

private static void MultiplyMatricesParallel(double[,] matA, double[,] matB, double[,] result)
{
    int matACols = matA.GetLength(1);
    int matBCols = matB.GetLength(1);
    int matARows = matA.GetLength(0);

    // A basic matrix multiplication.
    // Parallelize the outer loop to partition the source array by rows.
    Parallel.For(0, matARows*matBCols, ij =>
                                  {
                                      i = ij / matBCols;
                                      j = ij % matBCols;

                                      // Use a temporary to improve parallel performance.
                                      double temp = 0;
                                      for (int k = 0; k < matACols; k++)
                                      {
                                          temp += matA[i, k]*matB[k, j];
                                      }
                                      result[i, j] = temp;

                                  }); // Parallel.For
}

在 C 中进行这项工作的简单方法是为结果矩阵中的每个单元格创建一个线程,但这将是浪费和次优的,这是因为 Parallel.For 实际上是在场景下进行一些计算以优化计算速度。

在最好的情况下,我们希望对阵列进行分区,以便每个核心获得阵列乘法的同等份额。在包含 Parallel.For 每个单元计算(在我的示例中)或行计算(在原始示例中)的任务并行库 (TPL) 中,都将其转换为一个任务。Parallel.For 考虑了内核数量,并为每个内核分配工作线程,试图保持内核之间的工作平衡和最少的线程数量。在具有 2 个内核的完美场景中,这将是两个线程,每个线程都有一半的矩阵乘法。然而,TPL 内置了动态平衡。

例如,如果一个核心变得繁忙(例如运行另一个进程)或一个工作线程变得阻塞(例如等待来自虚拟内存的块),那么 TPL 将产生更多线程并重新平衡工作负载。

你可以在这里阅读。

我想说的是,在 C 中复制 Parallel.For 的工作并非易事。对于矩阵乘法,您可以通过放弃任务的动态重新分配来获得良好的传真。只需为每个线程创建与 CPU 内核一样多的线程,并在它们之间平均划分矩阵。

在 Windows 中,您可以通过以下方式获取核心数量:(GetSystemInfo或参见此处CreateThread了解其他选项),并使用和创建具有核心亲和性的线程SetThreadAffinityMask

于 2013-10-15T05:09:44.910 回答