c# - 在双核上使用 Parallel.For 获得了 5 倍的性能？

Question

我在做一些有趣的实验性计算时，遇到了一个有趣的结果：

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 19636ms
For Loop: 12612ms
Parallel.For Loop: 3835ms

这不是我所期望的。

系统：Windows 7 64、i3 2120【双核、4线程】、Visual Studio 2010。

构建：优化开启，发布模式 [无调试器]，32 位。

其次是令人失望的 64 位性能。虽然它在比率方面更符合我的预期，但它通过全面减慢来实现这一点。

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 23409ms
For Loop: 24373ms
Parallel.For Loop: 6839ms

计算很简单：对于索引 x 和 y，找到最接近的 Vector3 并将其存储在二维数组中。

如果你敢接受挑战的话，问题是：请尝试解释为什么内联的 for 循环如此缓慢。如果还能解释 64 位版本性能不佳的原因，额外加分。

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    // Console benchmark: for every (x, y) cell of a textureSize x textureSize grid, find the
    // point in a random cloud with the smallest squared distance to (x, y, 0), and time three
    // variants of that search: a loop written directly in Main, a loop that calls Computation,
    // and a Parallel.For over x that calls Computation.
    class Program
    {
        // Number of random points in the cloud that is searched for every grid cell.
        const int numPoints = 700;
        // Edge length of the square result grid; also passed to Vector3(float) as the
        // upper bound for the random coordinates.
        const int textureSize = 1024;

        // Single shared random source used by the Vector3(float) constructor.
        static Random rnd = new Random();

        // Runs the benchmark in an endless loop; each pass builds a fresh random cloud,
        // times the three variants, prints the timings, verifies that all three result
        // grids are equal, and then waits for a line of console input.
        static void Main(string[] args)
        {
            while (true)
            {
                Console.WriteLine("Starting");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud with points whose coordinates are random multiples of textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Variant 1 ("For Loop (Inline)"): the body of Computation is repeated here
                // verbatim instead of being called.
                var sw1 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                    {
                        // The query position always has z = 0.
                        var targetPos = new Vector3(x, y, 0);
                        // Seed the search with the first point, then scan the rest.
                        var nearestV3 = pointCloud[0];
                        var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                        for (int i = 1; i < numPoints; i++)
                        {
                            var currentV3 = pointCloud[i];
                            var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                            // Strict '<' keeps the earliest point when distances tie.
                            if (currentV3Distance < nearestV3Distance)
                            {
                                nearestV3 = currentV3;
                                nearestV3Distance = currentV3Distance;
                            }
                        }
                        result1[x, y] = nearestV3;
                    }
                sw1.Stop();

                // Variant 2 ("For Loop"): same nested loops, but each cell is handled by Computation.
                var sw2 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result2, x, y);
                sw2.Stop();


                // Variant 3 ("Parallel.For Loop"): the outer x loop is handed to Parallel.For.
                // The lambda captures pointCloud and result3; each invocation writes only
                // the cells result3[x, *] for its own x.
                var sw3 = Stopwatch.StartNew();

                Parallel.For(0, textureSize, x =>
                {
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result3, x, y);
                });
                sw3.Stop();

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // result1 is the reference; the other two grids must match it cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine(); Console.WriteLine();
                // Pause until the user presses Enter, then run another pass.
                Console.ReadLine();
            }
        }

        // Returns true when every cell of lhs equals the corresponding cell of rhs over the
        // textureSize x textureSize range; returns false at the first mismatch.
        // Vector3 does not override Equals, so the default struct equality is used.
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        // Finds the point in pointCloud with the smallest squared distance to (x, y, 0) and
        // stores it in result[x, y]. Reads exactly numPoints entries, so pointCloud must hold
        // at least that many elements.
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Seed with the first point, then scan the remainder.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<' keeps the earliest point when distances tie.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        // Three-component single-precision vector (three float fields).
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            // Builds a vector from explicit components.
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            // Builds a random vector: each component is rnd.NextDouble() scaled by randomDistance.
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            // Component-wise difference a - b, returned as a new vector.
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            // Squared length of this vector (no square root is taken).
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            // Despite the name, returns the SQUARED distance between this vector and point:
            // it subtracts via operator - and then calls sqrMagnitude.
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

更新：感谢Drew Marsh的努力，我们现在有了这个超级优化的版本，它内联了所有 V3 操作。

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    // Console benchmark: for every (x, y) cell of a textureSize x textureSize grid, find the
    // point in a random cloud with the smallest squared distance to (x, y, 0). Three variants
    // are timed, each in its own method (Inline, NotInline, Parallelized). In this version the
    // subtraction and the squared-length sum are written out by hand at every use site;
    // Vector3 has no operators or helper methods.
    class RevisedProgram
    {
        // Number of random points in the cloud that is searched for every grid cell.
        const int numPoints = 700;
        // Edge length of the square result grid; also passed to Vector3(float) as the
        // upper bound for the random coordinates.
        const int textureSize = 1024;

        // Single shared random source used by the Vector3(float) constructor.
        static Random rnd = new Random();

        // Runs the benchmark in an endless loop; each pass builds a fresh random cloud,
        // times the three variants, prints the timings, verifies that all three result
        // grids are equal, and then waits for a line of console input.
        static void Main(string[] args)
        {
            while (true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud with points whose coordinates are random multiples of textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each call runs its variant to completion and returns the stopped Stopwatch.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);

                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // result1 is the reference; the other two grids must match it cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Pause until the user presses Enter, then run another pass.
                Console.ReadLine();
            }
        }

        // Times the Parallel.For variant: the x range is distributed by Parallel.For and each
        // invocation fills the cells result3[x, *] via Computation. Returns the stopped Stopwatch.
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        // Times the sequential variant that calls Computation once per cell.
        // Returns the stopped Stopwatch.
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        // Times the sequential variant whose per-cell search is written directly in the loop
        // body (same statements as Computation). Returns the stopped Stopwatch.
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                {
                    // The query position always has z = 0.
                    var targetPos = new Vector3(x, y, 0);
                    // Seed the search with the first point.
                    var nearestV3 = pointCloud[0];
                    // Hand-written difference vector and squared length (no sqrt).
                    Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for (int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<' keeps the earliest point when distances tie.
                        if (currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        // Returns true when every cell of lhs equals the corresponding cell of rhs over the
        // textureSize x textureSize range; returns false at the first mismatch.
        // Vector3 does not override Equals, so the default struct equality is used.
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        // Finds the point in pointCloud with the smallest squared distance to (x, y, 0) and
        // stores it in result[x, y]. Reads exactly numPoints entries, so pointCloud must hold
        // at least that many elements.
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Seed the search with the first point.
            var nearestV3 = pointCloud[0];
            // Hand-written difference vector; its squared length is computed just below.
            Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);

            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<' keeps the earliest point when distances tie.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }


        // Three-component single-precision vector; in this version it is a plain data holder
        // with two constructors and no arithmetic members.
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            // Builds a vector from explicit components.
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            // Builds a random vector: each component is rnd.NextDouble() scaled by randomDistance.
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }
        }
    }
}

它给出了以下结果：

x86

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 3820ms
For Loop: 3962ms
Parallel.For Loop: 1681ms

x64

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 10978ms
For Loop: 10924ms
Parallel.For Loop: 3073ms

所以好消息是我们可以大大提高这段代码的性能——并且让单线程版本的运行速度与它的并行表亲保持一致。

坏消息是这意味着完全放弃 x64 并手动内联所有数学。

在这个阶段，我对编译器的性能感到非常失望——我希望它们会好得多。

结论

这令人沮丧和悲伤……虽然我们并不真正知道原因，但可以有根据地猜测：这是愚蠢的编译器造成的。仅仅把目标平台从 x64 改为 x86 并做一些手动内联，耗时就从 24 秒降到 3.8 秒，这不是我所期望的。然而，我已经完成了我正在编写的概念验证；借助一个简单的空间哈希，我可以在 0.7 秒内计算出一幅包含 70,000 个“点”的 1024 x 1024 图像——比我最初的 x64 场景快约 340000%，而且没有使用多线程或手动内联。因此，我已经接受了一个答案——迫切的需求已经消失，尽管我仍在研究这个问题。

该代码可在此处和此处获得- 它会生成一个很好的 Voronoi 图作为副作用：P

score 8 · Accepted Answer

来自 8 核 i7、Win7、x64 的所有数据

您能获得 5 倍的提升确实令人惊讶。按照您的写法，这个测试的一个问题是：您把三种实现全都放在了 Main 方法中，这迫使编译器生成一堆乱七八糟的代码（gobbledygook）并保持其同步，以满足 Parallel.For 所使用的闭包的需要，而这妨碍了内联版本的循环。如果您按以下方式把工作拆分到各自的方法中，您将看到所有三种实现的性能都明显更快……至少对于 x86 是这样：

x86 之前：

For Loop (Inline): 24313ms 
For Loop: 25236ms 
Parallel.For Loop: 3840ms

x86 之后：

For Loop (Inline): 13007ms
For Loop: 13013ms
Parallel.For Loop: 2208ms

因此，查看我的 x86 Parallel.For 结果，您会看到它的扩展速度约为 5.9 倍，并且每个版本在隔离时都快得多。

接下来，有趣的是，在进行同样的更改之后，x64 绝对没有任何收益。事实上，在 3 次测试中的 2 次测试中，每次运行的结果都略高一点。

x64 之前

For Loop (Inline): 24222ms
For Loop: 25197ms
Parallel.For Loop: 3810ms

x64 之后

For Loop (Inline): 25302ms
For Loop: 25209ms
Parallel.For Loop: 3821ms

我没有直接的答案为什么 x64 会如此糟糕，除了人们一直想出这样的代码，这使得 x64 JIT 看起来很糟糕，所以也许其他人可以插话。

也就是说，在这样的实现中，您可能还需要考虑另一件事：缓存行失效（cache line invalidation）。这里有一篇由 @StephenToub 撰写的很棒的 MSDN 文章，解释了这一切。简而言之（TL;DR）：您的所有数据都存储在同一个数组中，而拥有各自本地（L2）缓存的不同内核会修改该数组的不同部分，因此它们必须与数据有重叠的其他内核同步数据。如果不同内核正在处理的部分彼此靠得太近，您最终会遇到大量这样的同步，从而侵蚀并行带来的收益。这篇文章展示了一种技术：在工作数组中额外分配足够的空间，把各个内核实际要处理的数据段隔开，这样这些内核在处理数据时就不必使其他内核的缓存失效。这或许可以解释为什么在我的 8 核机器上，Parallel.For 相对于 for 循环只有约 5.9 倍的加速，而不是更接近 8 倍。我敢打赌，如果您努力解决缓存行失效问题，还可以再从中挤出 10% 以上的性能。请记住，在设置和协调并行工作时总会有一些开销，因此您永远不会得到 100% 的完美扩展。

这是您的程序的修订版本，每种方法都分解为单独的方法：

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    // Console benchmark: for every (x, y) cell of a textureSize x textureSize grid, find the
    // point in a random cloud with the smallest squared distance to (x, y, 0). The three timed
    // variants each live in their own method (Inline, NotInline, Parallelized) instead of
    // sharing the body of Main; distances go through Vector3.DistanceToPoint.
    class RevisedProgram
    {
        // Number of random points in the cloud that is searched for every grid cell.
        const int numPoints = 700;
        // Edge length of the square result grid; also passed to Vector3(float) as the
        // upper bound for the random coordinates.
        const int textureSize = 1024;

        // Single shared random source used by the Vector3(float) constructor.
        static Random rnd = new Random();

        // Runs the benchmark in an endless loop; each pass builds a fresh random cloud,
        // times the three variants, prints the timings, verifies that all three result
        // grids are equal, and then waits for a line of console input.
        static void Main(string[] args)
        {
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud with points whose coordinates are random multiples of textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each call runs its variant to completion and returns the stopped Stopwatch.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // result1 is the reference; the other two grids must match it cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Pause until the user presses Enter, then run another pass.
                Console.ReadLine();
            }
        }

        // Times the Parallel.For variant: the x range is distributed by Parallel.For and each
        // invocation fills the cells result3[x, *] via Computation. Returns the stopped Stopwatch.
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        // Times the sequential variant that calls Computation once per cell.
        // Returns the stopped Stopwatch.
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        // Times the sequential variant whose per-cell search is written directly in the loop
        // body (same statements as Computation). Returns the stopped Stopwatch.
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    // The query position always has z = 0.
                    var targetPos = new Vector3(x, y, 0);
                    // Seed the search with the first point, then scan the rest.
                    var nearestV3 = pointCloud[0];
                    var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                        // Strict '<' keeps the earliest point when distances tie.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        // Returns true when every cell of lhs equals the corresponding cell of rhs over the
        // textureSize x textureSize range; returns false at the first mismatch.
        // Vector3 does not override Equals, so the default struct equality is used.
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        // Finds the point in pointCloud with the smallest squared distance to (x, y, 0) and
        // stores it in result[x, y]. Reads exactly numPoints entries, so pointCloud must hold
        // at least that many elements.
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Seed with the first point, then scan the remainder.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<' keeps the earliest point when distances tie.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        // Three-component single-precision vector (three float fields).
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            // Builds a vector from explicit components.
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            // Builds a random vector: each component is rnd.NextDouble() scaled by randomDistance.
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            // Component-wise difference a - b, returned as a new vector.
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            // Squared length of this vector (no square root is taken).
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            // Despite the name, returns the SQUARED distance between this vector and point:
            // it subtracts via operator - and then calls sqrMagnitude.
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

更新：

根据冯元指出的 x64 JIT 未内联的方法，您可以更改程序以内联计算，并从 x64 版本中获得比 x86 版本更好的性能。这显然很糟糕，但这是我以前见过 x64 JIT 破坏的那种东西。这是新的数字：

内联 x64 后：

For Loop (Inline): 19032ms
For Loop: 19209ms
Parallel.For Loop: 3015ms

代码的内联版本：

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    // Console benchmark: for every (x, y) cell of a textureSize x textureSize grid, find the
    // point in a random cloud with the smallest squared distance to (x, y, 0). Three variants
    // are timed, each in its own method (Inline, NotInline, Parallelized). In this version the
    // search code subtracts with Vector3's operator - and then sums the squared components
    // in place, rather than calling a distance method on the struct.
    class RevisedProgram
    {
        // Number of random points in the cloud that is searched for every grid cell.
        const int numPoints = 700;
        // Edge length of the square result grid; also passed to Vector3(float) as the
        // upper bound for the random coordinates.
        const int textureSize = 1024;

        // Single shared random source used by the Vector3(float) constructor.
        static Random rnd = new Random();

        // Runs the benchmark in an endless loop; each pass builds a fresh random cloud,
        // times the three variants, prints the timings, verifies that all three result
        // grids are equal, and then waits for a line of console input.
        static void Main(string[] args)
        {
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud with points whose coordinates are random multiples of textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each call runs its variant to completion and returns the stopped Stopwatch.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // result1 is the reference; the other two grids must match it cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Pause until the user presses Enter, then run another pass.
                Console.ReadLine();
            }
        }

        // Times the Parallel.For variant: the x range is distributed by Parallel.For and each
        // invocation fills the cells result3[x, *] via Computation. Returns the stopped Stopwatch.
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        // Times the sequential variant that calls Computation once per cell.
        // Returns the stopped Stopwatch.
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        // Times the sequential variant whose per-cell search is written directly in the loop
        // body (same statements as Computation). Returns the stopped Stopwatch.
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    // The query position always has z = 0.
                    var targetPos = new Vector3(x, y, 0);
                    // Seed the search with the first point.
                    var nearestV3 = pointCloud[0];
                    // Difference via operator -, then squared length summed in place (no sqrt).
                    Vector3 temp1 = nearestV3 - targetPos;
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = currentV3 - targetPos;
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<' keeps the earliest point when distances tie.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        // Returns true when every cell of lhs equals the corresponding cell of rhs over the
        // textureSize x textureSize range; returns false at the first mismatch.
        // Vector3 does not override Equals, so the default struct equality is used.
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        // Finds the point in pointCloud with the smallest squared distance to (x, y, 0) and
        // stores it in result[x, y]. Reads exactly numPoints entries, so pointCloud must hold
        // at least that many elements.
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Seed the search with the first point.
            var nearestV3 = pointCloud[0];
            // Difference via operator -, then squared length summed in place (no sqrt).
            Vector3 temp1 = nearestV3 - targetPos;
            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = currentV3 - targetPos;
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<' keeps the earliest point when distances tie.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        // Squared distance between vector and point (difference via operator -, then the sum
        // of squared components). NOTE(review): nothing in this class calls this helper; the
        // search code above repeats the same arithmetic in place instead.
        private static float DistanceToPoint(Vector3 vector, Vector3 point)
        {
            Vector3 final = vector - point;

            return final.x * final.x + final.y * final.y + final.z * final.z;
        }

        // Three-component single-precision vector (three float fields) with subtraction only.
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            // Builds a vector from explicit components.
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            // Builds a random vector: each component is rnd.NextDouble() scaled by randomDistance.
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            // Component-wise difference a - b, returned as a new vector.
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }
        }
    }
}

score 3 · Accepted Answer

在 64 位系统上，该结构仍然是 12 个字节。

由于 DistanceToPoint 没有内联，64 位速度较慢

 2     0 [  0] TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
23     0 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
22     0 [  1]   Texture!TextureFromPoints.Program+Vector3.op_Subtraction(Vector3, Vector3)
30    22 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
10     0 [  1]   Texture!TextureFromPoints.Program+Vector3.sqrMagnitude()
33    32 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)

在 32 位系统上，只有 sqrMagnitude 是一个函数调用，DistanceToPoint 和 op_Subtraction 都被内联了。

score 1 · Accepted Answer

我怀疑 64 位性能与对齐有关。你的 Vector3 是一个 12 字节的结构；这些将在 32 位环境中占用 12 个字节，但在 64 位环境中它们将被填充到 16 个字节。如果这意味着您的阵列要大 33%，那么您可以预期多出 33% 的缓存未命中。

我的怀疑完全不正确。睡了一觉、重新考虑之后，我尝试了以下方法：

class Program
{
    // Three 4-byte floats: the same field layout as the Vector3 struct being discussed.
    private struct V3
    {
        public float x;
        public float y;
        public float z;
    }

    // Measures how far apart, in bytes, two adjacent V3 elements sit inside an array.
    // Both elements are pinned so their addresses stay valid while they are subtracted.
    private static unsafe long GetDistance()
    {
        var pair = new V3[2];
        fixed (V3* first = &pair[0], second = &pair[1])
        {
            byte* lowAddress = (byte*)first;
            byte* highAddress = (byte*)second;
            return highAddress - lowAddress;
        }
    }

    // Prints the element stride of a V3 array, then the size of a native pointer,
    // so the output shows whether the stride changes with the process bitness.
    unsafe static void Main()
    {
        long stride = GetDistance();
        Console.WriteLine(stride);

        int pointerSize = sizeof(IntPtr);
        Console.WriteLine(pointerSize);
    }
}

输出，32 位系统：

12
4

输出，64位系统：

12
8

score 0 · Accepted Answer

我知道要做什么！用 F# 写！

Completed 1024x1024 pixels with 700 points in...
Sync: 4393ms
Parallel: 2409ms

它更快，代码也更少……对于一个我几乎不了解这门语言、只花了几个小时匆匆写出来的东西来说，还算不错。

module Program

open System
open System.IO
open System.Linq
open System.Threading.Tasks


/// Runs one benchmark pass: builds a random 2D point cloud, then for every cell of a
/// textureSize x textureSize grid finds the closest cloud point, once sequentially
/// (syncCalc) and once with Parallel.For (parallelCalc), and prints both timings.
/// Blocks on Console.ReadLine before returning.
let main() = 
    let numPoints = 700
    let textureSize = 1024
    let rnd = new Random()

    // One random coordinate: rnd.NextDouble() scaled by textureSize, as a single.
    let randomPos() = (single (rnd.NextDouble()*(double textureSize)))
    let pointCloud = Array.init numPoints (fun _ -> (randomPos(), randomPos()))

    // Squared Euclidean distance from grid cell (sourceX, sourceY) to `point`.
    // No square root is taken; only the ordering of distances is used.
    let distanceToPoint(sourceX :int ,sourceY : int, point ) = 
        let x = (single sourceX) - fst point
        let y = (single sourceY) - snd point
        x*x + y*y

    // Sequential search; returns the filled grid of closest points.
    let syncCalc() =
        let resultData = Array2D.zeroCreate<single*single>  textureSize textureSize
        for x in 0..(textureSize-1) do
          for y in 0..(textureSize-1) do
             let mutable closestPoint = pointCloud.[0]
             let mutable closestDistance = distanceToPoint(x,y, closestPoint)
             for p in 1..(numPoints-1) do
                 let point = pointCloud.[p]
                 // Measure the candidate point. Measuring closestPoint here (as the code
                 // previously did) can never beat closestDistance, so no candidate would
                 // ever be selected.
                 let distance = distanceToPoint(x,y, point)
                 if (distance < closestDistance) then
                    closestPoint <- point
                    closestDistance <- distance
             resultData.[x,y] <- closestPoint
        // Return the grid so the caller receives the result instead of unit.
        resultData

    (*let asyncCalc() =
        let resultData = Array2D.zeroCreate<single*single>  textureSize textureSize
        let z = 
            Async.Parallel [ 
                for x in 0..(textureSize-1) -> async { 
                     for y in 0..(textureSize-1) do
                         let closestPoint = ref pointCloud.[0]
                         let closestDistance = ref (distanceToPoint(x,y, !closestPoint))
                         for p in 1..(numPoints-1) do
                             let point = pointCloud.[p]
                             let distance = distanceToPoint(x,y, point)
                             if (distance < !closestDistance) then
                                closestPoint := point
                                closestDistance := distance
                         resultData.[x,y] <- !closestPoint
            } ]   |>Async.RunSynchronously 
        resultData*)

    // Parallel search: Parallel.For distributes the x range; each invocation writes only
    // the cells resultData.[x, *] for its own x. Returns the filled grid.
    let parallelCalc() =
        let resultData = Array2D.zeroCreate<single*single>  textureSize textureSize
        Parallel.For (0, textureSize,  fun x ->
                 for y in 0..(textureSize-1) do
                     // Locals declared inside the lambda and used only by plain loops are
                     // never captured by a closure, so `let mutable` is allowed here and
                     // avoids allocating two ref cells per cell.
                     let mutable closestPoint = pointCloud.[0]
                     let mutable closestDistance = distanceToPoint(x,y, closestPoint)
                     for p in 1..(numPoints-1) do
                         let point = pointCloud.[p]
                         // Measure the candidate point, not the current best.
                         let distance = distanceToPoint(x,y, point)
                         if (distance < closestDistance) then
                            closestPoint <- point
                            closestDistance <- distance
                     resultData.[x,y] <- closestPoint)
        |> ignore   // the ParallelLoopResult is not needed
        resultData

    // Author's recorded timing: 4.2s (measured before the candidate-distance fix above).
    let sw1 = System.Diagnostics.Stopwatch.StartNew();
    let r1 = syncCalc() 
    sw1.Stop()

    //Slow!
    //let sw2 = System.Diagnostics.Stopwatch.StartNew();
    //let r2 = asyncCalc()      
    //sw2.Stop()

    // Author's recorded timing: 2.25s (measured before the candidate-distance fix above).
    let sw3 = System.Diagnostics.Stopwatch.StartNew();
    let r3 = parallelCalc() 
    sw3.Stop()

    Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints)
    Console.WriteLine("Sync: {0}ms", sw1.ElapsedMilliseconds)
    //Console.WriteLine("ASync: {0}ms", sw2.ElapsedMilliseconds)
    Console.WriteLine("Parallel: {0}ms", sw3.ElapsedMilliseconds)
    Console.ReadLine() |> ignore

// Repeat the benchmark indefinitely; each pass waits for Enter before the next starts.
while true do main()

c# - 在双核上使用 Parallel.For 获得了 5 倍的性能？

4 回答 4

更新：

Related

Reference