c# - C#：阿特金筛法的实现

Question

我想知道这里是否有人有他们想要分享的阿特金筛子的良好实现。

我正在尝试实现它，但无法完全理解它。这是我到目前为止所拥有的。

public class Atkin : IEnumerable<ulong>
{
    private readonly List<ulong> primes;
    private readonly ulong limit;

    public Atkin(ulong limit)
    {
        this.limit = limit;
        primes = new List<ulong>();
    }

    private void FindPrimes()
    {
        var isPrime = new bool[limit + 1];
        var sqrt = Math.Sqrt(limit);

        for (ulong x = 1; x <= sqrt; x++)
            for (ulong y = 1; y <= sqrt; y++)
            {
                var n = 4*x*x + y*y;
                if (n <= limit && (n % 12 == 1 || n % 12 == 5))
                    isPrime[n] ^= true;

                n = 3*x*x + y*y;
                if (n <= limit && n % 12 == 7)
                    isPrime[n] ^= true;

                n = 3*x*x - y*y;
                if (x > y && n <= limit && n % 12 == 11)
                    isPrime[n] ^= true;
            }

        for (ulong n = 5; n <= sqrt; n++)
            if (isPrime[n])
                for (ulong k = n*n; k <= limit; k *= k)
                    isPrime[k] = false;

        primes.Add(2);
        primes.Add(3);
        for (ulong n = 5; n <= limit; n++)
            if (isPrime[n])
                primes.Add(n);
    }


    public IEnumerator<ulong> GetEnumerator()
    {
        if (!primes.Any())
            FindPrimes();


        foreach (var p in primes)
            yield return p;
    }


    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}

我几乎只是试图“翻译” Wikipedia中列出的伪代码，但它无法正常工作。所以要么我误解了什么，要么只是做错了什么。或者很可能两者都...

有一个我用作测试的前 500 个素数的列表，我的实现在第 40 号（或 41？）处失败。

索引 [40] 处的值不同
预期：179
但原为：175

您是否能够找到我的错误，您是否有可以共享的实现，或两者兼而有之？

我正在使用的确切测试如下所示：

public abstract class AtkinTests
{
    [Test]
    public void GetEnumerator_FirstFiveHundredNumbers_AreCorrect()
    {
        var sequence = new Atkin(2000000);
        var actual = sequence.Take(500).ToArray();
        var expected = First500;

        CollectionAssert.AreEqual(expected, actual);
    }

    private static readonly ulong[] First500 = new ulong[]
        {
            2, 3, 5, 7, 11, 13, 17, ...
        };
}

score 10 · Accepted Answer

这段代码：

for (ulong k = n*n; k <= limit; k *= k)
  isPrime[k] = false;

似乎不是这个伪代码的忠实翻译：

is_prime(k) ← false, k ∈ {n², 2n², 3n², ..., limit}

您的代码看起来将运行 n * n、n ^ 4、n ^ 8 等。即每次平方而不是每次都添加 n 平方。尝试这个：

ulong nSquared = n * n;
for (ulong k = nSquared; k <= limit; k += nSquared)
  isPrime[k] = false;

score 6 · Accepted Answer

Aaron Mugatroyd的最后一个答案来自阿特金筛 (SoA) 的翻译 Python 源代码并不算太糟糕，但它可以在几个方面进行改进，因为它错过了一些重要的优化，如下所示：

他的答案没有使用完整的模 60 原始 Atkin 和 Bernstein 版本的筛子，而是对维基百科文章中的伪代码进行了略微改进的变体，因此使用了大约 0.36 倍的数值筛子范围组合切换/剔除操作；我下面的代码使用了相当有效的非页面段伪代码，根据我在对阿特金筛子的评论中的评论，它使用了大约 0.26 倍数值范围的因子来将完成的工作量减少到大约一个因子七分之二。
他的代码通过仅具有奇数表示来减少缓冲区大小，而我的代码进一步打包以消除可被三和五整除的数字以及“仅奇数”所暗示的可被二整除的数字的任何表示；这将内存需求进一步减少了近一半（至 8/15），并有助于更好地利用 CPU 缓存，由于减少了平均内存访问时间，从而进一步提高了速度。
我的代码使用快速查找表 (LUT) 弹出计数技术计算素数的数量，与使用他使用的逐位技术大约一秒钟相比，几乎不需要时间来计数；然而，在这个示例代码中，即使是一小段时间也会从代码的定时部分中取出。
最后，我的代码针对每个内部循环的最少代码优化了位操作操作。例如，它不使用连续右移 1 来产生奇数表示索引，实际上通过将所有内部循环写入为常数模（等于位位置）操作来进行一点位移。同样，Aaron 的翻译代码在操作中效率非常低，例如在素数平方自由剔除中，它将素数的平方添加到索引中，然后检查奇数结果，而不是仅添加平方的两倍以便不需要检查; 然后它通过在内部循环中执行剔除操作之前将数字右移一（除以二）来使检查变得多余，就像它对所有循环所做的那样。这个低效的代码不会 t 使用这种“大筛缓冲阵列”技术对大范围的执行时间产生很大影响，因为每个操作的大部分时间都用于 RAM 内存访问（对于 10 亿范围，大约 37 个 CPU 时钟周期或更多），但是对于适合 CPU 缓存的较小范围，执行时间会比所需的慢得多；换句话说，它为每个操作的执行速度设置了过高的最低限制。

代码如下：

//Sieve of Atkin based on full non page segmented modulo 60 implementation...

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

namespace NonPagedSoA {
  //implements the non-paged Sieve of Atkin (full modulo 60 version)...
  class SoA : IEnumerable<ulong> {
    private ushort[] buf = null;
    private long cnt = 0;
    private long opcnt = 0;
    private static byte[] modPRMS = { 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61 };
    private static ushort[] modLUT;
    private static byte[] cntLUT;
    //initialize the private LUT's...
    static SoA() {
      modLUT = new ushort[60];
      for (int i = 0, m = 0; i < modLUT.Length; ++i) {
        if ((i & 1) != 0 || (i + 7) % 3 == 0 || (i + 7) % 5 == 0) modLUT[i] = 0;
        else modLUT[i] = (ushort)(1 << (m++));
      }
      cntLUT = new byte[65536];
      for (int i = 0; i < cntLUT.Length; ++i) {
        var c = 0;
        for (int j = i; j > 0; j >>= 1) c += j & 1;
        cntLUT[i] = (byte)c;
      }
    }
    //initialization and all the work producing the prime bit array done in the constructor...
    public SoA(ulong range) {
      this.opcnt = 0;
      if (range < 7) {
        if (range > 1) {
          cnt = 1;
          if (range > 2) this.cnt += (long)(range - 1) / 2;
        }
        this.buf = new ushort[0];
      }
      else {
        this.cnt = 3;
        var nrng = range - 7; var lmtw = nrng / 60;
        //initialize sufficient wheels to non-prime
        this.buf = new ushort[lmtw + 1];

        //Put in candidate primes:
        //for the 4 * x ^ 2 + y ^ 2 quadratic solution toggles - all x odd y...
        ulong n = 6; // equivalent to 13 - 7 = 6...
        for (uint x = 1, y = 3; n <= nrng; n += (x << 3) + 4, ++x, y = 1) {
          var cb = n; if (x <= 1) n -= 8; //cancel the effect of skipping the first one...
          for (uint i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 + y ^ 2 quadratic solution toggles - x odd y even...
        n = 0; // equivalent to 7 - 7 = 0...
        for (uint x = 1, y = 2; n <= nrng; n += ((x + x + x) << 2) + 12, x += 2, y = 2) {
          var cb = n;
          for (var i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 - y ^ 2 quadratic solution toggles all x and opposite y = x - 1...
        n = 4; // equivalent to 11 - 7 = 4...
        for (uint x = 2, y = x - 1; n <= nrng; n += (ulong)(x << 2) + 4, y = x, ++x) {
          var cb = n; int i = 0;
          for ( ; y > 1 && i < 15 && cb <= nrng; cb += (ulong)(y << 2) - 4, y -= 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0) {
              uint c = (uint)cbd, my = y;
              for ( ; my >= 30 && c < buf.Length; c += my - 15, my -= 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
              if (my > 0 && c < buf.Length) { buf[c] ^= cm; /* ++this.opcnt; */ }
            }
          }
          if (y == 1 && i < 15) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if ((cm & 0x4822) != 0 && cbd < (ulong)buf.Length) { buf[cbd] ^= cm; /* ++this.opcnt; */ }
          }
        }

        //Eliminate squares of base primes, only for those on the wheel:
        for (uint i = 0, w = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length ; ++i) {
          uint p = pd + modPRMS[pn];
          ulong sqr = (ulong)p * (ulong)p; //to handle ranges above UInt32.MaxValue
          if (sqr > range) break;
          if ((this.buf[w] & msk) != 0) { //found base prime, square free it...
            ulong s = sqr - 7;
            for (int j = 0; s <= nrng && j < modPRMS.Length; s = sqr * modPRMS[j] - 7, ++j) {
              var cd = s / 60; var cm = (ushort)(modLUT[s % 60] ^ 0xFFFF);
              //may need ulong loop index for ranges larger than two billion
              //but buf length only good to about 2^31 * 60 = 120 million anyway,
              //even with large array setting and half that with 32-bit...
              for (ulong c = cd; c < (ulong)this.buf.Length; c += sqr) {
                this.buf[c] &= cm; // ++this.opcnt;
              }
            }
          }
          if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
          else { msk <<= 1; ++pn; }
        }

        //clear any overflow primes in the excess space in the last wheel/word:
        var ndx = nrng % 60; //clear any primes beyond the range
        for (; modLUT[ndx] == 0; --ndx) ;
        this.buf[lmtw] &= (ushort)((modLUT[ndx] << 1) - 1);
      }
    }

    //uses a fast pop count Look Up Table to return the total number of primes...
    public long Count {
      get {
        long cnt = this.cnt;
        for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]];
        return cnt;
      }
    }

    //returns the number of toggle/cull operations used to sieve the prime bit array...
    public long Ops {
      get {
        return this.opcnt;
      }
    }

    //generate the enumeration of primes...
    public IEnumerator<ulong> GetEnumerator() {
      yield return 2; yield return 3; yield return 5;
      ulong pd = 0;
      for (uint i = 0, w = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
        if ((this.buf[w] & msk) != 0) //found a prime bit...
          yield return pd + modPRMS[pn]; //add it to the list
        if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
        else { msk <<= 1; ++pn; }
      }
    }

    //required for the above enumeration...
    IEnumerator IEnumerable.GetEnumerator() {
      return this.GetEnumerator();
    }
  }

  class Program {
    static void Main(string[] args) {
      Console.WriteLine("This program calculates primes by a simple full version of the Sieve of Atkin.\r\n");

      const ulong n = 1000000000;

      var elpsd = -DateTime.Now.Ticks;

      var gen = new SoA(n);

      elpsd += DateTime.Now.Ticks;

      Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000);

      //Output prime list for testing...
      //Console.WriteLine();
      //foreach (var p in gen) {
      //  Console.Write(p + " ");
      //}
      //Console.WriteLine();

//Test options showing what one can do with the enumeration, although more slowly...
//      Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x));

      Console.Write("\r\nPress any key to exit:");
      Console.ReadKey(true);
      Console.WriteLine();
    }
  }
}

这段代码的运行速度大约是 Aaron 代码的两倍（在 i7-2700K (3.5 GHz) 上使用 64 位或 32 位模式大约需要 2.7 秒，缓冲区大约 16.5 MB 和大约 2.58 亿次组合切换/素数平方自由剔除操作）（可以通过取消注释“++this.opcnt”语句来显示）对于 10 亿个筛选范围，相比之下，他的代码没有计数时间和几乎使用约 3.59 亿次组合切换/剔除操作来筛选多达 10 亿次的内存使用量是内存使用量的两倍。

尽管它比他最优化的非分页埃拉托色尼筛 (SoE) 的天真赔率实现要快，但这并没有使阿特金筛比埃拉托色尼筛更快，就好像有人应用了与本文中使用的类似技术一样上述 SoA 对 SoE plus 的实现使用最大车轮分解，SoE 将与此速度大致相同。

分析： 尽管完全优化的 SoE 的操作数与 10 亿筛子范围的 SoA 的操作数大致相同，但这些非分页实现的主要瓶颈是一旦筛子缓冲区大小超过内存访问CPU 缓存大小（对于我的 i7，一个时钟周期访问时 32 KiloBytes L1 缓存，大约四个时钟周期访问时间时 256 Kilobytes L2 缓存和大约 20 个时钟周期访问时间时 8 MB L3 缓存），之后内存访问可能超过一百个时钟周期。

现在，当将算法调整为页面分段时，两者的内存访问速度都提高了大约八倍，因此可以筛选出原本不适合可用内存的范围。然而，由于在快速增长到数百倍的剔除扫描中的巨大进步，难以实现算法的“无素数平方”部分，因此随着筛子范围开始变得非常大，SoE 继续超过 SoA页缓冲区的大小。同样，也许更严重的是，在每个页面缓冲区的最低表示中计算每个 'x' 值的新起点和 'y' 值的新起点会变得非常内存和/或计算量很大。随着范围的增加，与 SoE 相比，分页 SoA 的效率损失很大。

EDIT_ADD： Aaron Murgatroyd 使用的仅赔率 SoE 使用约 10.26 亿次剔除操作，筛选范围为 10 亿次，因此操作数大约是 SoA 的四倍，因此运行速度应该慢四倍，但 SoA 甚至在实施时这里有一个更复杂的内部循环，特别是由于高比例的仅赔率 SoE 剔除在剔除扫描中的步幅比 SoA 的步幅短得多，天真的仅赔率 SoE 具有更好的平均内存访问时间尽管筛缓冲区大大超过了 CPU 缓存大小（更好地利用缓存关联性）。这就解释了为什么上述 SoA 的速度仅是仅有赔率的 SoE 的两倍，尽管理论上它似乎只完成了四分之一的工作。

如果使用与上述 SoA 相同的使用恒定模内循环的算法并实现相同的 2/3/5 轮因式分解，那么 SoE 会将剔除操作的数量减少到约 4.05 亿次操作，因此仅增加约 50%操作比 SoA 运行，理论上运行速度比 SoA 稍慢，但可能以大致相同的速度运行，因为对于这种“幼稚”的大内存缓冲区使用，平均而言，剔除步幅仍然比 SoA 小一些。将车轮分解增加到 2/3/5/7 车轮意味着对于 10 亿个剔除范围，SoE 剔除操作减少到大约 0.314，并且可能使该版本的 SoE 运行该算法的速度大致相同。

可以通过预先剔除 2/3/5/7/11/13/17/19 素因子的筛阵列（以模式复制）来进一步使用轮分解，几乎不花费执行时间来减少对于 10 亿的筛选范围，剔除操作的总数达到约 2.51 亿，并且 SoE 的运行速度甚至比这个优化的 SoA 版本更快或大致相同，即使对于这些大内存缓冲区版本，SoE 仍然有很多比上面的代码复杂度低。

因此，可以看出，SoE 的操作数量可以从天真的或仅偶数赔率或 2/3/5 轮因式分解版本大大减少，使得操作数量与 SoA 大致相同，而同时，由于更简单的内部循环和更有效的内存访问，每个操作的时间实际上可能更少。 END_EDIT_ADD

EDIT_ADD2：我在这里根据上面链接的答案进一步下方的伪代码，使用与上面的 SoA 类似的常量模数/位位置技术为最内层循环添加 SoE 的代码。尽管应用了高轮分解和预剔除，但代码比上述 SoA 复杂得多，因此剔除操作的总数实际上少于 SoA 的组合切换/剔除操作（直到筛分范围）约二十亿。代码如下：

EDIT_FINAL更正了下面的代码和与之相关的评论END_EDIT_FINAL

//Sieve of Eratosthenes based on maximum wheel factorization and pre-culling implementation...

using System;
using System.Collections;
using System.Collections.Generic;

namespace NonPagedSoE {
  //implements the non-paged Sieve of Eratosthenes (full modulo 210 version with preculling)...
  class SoE : IEnumerable<ulong> {
    private ushort[] buf = null;
    private long cnt = 0;
    private long opcnt = 0;
    private static byte[] basePRMS = { 2, 3, 5, 7, 11, 13, 17, 19 };
    private static byte[] modPRMS = { 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, //positions + 23
                                      97, 101, 103, 107, 109, 113, 121, 127, 131, 137, 139, 143, 149, 151, 157, 163,
                                      167, 169, 173, 179, 181 ,187 ,191 ,193, 197, 199, 209, 211, 221, 223, 227, 229 };
    private static byte[] gapsPRMS = { 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6, 8,
                                       4, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6, 6, 4,
                                       2, 4, 6, 2, 6, 4, 2, 4, 2, 10, 2, 10, 2, 4, 2, 4 };
    private static ulong[] modLUT;
    private static byte[] cntLUT;
    //initialize the private LUT's...
    static SoE() {
      modLUT = new ulong[210];
      for (int i = 0, m = 0; i < modLUT.Length; ++i) {
        if ((i & 1) != 0 || (i + 23) % 3 == 0 || (i + 23) % 5 == 0 || (i + 23) % 7 == 0) modLUT[i] = 0;
        else modLUT[i] = 1UL << (m++);
      }
      cntLUT = new byte[65536];
      for (int i = 0; i < cntLUT.Length; ++i) {
        var c = 0;
        for (int j = i ^ 0xFFFF; j > 0; j >>= 1) c += j & 1; //reverse logic; 0 is prime; 1 is composite
        cntLUT[i] = (byte)c;
      }
    }
    //initialization and all the work producing the prime bit array done in the constructor...
    public SoE(ulong range) {
      this.opcnt = 0;
      if (range < 23) {
        if (range > 1) {
          for (int i = 0; i < modPRMS.Length; ++i) if (modPRMS[i] <= range) this.cnt++; else break;
        }
        this.buf = new ushort[0];
      }
      else {
        this.cnt = 8;
        var nrng = range - 23; var lmtw = nrng / 210; var lmtwt3 = lmtw * 3; 
        //initialize sufficient wheels to prime
        this.buf = new ushort[lmtwt3 + 3]; //initial state of all zero's is all potential prime.

        //initialize array to account for preculling the primes of 11, 13, 17, and 19;
        //(2, 3, 5, and 7 already eliminated by the bit packing to residues).
        for (int pn = modPRMS.Length - 4; pn < modPRMS.Length; ++pn) {
          uint p = modPRMS[pn] - 210u; ulong pt3 = p * 3;
          ulong s = p * p - 23;
          ulong xrng = Math.Min(9699709, nrng); // only do for the repeating master pattern size
          ulong nwrds = (ulong)Math.Min(138567, this.buf.Length);
          for (int j = 0; s <= xrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) {
            var sm = modLUT[s % 210];
            var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL);
            var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4));
            for (ulong c = cd; c < nwrds; c += pt3) { //tight culling loop for size of master pattern
              this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones.
            }
          }
        }
        //Now copy the master pattern so it repeats across the main buffer, allow for overflow...
        for (long i = 138567; i < this.buf.Length; i += 138567)
          if (i + 138567 <= this.buf.Length)
            Array.Copy(this.buf, 0, this.buf, i, 138567);
          else Array.Copy(this.buf, 0, this.buf, i, this.buf.Length - i);

        //Eliminate all composites which are factors of base primes, only for those on the wheel:
        for (uint i = 0, w = 0, wi = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
          uint p = pd + modPRMS[pn];
          ulong sqr = (ulong)p * (ulong)p;
          if (sqr > range) break;
          if ((this.buf[w] & msk) == 0) { //found base prime, mark its composites...
            ulong s = sqr - 23; ulong pt3 = p * 3;
            for (int j = 0; s <= nrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) {
              var sm = modLUT[s % 210];
              var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL);
              var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4));
              for (ulong c = cd; c < (ulong)this.buf.Length; c += pt3) { //tight culling loop
                this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones.
              }
            }
          }
          ++pn;
          if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } }
          else msk <<= 1;
        }

        //clear any overflow primes in the excess space in the last wheel/word:
        var ndx = nrng % 210; //clear any primes beyond the range
        for (; modLUT[ndx] == 0; --ndx) ;
        var cmsk = (~(modLUT[ndx] - 1)) << 1; //force all bits above to be composite ones.
        this.buf[lmtwt3++] |= (ushort)cmsk;
        this.buf[lmtwt3++] |= (ushort)(cmsk >> 16);
        this.buf[lmtwt3] |= (ushort)(cmsk >> 32);
      }
    }

    //uses a fast pop count Look Up Table to return the total number of primes...
    public long Count {
      get {
        long cnt = this.cnt;
        for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]];
        return cnt;
      }
    }

    //returns the number of cull operations used to sieve the prime bit array...
    public long Ops {
      get {
        return this.opcnt;
      }
    }

    //generate the enumeration of primes...
    public IEnumerator<ulong> GetEnumerator() {
      yield return 2; yield return 3; yield return 5; yield return 7;
      yield return 11; yield return 13; yield return 17; yield return 19;
      ulong pd = 0;
      for (uint i = 0, w = 0, wi = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
        if ((this.buf[w] & msk) == 0) //found a prime bit...
          yield return pd + modPRMS[pn];
        ++pn;
        if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } }
        else msk <<= 1;
      }
    }

    //required for the above enumeration...
    IEnumerator IEnumerable.GetEnumerator() {
      return this.GetEnumerator();
    }
  }

  class Program {
    static void Main(string[] args) {
      Console.WriteLine("This program calculates primes by a simple maximually wheel factorized version of the Sieve of Eratosthenes.\r\n");

      const ulong n = 1000000000;

      var elpsd = -DateTime.Now.Ticks;

      var gen = new SoE(n);

      elpsd += DateTime.Now.Ticks;

      Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000);

//      Console.WriteLine();
//      foreach (var p in gen) {
//        Console.Write(p + " ");
//      }
//      Console.WriteLine();

      //      Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x));

      Console.Write("\r\nPress any key to exit:");
      Console.ReadKey(true);
      Console.WriteLine();
    }
  }
}

这段代码实际上比上面的 SoA 运行速度快了几个百分点，因为它的操作稍微少了一点，而且这个 10 亿范围内的大数组大小的主要瓶颈是内存访问时间大约为 40 到 100 多个 CPU 时钟周期取决于 CPU 和内存规格；这意味着代码优化（除了减少操作总数）是无效的，因为大部分时间都花在等待内存访问上。无论如何，使用巨大的内存缓冲区并不是筛选大范围的最有效方法，使用具有相同最大轮子分解的页面分割（这也为多处理）。

与 SoE 相比，SoA 在实现页面分割和多处理方面确实缺乏超过 40 亿的范围，因为由于 SoA 的渐近复杂性降低而产生的任何收益迅速被与相关的页面处理开销因素所吞噬素数平方免费处理和计算更多的页面起始地址；或者，可以通过将标记存储在 RAM 存储器中来克服这一问题，这会消耗大量内存并进一步降低访问这些标记存储结构的效率。 END_EDIT_ADD2

简而言之，与完全轮式分解的 SoE 相比，SoA 并不是一个真正实用的筛子，因为正如渐近复杂度的增益开始使其性能接近完全优化的 SoE，它开始失去效率，因为关于相对内存访问时间和页面分段复杂性以及通常更复杂和难以编写的实际实现细节。在我看来，与 SoE 相比，它更像是一个有趣的智力概念和心理锻炼，而不是一个实用的筛子。

有一天，我会将这些技术应用到多线程页面分段的 Eratosthenes 筛子中，使其在 C# 中的速度与 Atkin 和 Bernstein 在“C”中的 SoA 的“primegen”实现一样快，并将在大范围内将其从水中吹走超过约 40 亿甚至单线程，当我的 i7 上的多线程（包括超线程在内的八核）时，速度额外提高了约四。

score 3 · Accepted Answer

这是另一个实现。它使用 BitArray 来节省内存。Parallel.For需要 .NET Framework 4 。

static List<int> FindPrimesBySieveOfAtkins(int max)
{
//  var isPrime = new BitArray((int)max+1, false); 
//  Can't use BitArray because of threading issues.
    var isPrime = new bool[max + 1];
    var sqrt = (int)Math.Sqrt(max);

    Parallel.For(1, sqrt, x =>
    {
        var xx = x * x;
        for (int y = 1; y <= sqrt; y++)
        {
            var yy = y * y;
            var n = 4 * xx + yy;
            if (n <= max && (n % 12 == 1 || n % 12 == 5))
                isPrime[n] ^= true;

            n = 3 * xx + yy;
            if (n <= max && n % 12 == 7)
                isPrime[n] ^= true;

            n = 3 * xx - yy;
            if (x > y && n <= max && n % 12 == 11)
                isPrime[n] ^= true;
        }
    });

    var primes = new List<int>() { 2, 3 };
    for (int n = 5; n <= sqrt; n++)
    {
        if (isPrime[n])
        {
            primes.Add(n);
            int nn = n * n;
            for (int k = nn; k <= max; k += nn)
                isPrime[k] = false;
        }
    }

    for (int n = sqrt + 1; n <= max; n++)
        if (isPrime[n])
            primes.Add(n);

    return primes;
}

score 1 · Accepted Answer

这是阿特金筛子的一个更快的实现，我在这里从这个 Python 脚本中窃取了算法（我对算法没有任何功劳）：

http://programmingpraxis.com/2010/02/19/sieve-of-atkin-improved/

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.Int32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Atkin : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The number of primes calculated or null if not calculated yet.
        /// </summary>
        protected PrimeType? mpCount = null;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Atkin generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Atkin(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            PrimeType lpYLimit, lpN, lpXX3, lpXX4, lpDXX, lpDN, lpDXX4, lpXX, lpX, lpYY, lpMinY, lpS, lpK;

            fixed (BitArrayType* lpbOddPrime = &mbaOddPrime[0])
            {
                // n = 3x^2 + y^2 section
                lpXX3 = 3;
                for (lpDXX = 0; lpDXX < 12 * SQRT((mpLimit - 1) / 3); lpDXX += 24)
                {
                    lpXX3 += lpDXX;
                    lpYLimit = (12 * SQRT(mpLimit - lpXX3)) - 36;
                    lpN = lpXX3 + 16;

                    for (lpDN = -12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }

                    lpN = lpXX3 + 4;
                    for (lpDN = 12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                //    # n = 4x^2 + y^2 section
                lpXX4 = 0;
                for (lpDXX4 = 4; lpDXX4 < 8 * SQRT((mpLimit - 1) / 4) + 4; lpDXX4 += 8)
                {
                    lpXX4 += lpDXX4;
                    lpN = lpXX4 + 1;

                    if ((lpXX4 % 3) != 0)
                    {
                        for (lpDN = 0; lpDN < (4 * SQRT(mpLimit - lpXX4)) - 3; lpDN += 8)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                    else
                    {
                        lpYLimit = (12 * SQRT(mpLimit - lpXX4)) - 36;
                        lpN = lpXX4 + 25;

                        for (lpDN = -24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }

                        lpN = lpXX4 + 1;
                        for (lpDN = 24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                }

                //    # n = 3x^2 - y^2 section
                lpXX = 1;
                for (lpX = 3; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4 * lpX - 4;
                    lpN = 3 * lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2);
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4 * lpMinY + 4;
                    }
                    else
                        lpS = 4;

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                // xx = 0
                lpXX = 0;
                for (lpX = 2; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4*lpX - 4;
                    lpN = 3*lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2) - 1;
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4*lpMinY + 4;
                    }
                    else
                    {
                        lpN -= 1;
                        lpS = 0;
                    }

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN>>1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN>>1) % cbBitsPerBlock));
                    }
                }

                // # eliminate squares
                for (lpN = 5; lpN < SQRT(mpLimit) + 1; lpN += 2)
                    if ((lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock))) != 0)
                        for (lpK = lpN * lpN; lpK < mpLimit; lpK += lpN * lpN)
                            if ((lpK & 1) == 1)
                                lpbOddPrime[(lpK >> 1) / cbBitsPerBlock] &=
                                    (BitArrayType)~((BitArrayType)1 << (int)((lpK >> 1) % cbBitsPerBlock));
            }
        }

        /// <summary>
        /// Calculates the truncated square root for a number.
        /// </summary>
        /// <param name="value">The value to get the square root for.</param>
        /// <returns>The truncated sqrt of the value.</returns>
        private unsafe PrimeType SQRT(PrimeType value)
        {
            return (PrimeType)Math.Sqrt(value);
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            if ((index & 1) == 1)
                return (bits[(index >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((index >> 1) % cbBitsPerBlock))) != 0;
            else
                return false;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                if (!mpCount.HasValue)
                {
                    PrimeType lpCount = 0;
                    foreach (PrimeType liPrime in this) lpCount++;
                    mpCount = lpCount;
                }

                return mpCount.Value;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            //    return [2,3] + filter(primes.__getitem__, xrange(5,limit,2))

            // Two & Three always prime.
            yield return 2;
            yield return 3;

            // Start at first block, third MSB (5).
            int liBlock = 0;
            byte lbBit = 2;
            BitArrayType lbaCurrent = mbaOddPrime[0] >> lbBit;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 5; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 1)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value. 
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    lbBit = 0;
                    lbaCurrent = mbaOddPrime[++liBlock];

                    //// Move to first bit of next block skipping full blocks.
                    while (lbaCurrent == 0)
                    {
                        lpN += ((PrimeType)cbBitsPerBlock) << 1;
                        if (lpN <= mpLimit)
                            lbaCurrent = mbaOddPrime[++liBlock];
                        else
                            break;
                    }
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns></returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

它的速度接近我最优化的 Eratosthenes 筛网版本，但仍然慢了大约 20%，可以在这里找到：

https://stackoverflow.com/a/9700790/738380

score 0 · Accepted Answer

这是我的，它使用一个名为 CompartmentalisedParallel 的类，它允许您执行并行 for 循环，但控制线程数，以便对索引进行分组。但是，由于线程问题，您需要在每次更改 BitArray 时锁定它，或者为每个线程创建一个单独的 BitArray，然后在最后将它们异或在一起，第一个选项非常慢，因为锁的数量，第二个选项对我来说似乎更快！

using System;
using System.Collections;
using System.Collections.Generic;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public class Atkin : Primes
    {
        protected BitArray mbaPrimes;
        protected bool mbThreaded = true;

        public Atkin(int limit)
            : this(limit, true)
        {
        }

        public Atkin(int limit, bool threaded)
            : base(limit)
        {
            mbThreaded = threaded;
            if (mbaPrimes == null) FindPrimes();
        }

        public bool Threaded
        {
            get
            {
                return mbThreaded;
            }
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            yield return 3;
            for (int lsN = 5; lsN <= msLimit; lsN += 2)
                if (mbaPrimes[lsN]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaPrimes = new BitArray(msLimit + 1, false);

            int lsSQRT = (int)Math.Sqrt(msLimit);

            int[] lsSquares = new int[lsSQRT + 1];
            for (int lsN = 0; lsN <= lsSQRT; lsN++)
                lsSquares[lsN] = lsN * lsN;

            if (Threaded)
            {
                CompartmentalisedParallel.For<BitArray>(
                    1, lsSQRT + 1, new ParallelOptions(),
                    (start, finish) => { return new BitArray(msLimit + 1, false); },
                    (lsX, lsState, lbaLocal) =>
                    {
                        int lsX2 = lsSquares[lsX];

                        for (int lsY = 1; lsY <= lsSQRT; lsY++)
                        {
                            int lsY2 = lsSquares[lsY];

                            int lsN = 4 * lsX2 + lsY2;
                            if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                                lbaLocal[lsN] ^= true;

                            lsN -= lsX2;
                            if (lsN <= msLimit && lsN % 12 == 7)
                                lbaLocal[lsN] ^= true;

                            if (lsX > lsY)
                            {
                                lsN -= lsY2 * 2;
                                if (lsN <= msLimit && lsN % 12 == 11)
                                    lbaLocal[lsN] ^= true;
                            }
                        }

                        return lbaLocal;
                    },
                    (lbaResult, start, finish) =>
                    {
                        lock (mbaPrimes) 
                            mbaPrimes.Xor(lbaResult);
                    },
                    -1
                );
            }
            else
            {
                for (int lsX = 1; lsX <= lsSQRT; lsX++)
                {
                    int lsX2 = lsSquares[lsX];

                    for (int lsY = 1; lsY <= lsSQRT; lsY++)
                    {
                        int lsY2 = lsSquares[lsY];

                        int lsN = 4 * lsX2 + lsY2;
                        if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                            mbaPrimes[lsN] ^= true;

                        lsN -= lsX2;
                        if (lsN <= msLimit && lsN % 12 == 7)
                            mbaPrimes[lsN] ^= true;

                        if (lsX > lsY)
                        {
                            lsN -= lsY2 * 2;
                            if (lsN <= msLimit && lsN % 12 == 11)
                                mbaPrimes[lsN] ^= true;
                        }
                    }
                }
            }

            for (int lsN = 5; lsN < lsSQRT; lsN += 2)
                if (mbaPrimes[lsN])
                {
                    var lsS = lsSquares[lsN];
                    for (int lsK = lsS; lsK <= msLimit; lsK += lsS)
                        mbaPrimes[lsK] = false;
                }
        }
    }
}

和 CompartmentalisedParallel 类：

using System;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public static class CompartmentalisedParallel
    {
        #region Int

        private static int[] CalculateCompartments(int startInclusive, int endExclusive, ref int threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            int[] liThreadIndexes = new int[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            int liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (int liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion

        #region Long

        private static long[] CalculateCompartments(long startInclusive, long endExclusive, ref long threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            long[] liThreadIndexes = new long[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            long liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (long liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion
    }
}

素数基类：

using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public abstract class Primes : IEnumerable<int>
    {
        protected readonly int msLimit;

        public Primes(int limit)
        {
            msLimit = limit;
        }

        public int Limit
        {
            get
            {
                return msLimit;
            }
        }

        public int Count
        {
            get
            {
                int liCount = 0;
                foreach (int liPrime in this)
                    liCount++;
                return liCount;
            }
        }

        public abstract IEnumerator<int> GetEnumerator();

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}

通过执行以下操作来使用它：

    var lpPrimes = new Atkin(count, true);
    Console.WriteLine(lpPrimes.Count);
    Console.WriteLine(s.ElapsedMilliseconds);

然而，我发现 Eratosthenes 在所有情况下都更快，即使是在 Atkin 的多线程模式下运行的四核 CPU：

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public class Eratosthenes : Primes
    {
        protected BitArray mbaOddEliminated;

        public Eratosthenes(int limit)
            : base(limit)
        {
            if (mbaOddEliminated == null) FindPrimes();
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            for (int lsN = 3; lsN <= msLimit; lsN+=2)
                if (!mbaOddEliminated[lsN>>1]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaOddEliminated = new BitArray((msLimit>>1) + 1);
            int lsSQRT = (int)Math.Sqrt(msLimit);
            for (int lsN = 3; lsN < lsSQRT + 1; lsN += 2)
                if (!mbaOddEliminated[lsN>>1])
                    for (int lsM = lsN*lsN; lsM <= msLimit; lsM += lsN<<1)
                        mbaOddEliminated[lsM>>1] = true;
        }
    }
}

如果你让阿特金跑得更快，请告诉我！

score 0 · Accepted Answer

这是对 Eratosthenes 筛的改进，使用自定义 FixBitArrays 和不安全代码来获得速度结果，这比我以前的 Eratosthenes 算法快 225%，并且该类是独立的（这不是多线程的 - Eratosthenes 与多线程不兼容），在 AMD Phenom II X4 965 处理器上，我可以在 9,261 毫秒内计算 Primes 到 1,000,000,000 的限制：

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.UInt32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Eratosthenes : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddNotPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Eratosthenes generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Eratosthenes(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddNotPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            // Cache Sqrt of limit.
            PrimeType lpSQRT = (PrimeType)Math.Sqrt(mpLimit);

            // Fix the bit array for pointer access
            fixed (BitArrayType* lpbOddNotPrime = &mbaOddNotPrime[0])
                // Scan primes up to lpSQRT
                for (PrimeType lpN = 3; lpN <= lpSQRT; lpN += 2)
                    // If the current bit value for index lpN is cleared (prime)
                    if (
                            (
                                lpbOddNotPrime[(lpN >> 1) / cbBitsPerBlock] & 
                                ((BitArrayType)1 << (BitsPerBlockType)((lpN >> 1) % cbBitsPerBlock))
                            ) == 0
                        )
                        // Leave it cleared (prime) and mark all multiples of lpN*2 from lpN*lpN as not prime
                        for (PrimeType lpM = lpN * lpN; lpM <= mpLimit; lpM += lpN << 1)
                            // Set as not prime
                            lpbOddNotPrime[(lpM >> 1) / cbBitsPerBlock] |= 
                                (BitArrayType)((BitArrayType)1 << (BitsPerBlockType)((lpM >> 1) % cbBitsPerBlock));
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            return (bits[index / cbBitsPerBlock] & ((BitArrayType)1 << (BitsPerBlockType)(index % cbBitsPerBlock))) != 0;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                PrimeType lptCount = 0;
                foreach (PrimeType liPrime in this)
                    lptCount++;
                return lptCount;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddNotPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            // Two always prime.
            yield return 2;

            // Start at first block, second MSB.
            int liBlock = 0;
            byte lbBit = 1;
            BitArrayType lbaCurrent = mbaOddNotPrime[0] >> 1;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 3; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 0)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value.
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    // Move to first bit of next block.
                    lbBit = 0;
                    liBlock++;
                    lbaCurrent = mbaOddNotPrime[liBlock];
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator for the prime numbers.</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

在 1,000,000,000 中发现的素数：50,847,534 在 9,261 毫秒内

c# - C#：阿特金筛法的实现

6 回答 6

Related

Reference