_mm_div_ps 用于浮点值除法,_mm_mullo_epi16 用于整数乘法。但是有整数除法(16位值)吗?我该如何进行这样的划分?
4 回答
数学说确实有可能走得更快
如果使用单个除数进行除法,Agner Fog 的 ( http://www.agner.org/optimize/#vectorclass ) 方法效果很好。此外,如果除数在编译时已知,或者在运行时不经常更改,则此方法还有更多好处。
但是,当对 __m128i 寄存器中的元素执行 SIMD 除法,且在编译时对除数和被除数都没有任何信息可用时,我们别无选择,只能转换为浮点数并执行计算。另一方面,使用 _mm_div_ps
不会给我们带来惊人的速度提升,因为这条指令在大多数微架构上具有 11 到 14 个周期的可变延迟,如果我们考虑 Knights Landing,有时可以达到 38 个周期。最重要的是,该指令没有完全流水线化,并且根据微架构具有 3-6 个周期的倒数吞吐量。
但是,我们可以避免_mm_div_ps
并_mm_rcp_ss
改为使用。
不幸__m128 _mm_rcp_ss (__m128 a)
的是,它之所以快只是因为它提供了近似值。即(取自英特尔内部指南):
计算 a 中低位单精度(32 位)浮点元素的近似倒数,将结果存储在 dst 的低位元素中,并将高位 3 个压缩元素从 a 复制到 dst 的高位元素。此近似值的最大相对误差小于 1.5*2^-12。
因此,为了从 中受益_mm_rcp_ss
,我们需要补偿近似造成的损失。Niels Möller 和 Torbjörn Granlund在改进的不变量整数除法中提供了这方面的出色工作:
由于当前处理器中缺乏有效的除法指令,除法是使用除数倒数的预先计算的单字近似值作为乘法执行的,然后是几个调整步骤。
要计算 16 位有符号整数除法,我们只需要一个调整步骤,并据此确定我们的解决方案。
SSE2
//
// Divide the eight signed 16-bit lanes of a_epi16 by the matching lanes
// of b_epi16, truncating toward zero exactly like C integer division.
//
// Strategy: sign-extend the odd (hi) and even (lo) 16-bit lanes into two
// vectors of 32-bit ints, convert to float, approximate 1/b with RCPPS,
// apply one Newton-Raphson refinement step, multiply, truncate back, and
// re-interleave. Behavior for a zero divisor lane is undefined, as with
// the scalar '/' operator.
//
// Parameters are taken by value (plain __m128i) so the function is valid
// C as well as C++; every existing call site is source-compatible.
//
static inline __m128i _mm_div_epi16(const __m128i a_epi16, const __m128i b_epi16) {
    //
    // Setup the constants. 'two' is slightly above 2.0f so the refinement
    // step over-corrects just enough that truncation lands on the exact
    // integer quotient (brute-force verified over the full 16-bit range).
    //
    const __m128 two = _mm_set1_ps(2.00000051757f);
    //
    // Sign-extend the hi/lo 16-bit lanes into two vectors of 32-bit ints.
    //
    const __m128i a_hi_epi32 = _mm_srai_epi32(a_epi16, 16);
    const __m128i a_lo_epi32_shift = _mm_slli_epi32(a_epi16, 16);
    const __m128i a_lo_epi32 = _mm_srai_epi32(a_lo_epi32_shift, 16);
    const __m128i b_hi_epi32 = _mm_srai_epi32(b_epi16, 16);
    const __m128i b_lo_epi32_shift = _mm_slli_epi32(b_epi16, 16);
    const __m128i b_lo_epi32 = _mm_srai_epi32(b_lo_epi32_shift, 16);
    //
    // Convert to 32-bit floats (every 16-bit integer is exactly
    // representable in single precision).
    //
    const __m128 a_hi = _mm_cvtepi32_ps(a_hi_epi32);
    const __m128 a_lo = _mm_cvtepi32_ps(a_lo_epi32);
    const __m128 b_hi = _mm_cvtepi32_ps(b_hi_epi32);
    const __m128 b_lo = _mm_cvtepi32_ps(b_lo_epi32);
    //
    // Approximate the reciprocals of the divisors (max relative error
    // < 1.5 * 2^-12 per the Intel intrinsics guide).
    //
    const __m128 b_hi_rcp = _mm_rcp_ps(b_hi);
    const __m128 b_lo_rcp = _mm_rcp_ps(b_lo);
    //
    // One Newton-Raphson step: refined = rcp * (two - rcp * b).
    //
#ifdef __FMA__
    const __m128 b_hi_inv_1 = _mm_fnmadd_ps(b_hi_rcp, b_hi, two);
    const __m128 b_lo_inv_1 = _mm_fnmadd_ps(b_lo_rcp, b_lo, two);
#else
    const __m128 b_mul_hi = _mm_mul_ps(b_hi_rcp, b_hi);
    const __m128 b_mul_lo = _mm_mul_ps(b_lo_rcp, b_lo);
    const __m128 b_hi_inv_1 = _mm_sub_ps(two, b_mul_hi);
    const __m128 b_lo_inv_1 = _mm_sub_ps(two, b_mul_lo);
#endif
    const __m128 b_hi_rcp_1 = _mm_mul_ps(b_hi_rcp, b_hi_inv_1);
    const __m128 b_lo_rcp_1 = _mm_mul_ps(b_lo_rcp, b_lo_inv_1);
    //
    // Perform the division by multiplying with the refined reciprocal.
    //
    const __m128 hi = _mm_mul_ps(a_hi, b_hi_rcp_1);
    const __m128 lo = _mm_mul_ps(a_lo, b_lo_rcp_1);
    //
    // Truncate back to integers (CVTTPS rounds toward zero, matching C).
    //
    const __m128i hi_epi32 = _mm_cvttps_epi32(hi);
    const __m128i lo_epi32 = _mm_cvttps_epi32(lo);
    //
    // Re-interleave: hi quotients back into the odd 16-bit lanes.
    //
    const __m128i hi_epi32_shift = _mm_slli_epi32(hi_epi32, 16);
#ifdef __SSE4_1__
    return _mm_blend_epi16(lo_epi32, hi_epi32_shift, 0xAA);
#else
    //
    // BUGFIX: the original referenced the undefined identifier
    // 'const_mm_div_epi16_lo_mask' here; it must be the lo_mask constant
    // (also moved into this branch, where it is the only use).
    //
    const __m128i lo_mask = _mm_set1_epi32(0xFFFF);
    const __m128i lo_epi32_mask = _mm_and_si128(lo_epi32, lo_mask);
    return _mm_or_si128(hi_epi32_shift, lo_epi32_mask);
#endif
}
此解决方案只需要 SSE2 即可使用;如果 FMA 可用,则会使用 FMA 指令。
但是,使用普通除法可能与使用近似值一样快(甚至更快)。
在存在这种解决方案的情况下可以改进,因为可以使用一个寄存器AVX
同时处理高和低部分。AVX
验证
由于我们只处理 16 位,我们可以使用蛮力测试在几秒钟内轻松验证解决方案的正确性:
// Print the eight signed 16-bit lanes of a vector, each right-aligned in
// an 8-character column, followed by a newline.
void print_epi16(__m128i a)
{
    int16_t lanes[8];
    _mm_storeu_si128((__m128i *) lanes, a);
    for (size_t k = 0; k < 8; ++k) {
        printf("%8d ", (int) lanes[k]);
    }
    printf("\n");
}
// Validate one batch of eight lane pairs: compare the scalar quotients
// a[i] / b[i] against the vectorized _mm_div_epi16 result. On mismatch,
// dump the inputs and both result vectors. Returns true when all lanes
// agree.
bool run_mm_div_epi16(const int16_t *a, const int16_t *b)
{
    const size_t n = 8;
    int16_t result_expected[n];
    int16_t result_obtained[n];
    // Scalar reference quotients.
    for (size_t k = 0; k < n; ++k) {
        result_expected[k] = a[k] / b[k];
    }
    // Vectorized quotients.
    const __m128i va = _mm_loadu_si128((__m128i *) a);
    const __m128i vb = _mm_loadu_si128((__m128i *) b);
    const __m128i vr = _mm_div_epi16(va, vb);
    _mm_storeu_si128((__m128i *) result_obtained, vr);
    // Lane-wise comparison; report details only on failure.
    const bool eq = std::equal(result_obtained, result_obtained + n, result_expected);
    if (!eq) {
        cout << "Testing of _mm_div_epi16 failed" << endl << endl;
        cout << "a: ";
        print_epi16(va);
        cout << "b: ";
        print_epi16(vb);
        cout << endl;
        cout << "results_obtained: ";
        print_epi16(vr);
        cout << "results_expected: ";
        print_epi16(_mm_loadu_si128((__m128i *) result_expected));
        cout << endl;
    }
    return eq;
}
// Brute-force validation of _mm_div_epi16: pairs every 16-bit dividend
// (in batches of 8) with every non-zero 16-bit divisor. Divisor 0 is
// skipped because division by zero is undefined, exactly as for the
// scalar '/' operator. Stops at the first failing batch.
void test_mm_div_epi16()
{
    const int n = 8;
    bool correct = true;
    int16_t a[n];
    int16_t b[n];
    for (int32_t base = INT16_MIN; correct && base <= INT16_MAX; base += n) {
        // Fill one batch of consecutive dividends.
        for (int32_t lane = 0; lane < n; lane += 1) {
            a[lane] = (int16_t) (base + lane);
        }
        // Negative divisors first, then positive ones; zero is skipped.
        for (int32_t d = INT16_MIN; correct && d < 0; d += 1) {
            _mm_storeu_si128((__m128i *) b, _mm_set1_epi16((int16_t) d));
            correct = correct && run_mm_div_epi16(a, b);
        }
        for (int32_t d = 1; correct && d <= INT16_MAX; d += 1) {
            _mm_storeu_si128((__m128i *) b, _mm_set1_epi16((int16_t) d));
            correct = correct && run_mm_div_epi16(a, b);
        }
    }
    if (correct) {
        cout << "Done!" << endl;
    } else {
        cout << "_mm_div_epi16 can not be validated" << endl;
    }
}
AVX2
有了上面的解决方案,AVX2
实现就很简单了:
// AVX2 + FMA counterpart of _mm_div_epi16: divides the sixteen signed
// 16-bit lanes of a_epi16 by the matching lanes of b_epi16, truncating
// toward zero. Same reciprocal-refinement scheme, twice the width.
// Requires both AVX2 (_mm256 integer ops) and FMA (_mm256_fnmadd_ps).
// Behavior for a zero divisor lane is undefined, as with scalar '/'.
static inline __m256i _mm256_div_epi16(const __m256i &a_epi16, const __m256i &b_epi16) {
    // Slightly-above-2 constant for the Newton-Raphson refinement step.
    const __m256 two = _mm256_set1_ps(2.00000051757f);
    // Sign-extend the odd (hi) and even (lo) 16-bit lanes to 32-bit ints.
    const __m256i num_hi_i32 = _mm256_srai_epi32(a_epi16, 16);
    const __m256i num_lo_i32 = _mm256_srai_epi32(_mm256_slli_epi32(a_epi16, 16), 16);
    const __m256i den_hi_i32 = _mm256_srai_epi32(b_epi16, 16);
    const __m256i den_lo_i32 = _mm256_srai_epi32(_mm256_slli_epi32(b_epi16, 16), 16);
    // To single precision (all 16-bit values are exactly representable).
    const __m256 num_hi = _mm256_cvtepi32_ps(num_hi_i32);
    const __m256 num_lo = _mm256_cvtepi32_ps(num_lo_i32);
    const __m256 den_hi = _mm256_cvtepi32_ps(den_hi_i32);
    const __m256 den_lo = _mm256_cvtepi32_ps(den_lo_i32);
    // Approximate reciprocals, then one refinement: rcp * (two - rcp*den).
    const __m256 rcp_hi = _mm256_rcp_ps(den_hi);
    const __m256 rcp_lo = _mm256_rcp_ps(den_lo);
    const __m256 corr_hi = _mm256_fnmadd_ps(rcp_hi, den_hi, two);
    const __m256 corr_lo = _mm256_fnmadd_ps(rcp_lo, den_lo, two);
    // Divide by multiplying with the refined reciprocal.
    const __m256 quot_hi = _mm256_mul_ps(num_hi, _mm256_mul_ps(rcp_hi, corr_hi));
    const __m256 quot_lo = _mm256_mul_ps(num_lo, _mm256_mul_ps(rcp_lo, corr_lo));
    // Truncate toward zero (matching C) and re-interleave the hi/lo
    // quotients into alternating 16-bit lanes.
    const __m256i q_hi_i32 = _mm256_cvttps_epi32(quot_hi);
    const __m256i q_lo_i32 = _mm256_cvttps_epi32(quot_lo);
    return _mm256_blend_epi16(q_lo_i32, _mm256_slli_epi32(q_hi_i32, 16), 0xAA);
}
我们可以使用上述相同的方法来执行代码验证。
表现
我们可以使用每周期的测量触发器 (F/C) 来评估性能。在这种情况下,我们希望查看每个周期可以执行多少个除法。为此,我们定义了两个向量a
并b
执行逐点除法。两者a
和都使用xorshift32b
填充随机数据,初始化为uint32_t state = 3853970173;
我RDTSC
用来测量周期,使用暖缓存执行 15 次重复,并使用中值作为结果。为了避免频率缩放和资源共享对测量的影响,Turbo Boost 和 Hyper-Threading 被禁用。为了运行代码,我使用Intel Xeon CPU E3-1285L v3
3.10GHz Haswell,32GB RAM 和 25.6GB/s 带宽到主内存,运行 Debian GNU/Linux 8 (jessie) kernel 3.16.43-2+deb8u3
,. gcc
使用的是4.9.2-10
. 结果如下:
纯SSE2
实现
我们将普通除法与上面提出的算法进行比较:
===============================================================
= Compiler & System info
===============================================================
Current CPU : Intel(R) Xeon(R) CPU E3-1285L v3 @ 3.10GHz
CXX Compiler ID : GNU
CXX Compiler Path : /usr/bin/c++
CXX Compiler Version : 4.9.2
CXX Compiler Flags : -O3 -std=c++11 -msse2 -mno-fma
--------------------------------------------------------------------------------
| Size | Division F/C | Division B/W | Approx. F/C | Approximation B/W |
--------------------------------------------------------------------------------
| 128 | 0.5714286 | 26911.45 MB/s | 0.5019608 | 23634.21 MB/s |
| 256 | 0.5714286 | 26909.17 MB/s | 0.5039370 | 23745.44 MB/s |
| 512 | 0.5707915 | 26928.14 MB/s | 0.5039370 | 23763.79 MB/s |
| 1024 | 0.5707915 | 26936.33 MB/s | 0.5039370 | 23776.85 MB/s |
| 2048 | 0.5709507 | 26938.51 MB/s | 0.5039370 | 23780.25 MB/s |
| 4096 | 0.5708711 | 26940.56 MB/s | 0.5039990 | 23782.65 MB/s |
| 8192 | 0.5708711 | 26940.16 MB/s | 0.5039370 | 23781.85 MB/s |
| 16384 | 0.5704735 | 26921.76 MB/s | 0.4954040 | 23379.24 MB/s |
| 32768 | 0.5704537 | 26921.26 MB/s | 0.4954639 | 23382.13 MB/s |
| 65536 | 0.5703147 | 26914.53 MB/s | 0.4943539 | 23330.13 MB/s |
| 131072 | 0.5691680 | 26860.21 MB/s | 0.4929539 | 23264.40 MB/s |
| 262144 | 0.5690618 | 26855.60 MB/s | 0.4929187 | 23262.22 MB/s |
| 524288 | 0.5691378 | 26858.75 MB/s | 0.4929488 | 23263.56 MB/s |
| 1048576 | 0.5677474 | 26794.14 MB/s | 0.4918968 | 23214.34 MB/s |
| 2097152 | 0.5371243 | 25348.39 MB/s | 0.4700511 | 22183.07 MB/s |
| 4194304 | 0.5128146 | 24200.82 MB/s | 0.4529809 | 21377.28 MB/s |
| 8388608 | 0.5036971 | 23770.36 MB/s | 0.4438345 | 20945.84 MB/s |
| 16777216 | 0.5005390 | 23621.14 MB/s | 0.4409909 | 20811.32 MB/s |
| 33554432 | 0.4992792 | 23561.90 MB/s | 0.4399777 | 20763.49 MB/s |
--------------------------------------------------------------------------------
我们可以观察到普通分割将如何比建议的近似步骤略快。在这种情况下,我们可以得出结论SSE2
,在 Haswell 微架构上使用近似值将不是最优的。
但是,如果我们在旧的 Sandy Bridge 机器上运行相同的结果,例如 Intel(R) Xeon(R) CPU X5680 @ 3.33GHz,我们已经可以看到近似值的好处:
===============================================================
= Compiler & System info
===============================================================
Current CPU : Intel(R) Xeon(R) CPU X5680 @ 3.33GHz
CXX Compiler ID : GNU
CXX Compiler Path : /usr/bin/c++
CXX Compiler Version : 4.8.5
CXX Compiler Flags : -O3 -std=c++11 -msse2 -mno-fma
--------------------------------------------------------------------------------
| Size | Division F/C | Division B/W | Approx. F/C | Approximation B/W |
--------------------------------------------------------------------------------
| 128 | 0.2857143 | 14511.41 MB/s | 0.3720930 | 18899.89 MB/s |
| 256 | 0.2853958 | 14512.51 MB/s | 0.3715530 | 18898.91 MB/s |
| 512 | 0.2853958 | 14510.53 MB/s | 0.3715530 | 18896.44 MB/s |
| 1024 | 0.2853162 | 14511.81 MB/s | 0.3700759 | 18824.00 MB/s |
| 2048 | 0.2853162 | 14511.04 MB/s | 0.3708130 | 18860.31 MB/s |
| 4096 | 0.2852964 | 14511.16 MB/s | 0.3711826 | 18879.27 MB/s |
| 8192 | 0.2852666 | 14510.23 MB/s | 0.3713172 | 18886.39 MB/s |
| 16384 | 0.2852616 | 14509.86 MB/s | 0.3712920 | 18885.60 MB/s |
| 32768 | 0.2852244 | 14507.93 MB/s | 0.3712709 | 18884.86 MB/s |
| 65536 | 0.2851003 | 14501.41 MB/s | 0.3701114 | 18826.14 MB/s |
| 131072 | 0.2850711 | 14499.95 MB/s | 0.3685017 | 18743.58 MB/s |
| 262144 | 0.2850745 | 14500.47 MB/s | 0.3684799 | 18742.78 MB/s |
| 524288 | 0.2848062 | 14486.66 MB/s | 0.3681040 | 18723.63 MB/s |
| 1048576 | 0.2846679 | 14479.64 MB/s | 0.3671284 | 18674.02 MB/s |
| 2097152 | 0.2840133 | 14446.52 MB/s | 0.3664623 | 18640.01 MB/s |
| 4194304 | 0.2745241 | 13963.13 MB/s | 0.3488823 | 17745.24 MB/s |
| 8388608 | 0.2741900 | 13946.39 MB/s | 0.3476036 | 17680.37 MB/s |
| 16777216 | 0.2740689 | 13940.32 MB/s | 0.3477076 | 17685.97 MB/s |
| 33554432 | 0.2746752 | 13970.75 MB/s | 0.3482017 | 17711.36 MB/s |
--------------------------------------------------------------------------------
如果更老的机器(例如 Nehalem,假设它支持所需的 RCP 指令)可用,看看这种方法在其上的表现会更有意思。
SSE41
+FMA
实施
我们将普通除法与上面提出的算法进行比较,启用FMA
和SSE41
===============================================================
= Compiler & System info
===============================================================
Current CPU : Intel(R) Xeon(R) CPU E3-1285L v3 @ 3.10GHz
CXX Compiler ID : GNU
CXX Compiler Path : /usr/bin/c++
CXX Compiler Version : 4.9.2
CXX Compiler Flags : -O3 -std=c++11 -msse4.1 -mfma
--------------------------------------------------------------------------------
| Size | Division F/C | Division B/W | Approx. F/C | Approximation B/W |
--------------------------------------------------------------------------------
| 128 | 0.5714286 | 26884.20 MB/s | 0.5423729 | 25506.41 MB/s |
| 256 | 0.5701559 | 26879.92 MB/s | 0.5412262 | 25503.95 MB/s |
| 512 | 0.5701559 | 26904.68 MB/s | 0.5423729 | 25584.65 MB/s |
| 1024 | 0.5704735 | 26911.46 MB/s | 0.5429480 | 25622.57 MB/s |
| 2048 | 0.5704735 | 26915.03 MB/s | 0.5433802 | 25640.09 MB/s |
| 4096 | 0.5703941 | 26917.72 MB/s | 0.5435965 | 25651.63 MB/s |
| 8192 | 0.5703544 | 26915.85 MB/s | 0.5436687 | 25656.76 MB/s |
| 16384 | 0.5699972 | 26898.44 MB/s | 0.5262583 | 24834.54 MB/s |
| 32768 | 0.5699873 | 26898.93 MB/s | 0.5262076 | 24833.21 MB/s |
| 65536 | 0.5698882 | 26894.48 MB/s | 0.5250567 | 24778.35 MB/s |
| 131072 | 0.5697024 | 26885.50 MB/s | 0.5224302 | 24654.59 MB/s |
| 262144 | 0.5696950 | 26884.72 MB/s | 0.5223095 | 24649.49 MB/s |
| 524288 | 0.5696937 | 26885.37 MB/s | 0.5223308 | 24650.21 MB/s |
| 1048576 | 0.5690340 | 26854.14 MB/s | 0.5220133 | 24634.71 MB/s |
| 2097152 | 0.5455717 | 25746.56 MB/s | 0.5041949 | 23794.65 MB/s |
| 4194304 | 0.5125461 | 24188.11 MB/s | 0.4756604 | 22447.05 MB/s |
| 8388608 | 0.5043430 | 23800.67 MB/s | 0.4659974 | 21991.51 MB/s |
| 16777216 | 0.5017375 | 23677.94 MB/s | 0.4614457 | 21776.58 MB/s |
| 33554432 | 0.5005865 | 23623.50 MB/s | 0.4596277 | 21690.63 MB/s |
--------------------------------------------------------------------------------
FMA
+SSE4.1
确实给我们带来了一定程度的改进,但这还不够好。
AVX2
+FMA
实施
AVX2
最后,我们可以看到将普通除法与近似方法进行比较的真正好处:
===============================================================
= Compiler & System info
===============================================================
Current CPU : Intel(R) Xeon(R) CPU E3-1285L v3 @ 3.10GHz
CXX Compiler ID : GNU
CXX Compiler Path : /usr/bin/c++
CXX Compiler Version : 4.9.2
CXX Compiler Flags : -O3 -std=c++11 -march=haswell
--------------------------------------------------------------------------------
| Size | Division F/C | Division B/W | Approx. F/C | Approximation B/W |
--------------------------------------------------------------------------------
| 128 | 0.5663717 | 26672.73 MB/s | 0.9481481 | 44627.89 MB/s |
| 256 | 0.5651214 | 26653.72 MB/s | 0.9481481 | 44651.56 MB/s |
| 512 | 0.5644983 | 26640.36 MB/s | 0.9463956 | 44660.99 MB/s |
| 1024 | 0.5657459 | 26689.41 MB/s | 0.9552239 | 45044.21 MB/s |
| 2048 | 0.5662151 | 26715.40 MB/s | 0.9624060 | 45405.33 MB/s |
| 4096 | 0.5663717 | 26726.27 MB/s | 0.9671783 | 45633.64 MB/s |
| 8192 | 0.5664500 | 26732.42 MB/s | 0.9688941 | 45724.83 MB/s |
| 16384 | 0.5699377 | 26896.04 MB/s | 0.9092624 | 42909.11 MB/s |
| 32768 | 0.5699675 | 26897.85 MB/s | 0.9087077 | 42883.21 MB/s |
| 65536 | 0.5699625 | 26898.59 MB/s | 0.9001456 | 42480.91 MB/s |
| 131072 | 0.5699253 | 26896.38 MB/s | 0.8926057 | 42124.09 MB/s |
| 262144 | 0.5699117 | 26895.58 MB/s | 0.8928610 | 42137.13 MB/s |
| 524288 | 0.5698622 | 26892.87 MB/s | 0.8928002 | 42133.63 MB/s |
| 1048576 | 0.5685829 | 26833.13 MB/s | 0.8894302 | 41974.25 MB/s |
| 2097152 | 0.5558453 | 26231.90 MB/s | 0.8371921 | 39508.55 MB/s |
| 4194304 | 0.5224387 | 24654.67 MB/s | 0.7436747 | 35094.81 MB/s |
| 8388608 | 0.5143588 | 24273.46 MB/s | 0.7185252 | 33909.08 MB/s |
| 16777216 | 0.5107452 | 24103.19 MB/s | 0.7133449 | 33664.28 MB/s |
| 33554432 | 0.5101245 | 24074.10 MB/s | 0.7125114 | 33625.03 MB/s |
--------------------------------------------------------------------------------
结论
这种方法绝对可以提供对普通除法的加速。实际上可以达到多少加速,这实际上取决于底层架构,以及该部门如何与应用程序逻辑的其余部分进行交互。
请参阅 Agner Fog 的矢量类库,他实现了一个快速算法,用 SSE/AVX 对 8 位、16 位和 32 位整数(但不是 64 位)进行整数除法:http://www.agner.org/optimize/#vectorclass
在文件 vectori128.h 中查找代码和算法的描述作为他写得很好的手册 VectorClass.pdf
这是他手册中描述算法的片段。
“整数除法x86指令集及其扩展中没有对整数向量除法有用的指令,如果存在这样的指令会很慢。因此,向量类库正在使用一种快速整数除法的算法。这个算法的基本原理可以用这个公式来表示: a / b ≈ a * (2n / b) >> n 这个计算经过以下步骤: 1. 找到一个合适的 n 值 2. 计算 2n / b 3.计算舍入误差的必要修正 4. 进行乘法和右移并应用舍入误差修正
如果多个数字除以相同的除数 b,则此公式是有利的。步骤 1、2 和 3 只需要执行一次,而针对被除数 a 的每个值重复步骤 4。数学细节在文件 vectori128.h 中描述。(另请参见 T. Granlund 和 PL Montgomery:使用乘法除以不变整数,SIGPLAN 会议记录。”...
编辑:在文件 vectori128.h 的末尾附近显示了如何使用标量变量进行短除法 “计算用于快速除法的参数比进行除法需要更多时间。因此,使用相同的除数是有利的对象多次。例如,要将 80 个无符号短整数除以 10:
short x = 10;
uint16_t dividends[80], quotients[80]; // numbers to work with
Divisor_us div10(x); // make divisor object for dividing by 10
Vec8us temp; // temporary vector
for (int i = 0; i < 80; i += 8) { // loop for 8 elements per iteration
temp.load(dividends+i); // load 8 elements
temp /= div10; // divide each element by 10
temp.store(quotients+i); // store 8 elements
}
"
编辑:16 位短整数(short)向量的整数除法
#include <stdio.h>
#include "vectorclass.h"
int main() {
short numa[] = {10, 20, 30, 40, 50, 60, 70, 80};
short dena[] = {10, 20, 30, 40, 50, 60, 70, 80};
Vec8s num = Vec8s().load(numa);
Vec8s den = Vec8s().load(dena);
Vec4f num_low = to_float(extend_low(num));
Vec4f num_high = to_float(extend_high(num));
Vec4f den_low = to_float(extend_low(den));
Vec4f den_high = to_float(extend_high(den));
Vec4f qf_low = num_low/den_low;
Vec4f qf_high = num_high/den_high;
Vec4i q_low = truncate_to_int(qf_low);
Vec4i q_high = truncate_to_int(qf_high);
Vec8s q = compress(q_low, q_high);
for(int i=0; i<8; i++) {
printf("%d ", q[i]);
} printf("\n");
}
对于 8 位除法,可以通过创建幻数表来实现。
签:
__m128i _mm_div_epi8(__m128i a, __m128i b)
{
__m128i abs_b = _mm_abs_epi8(b);
static const uint16_t magic_number_table[129] =
{
0x0000, 0x0000, 0x8080, 0x5580, 0x4040, 0x3380, 0x2ac0, 0x24c0, 0x2020, 0x1c80, 0x19c0, 0x1760, 0x1560, 0x13c0, 0x1260, 0x1120,
0x1010, 0x0f20, 0x0e40, 0x0d80, 0x0ce0, 0x0c40, 0x0bb0, 0x0b30, 0x0ab0, 0x0a40, 0x09e0, 0x0980, 0x0930, 0x08e0, 0x0890, 0x0850,
0x0808, 0x07d0, 0x0790, 0x0758, 0x0720, 0x06f0, 0x06c0, 0x0698, 0x0670, 0x0640, 0x0620, 0x05f8, 0x05d8, 0x05b8, 0x0598, 0x0578,
0x0558, 0x0540, 0x0520, 0x0508, 0x04f0, 0x04d8, 0x04c0, 0x04b0, 0x0498, 0x0480, 0x0470, 0x0458, 0x0448, 0x0438, 0x0428, 0x0418,
0x0404, 0x03f8, 0x03e8, 0x03d8, 0x03c8, 0x03b8, 0x03ac, 0x03a0, 0x0390, 0x0388, 0x0378, 0x0370, 0x0360, 0x0358, 0x034c, 0x0340,
0x0338, 0x032c, 0x0320, 0x0318, 0x0310, 0x0308, 0x02fc, 0x02f4, 0x02ec, 0x02e4, 0x02dc, 0x02d4, 0x02cc, 0x02c4, 0x02bc, 0x02b4,
0x02ac, 0x02a8, 0x02a0, 0x0298, 0x0290, 0x028c, 0x0284, 0x0280, 0x0278, 0x0274, 0x026c, 0x0268, 0x0260, 0x025c, 0x0258, 0x0250,
0x024c, 0x0248, 0x0240, 0x023c, 0x0238, 0x0234, 0x022c, 0x0228, 0x0224, 0x0220, 0x021c, 0x0218, 0x0214, 0x0210, 0x020c, 0x0208,
0x0202
};
Uint8 load_den[16];
_mm_storeu_si128((__m128i*)load_den, abs_b);
uint16_t mul[16];
for (size_t i = 0; i < 16; i++)
{
uint16_t cur_den = load_den[i];
mul[i] = magic_number_table[cur_den];
}
// for denominator 1, magic number is 0x10080 that 16bit-overflow occurs.
__m128i one = _mm_set1_epi8(1);
__m128i is_one = _mm_cmpeq_epi8(abs_b, one);
// -128/-128 is a special case where magic number does not work.
__m128i v80 = _mm_set1_epi8(0x80);
__m128i is_80_a = _mm_cmpeq_epi8(a, v80);
__m128i is_80_b = _mm_cmpeq_epi8(b, v80);
__m128i is_80 = _mm_and_si128(is_80_a, is_80_b);
// __m128i zero = _mm_setzero_si128();
// __m128i less_a = _mm_cmpgt_epi8(zero, a);
// __m128i less_b = _mm_cmpgt_epi8(zero, b);
// __m128i sign = _mm_xor_si128(less_a, less_b);
__m128i abs_a = _mm_abs_epi8(a);
#if 0
__m128i p = _mm_unpacklo_epi8(abs_a, zero);
__m128i q = _mm_unpackhi_epi8(abs_a, zero);
__m256i c = _mm256_castsi128_si256(p);
c = _mm256_insertf128_si256(c, q, 1);
#else
// Thanks to Peter Cordes
__m256i c = _mm256_cvtepu8_epi16(abs_a);
#endif
__m256i magic = _mm256_loadu_si256((const __m256i*)mul);
__m256i high = _mm256_mulhi_epu16(magic, c);
__m128i v0h = _mm256_extractf128_si256(high, 0);
__m128i v0l = _mm256_extractf128_si256(high, 1);
__m128i res = _mm_packus_epi16(v0h, v0l);
__m128i div = _mm_blendv_epi8(res, abs_a, is_one);
// __m128i neg = _mm_sub_epi8(zero, div);
// __m128i select = _mm_blendv_epi8(div, neg, sign);
__m128i select = _mm_sign_epi8(div, _mm_or_si128(_mm_xor_si128(a, b), one));
return _mm_blendv_epi8(select, one, is_80);
}
未签名:
//
// Divide the sixteen unsigned 8-bit lanes of n by the matching lanes of
// den (truncating). Per-divisor constants are gathered from three
// 256-entry tables, the quotients are computed with 16-bit multiplies in
// a single 256-bit register, and per-lane either the plain high-multiply
// path or a corrected path is selected. A zero divisor lane is undefined
// behavior, as with the scalar '/' operator (index 0 entries are
// placeholders). Requires AVX2 (_mm256_cvtepu8_epi16 and friends) and
// SSE2 for the 128-bit pieces.
//
__m128i _mm_div_epu8(__m128i n, __m128i den)
{
// Per-divisor fixed-point "magic" multipliers; the quotient is derived
// from the high half of n * magic (possibly followed by the correction
// step selected by mask_table below).
static const uint16_t magic_number_table[256] =
{
0x0001, 0x0000, 0x8000, 0x5580, 0x4000, 0x3340, 0x2ac0, 0x04a0, 0x2000, 0x1c80, 0x19a0, 0x0750, 0x1560, 0x13c0, 0x0250, 0x1120,
0x1000, 0x0f10, 0x0e40, 0x0d80, 0x0cd0, 0x0438, 0x03a8, 0x0328, 0x0ab0, 0x0a40, 0x09e0, 0x0980, 0x0128, 0x00d8, 0x0890, 0x0048,
0x0800, 0x07c8, 0x0788, 0x0758, 0x0720, 0x06f0, 0x06c0, 0x0294, 0x0668, 0x0640, 0x021c, 0x05f8, 0x05d8, 0x01b4, 0x0194, 0x0578,
0x0558, 0x013c, 0x0520, 0x0508, 0x04f0, 0x04d8, 0x04c0, 0x04a8, 0x0094, 0x0480, 0x006c, 0x0458, 0x0448, 0x0034, 0x0024, 0x0014,
0x0400, 0x03f4, 0x03e4, 0x03d4, 0x03c8, 0x03b8, 0x03ac, 0x039c, 0x0390, 0x0384, 0x0378, 0x036c, 0x0360, 0x0354, 0x014a, 0x0340,
0x0334, 0x032c, 0x0320, 0x0318, 0x010e, 0x0304, 0x02fc, 0x02f4, 0x02ec, 0x02e4, 0x02dc, 0x02d4, 0x02cc, 0x02c4, 0x02bc, 0x02b4,
0x02ac, 0x02a4, 0x02a0, 0x0298, 0x0290, 0x028c, 0x0284, 0x007e, 0x0278, 0x0072, 0x026c, 0x0066, 0x0260, 0x025c, 0x0254, 0x0250,
0x004a, 0x0244, 0x0240, 0x023c, 0x0036, 0x0032, 0x022c, 0x0228, 0x0224, 0x001e, 0x001a, 0x0016, 0x0012, 0x000e, 0x000a, 0x0006,
0x0200, 0x00fd, 0x01fc, 0x01f8, 0x01f4, 0x01f0, 0x01ec, 0x01e8, 0x01e4, 0x01e0, 0x01dc, 0x01d8, 0x01d6, 0x01d4, 0x01d0, 0x01cc,
0x01c8, 0x01c4, 0x01c2, 0x01c0, 0x01bc, 0x01b8, 0x01b6, 0x01b4, 0x01b0, 0x01ae, 0x01ac, 0x01a8, 0x01a6, 0x01a4, 0x01a0, 0x019e,
0x019c, 0x0198, 0x0196, 0x0194, 0x0190, 0x018e, 0x018c, 0x018a, 0x0188, 0x0184, 0x0182, 0x0180, 0x017e, 0x017c, 0x017a, 0x0178,
0x0176, 0x0174, 0x0172, 0x0170, 0x016e, 0x016c, 0x016a, 0x0168, 0x0166, 0x0164, 0x0162, 0x0160, 0x015e, 0x015c, 0x015a, 0x0158,
0x0156, 0x0154, 0x0152, 0x0051, 0x0150, 0x014e, 0x014c, 0x014a, 0x0148, 0x0047, 0x0146, 0x0144, 0x0142, 0x0140, 0x003f, 0x013e,
0x013c, 0x013a, 0x0039, 0x0138, 0x0136, 0x0134, 0x0033, 0x0132, 0x0130, 0x002f, 0x012e, 0x012c, 0x012a, 0x0029, 0x0128, 0x0126,
0x0025, 0x0124, 0x0122, 0x0021, 0x0120, 0x001f, 0x011e, 0x011c, 0x001b, 0x011a, 0x0019, 0x0118, 0x0116, 0x0015, 0x0114, 0x0013,
0x0112, 0x0110, 0x000f, 0x010e, 0x000d, 0x010c, 0x000b, 0x010a, 0x0009, 0x0108, 0x0007, 0x0106, 0x0005, 0x0104, 0x0003, 0x0102
};
// Per-divisor pre-multiplier used by the correction path (applied to n
// before the round-up fix-up below) — presumably a power-of-two scaling
// factor paired with each magic number; TODO(review): confirm against
// whatever generator produced these tables.
static const uint16_t shift_table[256] =
{
0x0001, 0x0100, 0x0100, 0x0080, 0x0100, 0x0040, 0x0040, 0x0020, 0x0100, 0x0080, 0x0020, 0x0010, 0x0020, 0x0040, 0x0010, 0x0020,
0x0100, 0x0010, 0x0040, 0x0080, 0x0010, 0x0008, 0x0008, 0x0008, 0x0010, 0x0040, 0x0020, 0x0080, 0x0008, 0x0008, 0x0010, 0x0008,
0x0100, 0x0008, 0x0008, 0x0008, 0x0020, 0x0010, 0x0040, 0x0004, 0x0008, 0x0040, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0008,
0x0008, 0x0004, 0x0020, 0x0008, 0x0010, 0x0008, 0x0040, 0x0008, 0x0004, 0x0080, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0004,
0x0100, 0x0004, 0x0004, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0010, 0x0004, 0x0008, 0x0004, 0x0020, 0x0004, 0x0002, 0x0040,
0x0004, 0x0004, 0x0020, 0x0008, 0x0002, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004,
0x0004, 0x0004, 0x0020, 0x0008, 0x0010, 0x0004, 0x0004, 0x0002, 0x0008, 0x0002, 0x0004, 0x0002, 0x0020, 0x0004, 0x0004, 0x0010,
0x0002, 0x0004, 0x0040, 0x0004, 0x0002, 0x0002, 0x0004, 0x0008, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
0x0100, 0x0001, 0x0004, 0x0008, 0x0004, 0x0010, 0x0004, 0x0008, 0x0004, 0x0020, 0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0004,
0x0008, 0x0004, 0x0002, 0x0040, 0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0002, 0x0004, 0x0008, 0x0002, 0x0004, 0x0020, 0x0002,
0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0004, 0x0002, 0x0080, 0x0002, 0x0004, 0x0002, 0x0008,
0x0002, 0x0004, 0x0002, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0002, 0x0004, 0x0002, 0x0020, 0x0002, 0x0004, 0x0002, 0x0008,
0x0002, 0x0004, 0x0002, 0x0001, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0001, 0x0002, 0x0004, 0x0002, 0x0040, 0x0001, 0x0002,
0x0004, 0x0002, 0x0001, 0x0008, 0x0002, 0x0004, 0x0001, 0x0002, 0x0010, 0x0001, 0x0002, 0x0004, 0x0002, 0x0001, 0x0008, 0x0002,
0x0001, 0x0004, 0x0002, 0x0001, 0x0020, 0x0001, 0x0002, 0x0004, 0x0001, 0x0002, 0x0001, 0x0008, 0x0002, 0x0001, 0x0004, 0x0001,
0x0002, 0x0010, 0x0001, 0x0002, 0x0001, 0x0004, 0x0001, 0x0002, 0x0001, 0x0008, 0x0001, 0x0002, 0x0001, 0x0004, 0x0001, 0x0002
};
// Per-divisor selector: 0x0000 keeps the plain high-multiply result,
// 0xffff routes the lane through the corrected path computed below.
static const uint16_t mask_table[256] =
{
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000,
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000,
0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000,
0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff,
0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
};
// Gather the three per-divisor constants for each of the 16 lanes.
uint8_t load_den[16];
_mm_storeu_si128((__m128i*)load_den, den);
uint16_t mul[16];
uint16_t mask[16];
uint16_t shift[16];
for (size_t i = 0; i < 16; i++)
{
const uint16_t cur_den = load_den[i];
mul[i] = magic_number_table[cur_den];
mask[i] = mask_table[cur_den];
shift[i] = shift_table[cur_den];
}
#if 0
__m128i a = _mm_unpacklo_epi8(n, _mm_setzero_si128());
__m128i b = _mm_unpackhi_epi8(n, _mm_setzero_si128());
__m256i c = _mm256_castsi128_si256(a);
c = _mm256_insertf128_si256(c, b, 1);
#else
// Thanks to Peter Cordes
// Zero-extend the 16 dividend bytes to sixteen 16-bit lanes.
__m256i c = _mm256_cvtepu8_epi16(n);
#endif
__m256i magic = _mm256_loadu_si256((const __m256i*)mul);
// High and low 16-bit halves of the 16x16 products n * magic.
__m256i high = _mm256_mulhi_epu16(magic, c);
__m256i low = _mm256_mullo_epi16(magic, c);
// low_high = bits 8..23 of each 32-bit product (middle byte window).
__m256i low_down = _mm256_srli_epi16(low, 8);
__m256i high_up = _mm256_slli_epi16(high, 8);
__m256i low_high = _mm256_or_si256(low_down, high_up);
// Corrected path: q = (((n*shift - t) >> 1) + t) >> 7 with t = low_high —
// the classic round-up fix-up for divisors whose plain magic multiply
// falls short (cf. Granlund–Montgomery style division by invariant
// integers); TODO(review): confirm derivation matches the tables above.
__m256i target_up = _mm256_mullo_epi16(c, _mm256_loadu_si256((const __m256i*)shift));
__m256i cal1 = _mm256_sub_epi16(target_up, low_high);
__m256i cal2 = _mm256_srli_epi16(cal1, 1);
__m256i cal3 = _mm256_add_epi16(cal2, low_high);
__m256i cal4 = _mm256_srli_epi16(cal3, 7);
// Per-lane choice between the plain high result and the corrected one.
__m256i res = _mm256_blendv_epi8(high, cal4, _mm256_loadu_si256((const __m256i*)mask));
// Narrow the sixteen 16-bit quotients back to 8-bit lanes.
__m128i v0h = _mm256_extractf128_si256(res, 0);
__m128i v0l = _mm256_extractf128_si256(res, 1);
return _mm_packus_epi16(v0h, v0l);
}
由于原来的解决方案,这是为新手准备的:这个Agner Fog 的子程序库在优化方面与我一起发挥了魔力
这是您多次划分同一个变量值的情况(例如在大循环中)
#include <asmlib.h>
unsigned int a, b, d;
unsigned int divisor = any_random_value;
div_u32 OptimumDivision(divisor);
a/OptimumDivision;
b/OptimumDivision;
这适用于 unsigned int - 如果您需要负值div_i32
,则在我的测试中使用它会更快,即使手册说相反
我得到大约 3 倍或更多的性能