simd - 将 MMX/SSE 指令移植到 AltiVec

Question

让我以此作为开场白。我在 ASM 方面的经验非常有限，在 SIMD 方面的经验更是少之又少。

但碰巧我有以下 MMX/SSE 优化代码，我想移植到 AltiVec 指令以在 PPC/Cell 处理器上使用。

这可能是一个很大的问题。尽管它只有几行代码，但我在试图弄清楚这里发生的事情时遇到了无穷无尽的麻烦。

原函数：

/*
 * Dot product of two arrays of 16-bit integers, MMX version.
 *
 * a, b: input arrays of n shorts each. Inside the main loop they are
 *       dereferenced through __m64 pointers, i.e. read four shorts
 *       (8 bytes) at a time.
 * n:    number of elements to process.
 *
 * Returns the sum of a[i] * b[i] over the n elements as an int.
 */
static inline int convolve(const short *a, const short *b, int n)
{
    int out = 0;
    /* Lets the two 32-bit lanes of the MMX accumulator be read back as ints. */
    union {
        __m64 m64;
        int i32[2];
    } tmp;
    /* Zero both accumulator lanes. */
    tmp.i32[0] = 0;
    tmp.i32[1] = 0;
    /* Main loop: consume four elements of each array per iteration. */
    while (n >= 4) {
        /*
         * _mm_madd_pi16 multiplies the four pairs of signed 16-bit values
         * and adds adjacent products, producing two 32-bit partial sums;
         * _mm_add_pi32 then adds those to the two lanes already in tmp.
         */
        tmp.m64 = _mm_add_pi32(tmp.m64,
                               _mm_madd_pi16(*((__m64 *)a),
                                             *((__m64 *)b)));
        a += 4;
        b += 4;
        n -= 4;
    }
    /* Horizontal add: fold the two lanes into the scalar result. */
    out = tmp.i32[0] + tmp.i32[1];
    /* Clear the MMX state now that all MMX work is finished. */
    _mm_empty();

    /* Scalar tail: for non-negative n, the 0-3 elements the loop left over. */
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

关于如何重写它以使用 AltiVec 指令的任何提示？

我的第一次尝试（一次非常错误的尝试）看起来像这样。但它并不正确（甚至可以说差得很远）。

/*
 * Asker's first attempt at an AltiVec port of convolve(); it is presented
 * as NOT working. The code is left exactly as posted; the NOTE(review)
 * comments point out where it diverges from what the MMX original does.
 *
 * a, b: input arrays of n shorts each, read eight at a time in the loop.
 * n:    number of elements to process.
 */
static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    /*
     * NOTE(review): m128 has four 32-bit lanes (see the four-element
     * initialiser of `zero` below), but i64[2] exposes only two ints
     * (despite the name, the element type is int), so only part of the
     * vector can be initialised or read back through this union.
     */
    union {
        vector unsigned int m128;
        int i64[2];
    } tmp;

    vector unsigned int zero = {0, 0, 0, 0};

    /* NOTE(review): only two ints are cleared; the rest of tmp.m128 is never initialised. */
    tmp.i64[0] = 0;
    tmp.i64[1] = 0;
    /* Main loop: consume eight elements of each array per iteration. */
    while (n >= 8) {
        /*
         * NOTE(review): the inputs are signed shorts but are reinterpreted
         * as vector unsigned short here, and the accumulator is unsigned.
         * vec_msum already takes an accumulator as its third argument
         * (`zero` here), so the separate vec_add looks redundant -- confirm.
         * NOTE(review): the answers state that vector loads expect 16-byte
         * aligned addresses; nothing here checks that for a or b.
         */
        tmp.m128 = vec_add(tmp.m128,
                               vec_msum(*((vector unsigned short *)a),
                                             *((vector unsigned short *)b), zero));

        a += 8;
        b += 8;
        n -= 8;
    }
    /* NOTE(review): adds only the two ints visible through i64[], not all four lanes. */
    out = tmp.i64[0] + tmp.i64[1];
    /* NOTE(review): stray #endif -- no matching #if/#ifdef appears in this snippet. */
#endif
    /* Scalar tail for the elements the vector loop did not cover. */
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

score 3 · Accepted Answer

离你不远了——我修复了一些小问题，稍微清理了代码，添加了一个测试工具，现在它似乎可以正常工作了：

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>

/*
 * Scalar reference implementation: dot product of two arrays of shorts.
 *
 * a, b: input arrays, each holding at least n elements.
 * n:    number of elements to process; n <= 0 yields 0.
 *
 * Returns the sum of a[i] * b[i]. The sum is accumulated in unsigned
 * arithmetic so that it wraps modulo 2^32 instead of triggering signed
 * overflow (undefined behaviour) on large inputs. Assumes int is at
 * least 32 bits wide.
 */
static int convolve_ref(const short *a, const short *b, int n)
{
    unsigned int acc = 0;
    int i;

    for (i = 0; i < n; ++i)
    {
        /*
         * Each individual product fits in an int (magnitude <= 2^30);
         * only the running sum can exceed the int range.
         */
        acc += (unsigned int)(a[i] * b[i]);
    }

    /*
     * Conversion back to int is implementation-defined for values above
     * INT_MAX; on two's-complement targets it is the expected wraparound.
     */
    return (int)acc;
}

/*
 * Dot product of two arrays of 16-bit integers, AltiVec version.
 *
 * a, b: input arrays of n shorts each. Both must be 16-byte aligned
 *       (checked with assert) because they are loaded as whole vectors.
 * n:    number of elements to process; n <= 0 yields 0.
 *
 * Returns the sum of a[i] * b[i]. All accumulation wraps modulo 2^32,
 * so the result agrees with a scalar sum computed with wraparound.
 * Assumes int is 32 bits wide.
 */
static inline int convolve_altivec(const short *a, const short *b, int n)
{
    /* Lets the four 32-bit lanes of the vector accumulator be read as ints. */
    union {
        vector signed int m128;
        int i32[4];
    } tmp;

    const vector signed int zero = {0, 0, 0, 0};
    unsigned int acc;

    /* Only the low four address bits matter for the alignment check. */
    assert(((unsigned long)a & 15) == 0);
    assert(((unsigned long)b & 15) == 0);

    tmp.m128 = zero;

    /* Main loop: consume eight elements of each array per iteration. */
    while (n >= 8)
    {
        /*
         * vec_msum multiplies the eight pairs of signed shorts and adds
         * each adjacent pair of products into the matching 32-bit lane
         * of the accumulator passed as the third argument.
         */
        tmp.m128 = vec_msum(*(const vector signed short *)a,
                            *(const vector signed short *)b, tmp.m128);

        a += 8;
        b += 8;
        n -= 8;
    }

    /* Horizontal add of the four lanes, unsigned so overflow wraps. */
    acc = (unsigned int)tmp.i32[0] + (unsigned int)tmp.i32[1]
        + (unsigned int)tmp.i32[2] + (unsigned int)tmp.i32[3];

    /*
     * Scalar tail for the remaining 0-7 elements. The `n > 0` test also
     * keeps a negative n from running off the end of the arrays.
     */
    for (; n > 0; --n)
    {
        acc += (unsigned int)((*a) * (*b));
        ++a;
        ++b;
    }

    return (int)acc;
}

/*
 * Test harness: fills two buffers with pseudo-random shorts, runs the
 * scalar reference and the AltiVec implementation on them, and prints
 * both sums followed by PASS or FAIL.
 *
 * Returns EXIT_SUCCESS when the two sums agree and EXIT_FAILURE when they
 * differ, so the result can be checked from a script or build system.
 */
int main(void)
{
    /*
     * Enum constants rather than a `const int`, so the arrays below are
     * ordinary fixed-size arrays instead of variable-length arrays.
     */
    enum { N = 100, LANES = 8 };

    /*
     * Declared as arrays of vectors so the buffers meet the alignment that
     * convolve_altivec asserts; sized up to a whole number of vectors.
     */
    vector signed short vec_a[(N + LANES - 1) / LANES];
    vector signed short vec_b[(N + LANES - 1) / LANES];

    short *a = (short *)vec_a;
    short *b = (short *)vec_b;

    int sum_ref, sum_test;
    int passed;
    int i;

    for (i = 0; i < N; ++i)
    {
        /* rand() returns an int; the narrowing to short is intentional. */
        a[i] = (short)rand();
        b[i] = (short)rand();
    }

    sum_ref = convolve_ref(a, b, N);
    sum_test = convolve_altivec(a, b, N);
    passed = (sum_ref == sum_test);

    printf("sum_ref = %d\n", sum_ref);
    printf("sum_test = %d\n", sum_test);

    printf("%s\n", passed ? "PASS" : "FAIL");

    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}

score 1 · Accepted Answer

（警告：我所有的 Altivec 经验都来自于在 Xbox360/PS3 上的工作——我不确定它们与其他 Altivec 平台有何不同）。

首先，你应该检查你的指针对齐。大多数向量加载（和存储）操作预计来自 16 字节对齐的地址。如果不是这样，事情通常会在没有警告的情况下继续进行，但您不会获得您期望的数据。

进行未对齐的加载是可能的（但速度较慢），但您基本上必须多读取数据前后的一些内容，再将它们组合起来。请参阅 Apple 的 Altivec 页面。我以前也用 lvlx 和 lvrx 加载指令做过这件事，然后将两次加载的结果组合在一起。

接下来，我不确定您的乘法和加法是否相同。我从未使用过 _mm_madd_pi16 或 vec_msum，所以我不能肯定它们是等价的。您应该在调试器中单步执行，并确保它们对相同的输入数据给出相同的输出。另一个可能的区别是它们处理溢出的方式可能不同（例如取模回绕与饱和）。

最后但并非最不重要的一点是，您一次计算的是 4 个整数而不是 2 个。因此，您的联合体（union）应该包含 4 个整数，最后您应该将这 4 个整数全部相加。

simd - 将 MMX/SSE 指令移植到 AltiVec

2 回答 2

Related

Reference