2

Suppose I have two double values, x (the old value) and y (the new value). I would like to implement a vectorized function that returns x (old) if abs(x-y) < p, and y (new) otherwise.

Here is the code (test.cpp):

#include <emmintrin.h>
#include <iostream>

#define ARRAY_LENGTH 2

/**
 * Demonstrates a branch-free SSE2 selection on two packed doubles:
 * for each lane, the result is x (old value) when |x - y| < p and
 * y (new value) otherwise. The two result lanes are printed to stderr.
 *
 * @return 0 always.
 */
int main(void) {
    // A single __m128d load/store touches two doubles, so every buffer must
    // hold at least two elements for the accesses below to stay in bounds.
    static_assert(ARRAY_LENGTH >= 2,
                  "the SSE2 load/store below accesses two doubles");

    // x = old value, y = new value, res = result.
    // 16-byte aligned automatic storage satisfies the alignment requirement
    // of _mm_load_pd/_mm_store_pd without any heap allocation, so there is
    // no allocation failure to check for and nothing to free.
    alignas(16) double x[ARRAY_LENGTH];
    alignas(16) double y[ARRAY_LENGTH];
    alignas(16) double res[ARRAY_LENGTH];

    const double p = 1e-4; // precision
    const __m128d sp = _mm_set1_pd(p);
    x[0] = 1.5; y[0] = 1.50011;   // differs by more than p -> y is selected
    x[1] = 2.;  y[1] = 2.0000001; // differs by less than p -> x is selected

    const __m128d sx = _mm_load_pd(x);
    const __m128d sy = _mm_load_pd(y);

    // -0.0 has only the sign bit set; and-not with it clears the sign bit,
    // which is fabs() for each lane.
    const __m128d sign_mask = _mm_set1_pd(-0.);
    // |x-y|
    const __m128d absval = _mm_andnot_pd(sign_mask, _mm_sub_pd(sx, sy));
    // Per-lane mask: all-ones where |x-y| < p, all-zeros elsewhere.
    const __m128d mask = _mm_cmplt_pd(absval, sp);
    // sres = |x-y| < p ? x : y, expressed as (mask & x) | (~mask & y).
    const __m128d sres = _mm_or_pd(
            _mm_and_pd(mask, sx), _mm_andnot_pd(mask, sy));
    _mm_store_pd(res, sres);
    std::cerr << "res=" << res[0] << "," << res[1] << std::endl;
    return 0;
}

To build:

g++ -std=c++11 -msse4 test.cpp

We first compute fabs(x-y), compare to p, and combine x, y using the obtained mask.

Does anyone see a more efficient way to code this? Thanks.

4

1 回答 1

1

有一种方法可以让这个算法更快一点,但它会降低准确性:

// delta = x - y, computed once per lane and reused below
const __m128d delta = _mm_sub_pd(sx, sy);
// |x - y|: and-not with sign_mask clears the sign bit of each lane
const __m128d abs_delta = _mm_andnot_pd(sign_mask, delta);
// mask lanes are all-ones where |x - y| < p, all-zeros elsewhere
__m128d mask = _mm_cmplt_pd(abs_delta, sp);
// keep delta only in the lanes selected by mask; the other lanes become +0.0
const __m128d correction = _mm_and_pd(mask, delta);
// sres = y + ((|x - y| < p) ? (x - y) : 0)
// NOTE: y + (x - y) goes through floating-point rounding, so the selected
// lanes may not reproduce x bit-for-bit; that is the accuracy trade-off.
__m128d sres = _mm_add_pd(sy, correction);

另一种方式是使用 AVX 和/或单精度。

于 2015-10-23T12:20:39.547 回答