我一直在尝试提高大型(数 GB)位数组运算的性能。我不是 SIMD 专家,但 SIMD 版本似乎在所有情况下都比标量版本慢。我尝试了几种优化(包括循环展开),但都无济于事。从生成的汇编代码来看,似乎是因为标量版本能够充分利用寄存器。不过,如果我犯了什么愚蠢的错误,请告诉我。否则,我很乐意继续使用标量版本……它要简单得多。
/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */
/*
 * Build-configuration report: each group below emits a compile-time
 * #warning stating whether one of the experiment switches was defined
 * (e.g. with -DENABLE_PREFETCH on the compiler command line).
 */

/* ENABLE_PREFETCH: simd_bitwise_and() tests this inside its unrolled path
 * to decide whether to issue _mm_prefetch. */
#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif
/* ENABLE_SIMD_UNROLLING: selects the four-vectors-per-iteration body of
 * simd_bitwise_and() instead of the one-vector body. */
#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif
/* AVOID_TEMP_VARS: in the non-unrolled body, passes the dereferenced
 * pointers straight to _mm_and_si128 instead of loading into locals. */
#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>
/*
 * Stand-in for a __forceinline keyword: expands to GCC's always_inline
 * function attribute. It prefixes both AND kernels in this file.
 * NOTE(review): identifiers containing a double underscore are reserved
 * for the implementation, and the macro does not add `inline` itself --
 * confirm the compiler accepts always_inline on these definitions without
 * a "might not be inlinable" diagnostic.
 */
#define __forceinline __attribute__((always_inline))
/*
 * Returns the current gettimeofday() reading as a double, expressed in
 * microseconds (whole seconds scaled by 1E6 plus the microsecond field).
 */
double
microtime (void)
{
  struct timeval now;
  double usec;

  gettimeofday(&now, NULL);

  usec = (double) now.tv_usec;
  usec += (double) now.tv_sec * 1E6;
  return usec;
}
/*
 * SSE2 kernel: dst[i] &= src[i] for the first block_size bytes.
 *
 * dst, src   - buffers of at least block_size bytes. Both are accessed with
 *              _mm_load_si128 / _mm_store_si128, which require 16-byte
 *              aligned addresses.
 * block_size - number of bytes to process; any value, including 0.
 *
 * Only bytes [0, block_size) are read or written: whole 16-byte vectors are
 * handled with SSE2 and any remaining bytes with a scalar loop.
 *
 * Compile-time switches:
 *   ENABLE_SIMD_UNROLLING - process four vectors per iteration while at
 *                           least four remain.
 *   ENABLE_PREFETCH       - (unrolled path only) prefetch the source
 *                           512 bytes ahead of the current read position.
 *   AVOID_TEMP_VARS       - in the one-vector loop, feed the dereferenced
 *                           pointers directly to _mm_and_si128.
 *
 * No _mm_empty() is issued: it only clears MMX state, and this function
 * uses XMM registers exclusively.
 */
__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  enum { VEC_BYTES = sizeof(__m128i), PREFETCH_AHEAD_BYTES = 512 };

  const __m128i *src_vec = (const __m128i *) src;
  __m128i *dst_vec = (__m128i *) dst;
  unsigned vecs_left = block_size / VEC_BYTES;
  const unsigned tail_bytes = block_size % VEC_BYTES;
  unsigned i;

#ifdef ENABLE_SIMD_UNROLLING
  while (vecs_left >= 4)
    {
      __m128i xmm1;
      __m128i xmm2;

# ifdef ENABLE_PREFETCH
      /* Prefetch relative to the moving read pointer, and only while the
       * target still lies inside the source buffer. */
      if (vecs_left > PREFETCH_AHEAD_BYTES / VEC_BYTES)
        _mm_prefetch((const char *) src_vec + PREFETCH_AHEAD_BYTES, _MM_HINT_NTA);
# endif
      xmm1 = _mm_load_si128(src_vec + 0);
      xmm2 = _mm_load_si128(dst_vec + 0);
      _mm_store_si128(dst_vec + 0, _mm_and_si128(xmm1, xmm2));

      xmm1 = _mm_load_si128(src_vec + 1);
      xmm2 = _mm_load_si128(dst_vec + 1);
      _mm_store_si128(dst_vec + 1, _mm_and_si128(xmm1, xmm2));

      xmm1 = _mm_load_si128(src_vec + 2);
      xmm2 = _mm_load_si128(dst_vec + 2);
      _mm_store_si128(dst_vec + 2, _mm_and_si128(xmm1, xmm2));

      xmm1 = _mm_load_si128(src_vec + 3);
      xmm2 = _mm_load_si128(dst_vec + 3);
      _mm_store_si128(dst_vec + 3, _mm_and_si128(xmm1, xmm2));

      src_vec += 4;
      dst_vec += 4;
      vecs_left -= 4;
    }
#endif

  /* One vector at a time: the whole job when not unrolling, otherwise the
   * zero to three vectors left over by the unrolled loop. */
  while (vecs_left > 0)
    {
      __m128i xmm1;
#ifdef AVOID_TEMP_VARS
      xmm1 = _mm_and_si128(*dst_vec, *src_vec);
#else
      __m128i xmm2;

      xmm1 = _mm_load_si128(src_vec);
      xmm2 = _mm_load_si128(dst_vec);
      xmm1 = _mm_and_si128(xmm1, xmm2);
#endif
      _mm_store_si128(dst_vec, xmm1);
      ++dst_vec;
      ++src_vec;
      --vecs_left;
    }

  /* Fewer than 16 bytes remain; finish bytewise so nothing past
   * block_size is touched. */
  for (i = 0; i < tail_bytes; i++)
    ((unsigned char *) dst_vec)[i] &= ((const unsigned char *) src_vec)[i];
}
/*
 * Scalar kernel: dst[i] &= src[i] for the first block_size bytes, working
 * in unsigned int words, four words per iteration where possible.
 *
 * dst, src   - buffers of at least block_size bytes. Both are accessed
 *              through unsigned int pointers, so they must be suitably
 *              aligned for unsigned int.
 * block_size - number of bytes to process; any value, including 0.
 *
 * Only bytes [0, block_size) are read or written: groups of four words
 * first, then single words, then any remaining bytes.
 */
__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  enum { WORD_BYTES = sizeof(unsigned int) };

  const unsigned int *src_word = (const unsigned int *) src;
  unsigned int *dst_word = (unsigned int *) dst;
  unsigned words_left = block_size / WORD_BYTES;
  const unsigned tail_bytes = block_size % WORD_BYTES;
  unsigned i;

  /* Main loop, manually unrolled four words at a time. */
  while (words_left >= 4)
    {
      dst_word[0] &= src_word[0];
      dst_word[1] &= src_word[1];
      dst_word[2] &= src_word[2];
      dst_word[3] &= src_word[3];
      dst_word += 4;
      src_word += 4;
      words_left -= 4;
    }

  /* Zero to three whole words left over by the unrolled loop. */
  while (words_left > 0)
    {
      *dst_word++ &= *src_word++;
      --words_left;
    }

  /* Less than one word remains; finish bytewise so nothing past
   * block_size is touched. */
  for (i = 0; i < tail_bytes; i++)
    ((unsigned char *) dst_word)[i] &= ((const unsigned char *) src_word)[i];
}
/*
 * Allocates size bytes aligned for __m128i access.
 * Returns the buffer (release with free()) or NULL if allocation fails.
 */
static unsigned char *
alloc_vector_aligned (size_t size)
{
  void *mem = NULL;

  if (posix_memalign(&mem, sizeof(__m128i), size) != 0)
    return NULL;
  return mem;
}

/*
 * Benchmark driver: ANDs a 512 MiB key into a 512 MiB destination, once
 * with simd_bitwise_and() and once with word_bitwise_and(), printing the
 * elapsed wall-clock time of each pass in seconds and checking one
 * sentinel byte of the result.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the buffers cannot be allocated.
 * Command-line arguments are not used.
 */
int
main (int argc, char **argv)
{
  const size_t minlen = (1024UL * 1024UL * 512UL);
  /* The kernels take an unsigned byte count; make the narrowing explicit
   * and verify that nothing is lost. */
  const unsigned block_size = (unsigned) minlen;
  unsigned char *key1 = alloc_vector_aligned(minlen);
  unsigned char *key2 = alloc_vector_aligned(minlen);
  unsigned char *dest = alloc_vector_aligned(minlen);
  double start_time = 0.0;
  double end_time = 0.0;

  (void) argc;
  (void) argv;

  assert((size_t) block_size == minlen);

  if (key1 == NULL || key2 == NULL || dest == NULL)
    {
      fprintf(stderr, "failed to allocate 3 x %zu bytes\n", minlen);
      free(dest);
      free(key2);
      free(key1);
      return EXIT_FAILURE;
    }

  /* Give both keys defined contents and touch every page before timing.
   * Otherwise the first timed pass alone pays for faulting key2 in, which
   * biases the comparison against whichever kernel runs first. */
  memset(key1, 0, minlen);
  memset(key2, 0, minlen);
  key1[128] = 0xff;
  key2[128] = 0x03;

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, block_size);
  end_time = microtime();
  /* microtime() reports microseconds; the label says seconds. */
  printf("Elapsed: %8.6fs\n", (end_time - start_time) / 1E6);
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, block_size);
  end_time = microtime();
  printf("Elapsed: %8.6fs\n", (end_time - start_time) / 1E6);
  assert(0x03 == dest[128]);

  free(dest);
  free(key2);
  free(key1);
  return EXIT_SUCCESS;
}
/* vi: set et sw=2 ts=2: */