这是 C++ 中的一个解决方案,它更简洁,速度提高了 1.6 倍。它生成的代码对具有深管道和复杂分支预测逻辑的高端微处理器更友好。它生成真/假,没有比较/分支,也没有表查找(没有数据缓存未命中)。
一个半字节有 4 位,因此保存 16 个值之一,我将每个输入中的两个半字节映射到一个无符号值(至少有 16 位),位设置在相应的位位置,指示两个半字节值都存在于输入。然后我AND
两个位集,从而计算集合的交集。最后一个AND
丢弃与 nibble 的任何匹配项0
。
inline unsigned set( unsigned char X ) {
return (1 << (X & 15)) | (1 << (X >> 4));
}
// Return true if a nibble in 'A' matches a non-null nibble in 'B'
inline bool match( unsigned char A, unsigned char B ) {
return set( A ) & set( B ) & ~set( 0 );
}
我已经在 Intel Xeon X5570 @ 2.93GHz 上对其进行了计时,它比问题中的原始速度快 1.6 倍。这是我用来计时的代码:
#include <time.h>
#include <iostream>
bool original( unsigned char A, unsigned char B ) {
unsigned char a1 = A >> 4;
unsigned char a2 = A & 15;
unsigned char b1 = B >> 4;
unsigned char b2 = B & 15;
return (a1 != 0 && (a1 == b1 || a1 == b2)) || (a2 != 0 && (a2 == b1 || a2 == b2));
}
static inline unsigned set( unsigned char X ) {
return (1 << (X & 15)) | (1 << ((X >> 4)&15));
}
bool faster( unsigned char A, unsigned char B ) {
return set( A ) & set( B ) & ~set( 0 );
}
class Timer {
size_t _time;
size_t & _elapsed;
size_t nsec() {
timespec ts;
clock_gettime( CLOCK_REALTIME, &ts );
return ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec;
}
public:
Timer(size_t & elapsed) : _time(nsec()), _elapsed(elapsed) {}
~Timer() { _elapsed = nsec() - _time; }
};
int main()
{
size_t original_nsec, faster_nsec;
const size_t iterations = 200000000ULL;
size_t count = 0;
{ Timer t(original_nsec);
for(size_t i=0; i < iterations; ++i) {
count += original( 0xA5 , i & 0xFF );
}
}
{ Timer t(faster_nsec);
for(size_t i=0; i < iterations; ++i) {
count += faster( 0xA5 , i & 0xFF );
}
}
std::cout << double(original_nsec) / double(faster_nsec)
<< "x faster" << std::endl;
return count > 0 ? 0 : 1;
}
这是输出:
$ g++ -o match -O3 match.cpp -lrt && ./match
1.61564x faster
$