4

我的 c 代码中有几个 uint8_t 数组,我想比较一个任意序列位。例如,我有 bitarray_1 和 bitarray_2,我想将 bitarray_1 的第 13-47 位与 bitarray_2 的第 5-39 位进行比较。最有效的方法是什么?

目前这是我程序中的一个巨大瓶颈,因为我只有一个简单的实现,它将位复制到一个新的临时数组的开头,然后在它们上使用 memcmp。

4

5 回答 5

5

三个词:移位、掩码和异或。

shift 以获得两个位数组的相同内存对齐。如果不是,您将不得不在比较它们之前移动其中一个数组。您的示例可能具有误导性,因为位 13-47 和 5-39 在 8 位地址上具有相同的内存对齐方式。如果您将 14-48 位与 5-39 位进行比较,这将是不正确的。

一旦所有内容都对齐并且超出了为表边界清除的位,异或就足以一次执行所有位的比较。基本上,您只需为每个数组读取一个内存就可以做到这一点,这应该非常有效。

如果两个数组的内存对齐方式与您的示例 memcmp 相同,并且上限和下限的特殊情况可能会更快。

此外,通过 uint32_t(或 64 位架构上的 uint64_t)访问数组也应该比通过 uint8_t 访问更有效。

原理很简单,但正如 Andrejs 所说,实施并非无痛...

这是怎么回事(与@caf 提案的相似之处并非巧合):

/* compare_bit_sequence() */
int compare_bit_sequence(uint8_t s1[], unsigned s1_off, uint8_t s2[], unsigned s2_off,
    unsigned length)
{
const uint8_t mask_lo_bits[] =
    { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t clear_lo_bits[] =
    { 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
uint8_t v1;
uint8_t * max_s1;
unsigned end;
uint8_t lsl;
uint8_t v1_mask;
int delta;

/* Makes sure the offsets are less than 8 bits */
s1 += s1_off >> 3;
s1_off &= 7;

s2 += s2_off >> 3;
s2_off &= 7;

/* Make sure s2 is the sequence with the shorter offset */
if (s2_off > s1_off){
    uint8_t * tmp_s;
    unsigned tmp_off;
    tmp_s = s2; s2 = s1; s1 = tmp_s;
    tmp_off = s2_off; s2_off = s1_off; s1_off = tmp_off;
}
delta = s1_off;

/* handle the beginning, s2 incomplete */ 
if (s2_off > 0){
    delta = s1_off - s2_off;
    v1 = delta
       ? (s1[0] >> delta | s1[1] << (8 - delta)) & clear_lo_bits[delta]
       : s1[0];
       if (length <= 8 - s2_off){
           if ((v1 ^ *s2)
                & clear_lo_bits[s2_off]
                & mask_lo_bits[s2_off + length]){
                return NOT_EQUAL;
           }
           else {
               return EQUAL;
           }
        }
        else{
            if ((v1 ^ *s2) & clear_lo_bits[s2_off]){
                return NOT_EQUAL;
        }
        length -= 8 - s2_off;
    }
    s1++;
    s2++;
}

/* main loop, we test one group of 8 bits of v2 at each loop */
max_s1 = s1 + (length >> 3);
lsl = 8 - delta;
v1_mask = clear_lo_bits[delta];
while (s1 < max_s1)
{
    if ((*s1 >> delta | (*++s1 << lsl & v1_mask)) ^ *s2++)
    {
        return NOT_EQUAL;
    }
}

/* last group of bits v2 incomplete */
end = length & 7;
if (end && ((*s2 ^ *s1 >> delta) & mask_lo_bits[end]))
{
    return NOT_EQUAL;
}

return EQUAL;

}

尚未使用所有可能的优化。一个有希望的方法是使用更大的数据块(一次 64 位或 32 位而不是 8 位),您还可以检测两个数组的偏移量同步的情况,在这种情况下使用 memcmp 而不是主循环,替换用逻辑运算符 & 7 取模 % 8,将 '/8' 替换为 '>> 3' 等,不得不分支代码而不是交换 s1 和 s2 等,但主要目的达到了:只读取一个内存并不是每个数组项的内存写入,因此大部分工作可以在处理器寄存器内进行。

于 2009-10-15T23:56:42.723 回答
1

的第 13 - 47bitarray_1位与 的第 5 - 39 位相同bitarray_1 + 1
将前 3 位 (5 - 7) 与掩码进行比较,将其他位 (8 - 39) 与memcmp().

与其移动和复制这些位,不如以不同的方式表示它们更快。你必须测量。

/* code skeleton */
static char bitarray_1_bis[BIT_ARRAY_SIZE*8+1];
static char bitarray_2_bis[BIT_ARRAY_SIZE*8+1];
static const char *lookup_table[] = {
    "00000000", "00000001", "00000010" /* ... */
    /* 256 strings */
    /* ... */ "11111111"
};

/* copy every bit of bitarray_1 to an element of bitarray_1_bis */
for (k = 0; k < BIT_ARRAY_SIZE; k++) {
    strcpy(bitarray_1_bis + 8*k, lookup_table[bitarray_1[k]]);
    strcpy(bitarray_2_bis + 8*k, lookup_table[bitarray_2[k]]);
}
memcmp(bitarray_1_bis + 13, bitarray_2_bis + 5, 47 - 13 + 1);

您可以(并且应该)将副本限制在尽可能少的范围内。

我不知道它是否更快,但如果是这样的话,我不会感到惊讶。同样,你必须测量。

于 2009-10-15T21:51:09.207 回答
1

最简单的方法是将更复杂的情况转换为更简单的情况,然后解决更简单的情况。

在下面的代码中,do_compare()解决了更简单的情况(序列的偏移量永远不会超过 7 位,s1偏移量总是大于或等于s2,并且序列的长度不为零)。然后该compare_bit_sequence()函数负责将较难的情况转换为较容易的情况,并调用do_compare()来完成这项工作。

这只是对位序列进行单次传递,因此希望这是对您的 copy-and-memcmp 实现的改进。

#define NOT_EQUAL 0
#define EQUAL 1

/* do_compare()
 *
 * Does the actual comparison, but has some preconditions on parameters to
 * simplify things:
 *
 *     length > 0
 *     8 > s1_off >= s2_off
 */
int do_compare(const uint8_t s1[], const unsigned s1_off, const uint8_t s2[],
    const unsigned s2_off, const unsigned length)
{
    const uint8_t mask_lo_bits[] =
        { 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
    const uint8_t mask_hi_bits[] =
        { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
    const unsigned msb = (length + s1_off - 1) / 8;
    const unsigned s2_shl = s1_off - s2_off;
    const unsigned s2_shr = 8 - s2_shl;
    unsigned n;
    uint8_t s1s2_diff, lo_bits = 0;

    for (n = 0; n <= msb; n++)
    {
        /* Shift s2 so it is aligned with s1, pulling in low bits from
         * the high bits of the previous byte, and store in s1s2_diff */
        s1s2_diff = lo_bits | (s2[n] << s2_shl);

        /* Save the bits needed to fill in the low-order bits of the next
         * byte.  HERE BE DRAGONS - since s2_shr can be 8, this below line
         * only works because uint8_t is promoted to int, and we know that
         * the width of int is guaranteed to be >= 16.  If you change this
         * routine to work with a wider type than uint8_t, you will need
         * to special-case this line so that if s2_shr is the width of the
         * type, you get lo_bits = 0.  Don't say you weren't warned. */  
        lo_bits = s2[n] >> s2_shr;

        /* XOR with s1[n] to determine bits that differ between s1 and s2 */
        s1s2_diff ^= s1[n];

        /* Look only at differences in the high bits in the first byte */
        if (n == 0)
            s1s2_diff &= mask_hi_bits[8 - s1_off];

        /* Look only at differences in the low bits of the last byte */
        if (n == msb)
            s1s2_diff &= mask_lo_bits[(length + s1_off) % 8];

        if (s1s2_diff)
            return NOT_EQUAL;
    }

    return EQUAL;
}

/* compare_bit_sequence()
 *
 * Adjusts the parameters to match the preconditions for do_compare(), then
 *  calls it to do the work.
 */
int compare_bit_sequence(const uint8_t s1[], unsigned s1_off,
    const uint8_t s2[], unsigned s2_off, unsigned length)
{
    /* Handle length zero */
    if (length == 0)
        return EQUAL;

    /* Makes sure the offsets are less than 8 bits */
    s1 += s1_off / 8;
    s1_off %= 8;

    s2 += s2_off / 8;
    s2_off %= 8;

    /* Make sure s2 is the sequence with the shorter offset */
    if (s1_off >= s2_off)
        return do_compare(s1, s1_off, s2, s2_off, length);
    else
        return do_compare(s2, s2_off, s1, s1_off, length);
}

要在您的示例中进行比较,您可以调用:

compare_bit_sequence(bitarray_1, 13, bitarray_2, 5, 35)

(请注意,我从零开始对位进行编号,并假设位数组以小端序排列,因此这将从 bitarray2[0] 中的第六个最低有效位和第六个最低有效位开始比较bitarray1[1] 中的位)。

于 2009-10-15T23:49:26.290 回答
0

如何编写函数来计算两个数组的偏移量、应用掩码、移位并将结果存储到 int 以便您可以比较它们。如果位数(在您的示例中为 34)超过 int 的长度 - 递归或循环。

对不起,这个例子会很痛苦。

于 2009-10-15T21:29:50.623 回答
0

这是我未优化的位序列比较函数:

#include <stdio.h> 
#include <stdint.h> 
                  //  01234567    01234567
uint8_t bitsA[] = { 0b01000000, 0b00010000 };
uint8_t bitsB[] = { 0b10000000, 0b00100000 };

int bit( uint8_t *bits, size_t bitpoz, size_t len ){
  return (bitpoz<len)? !!(bits[bitpoz/8]&(1<<(7-bitpoz%8))): 0;
}

int bitcmp( uint8_t *bitsA, size_t firstA, size_t lenA,
            uint8_t *bitsB, size_t firstB, size_t lenB ){
  int cmp;
  for( size_t i=0; i<lenA || i<lenB; i++ ){
    if( (cmp = bit(bitsA,firstA+i,firstA+lenA) -
               bit(bitsB,firstB+i,firstB+lenB))  ) return cmp;
  }
  return 0;
}

int main(){
  printf( "cmp: %i\n", bitcmp( bitsA,1,11, bitsB,0,11 ) );
}

编辑:这是我的(未经测试的)位串相等测试函数:

#include <stdlib.h> 
#include <stdint.h> 

#define load_64bit(bits,first) (*(uint64_t*)bits<<first | *(bits+8)>>(8-first)) 
#define load_32bit(bits,first) (*(uint32_t*)bits<<first | *(bits+4)>>(8-first)) 
#define load_16bit(bits,first) (*(uint16_t*)bits<<first | *(bits+2)>>(8-first)) 
#define load_8bit( bits,first) (           *bits<<first | *(bits+1)>>(8-first)) 

static inline uint8_t last_bits( uint8_t *bits, size_t first, size_t size ){
  return (first+size>8?load_8bit(bits,first):*bits<<first)>>(8-size);
}

int biteq( uint8_t *bitsA, size_t firstA,
           uint8_t *bitsB, size_t firstB, size_t size ){
  if( !size ) return 1;
  bitsA+=firstA/8; firstA%=8;
  bitsB+=firstB/8; firstB%=8;

  for(; size>64;size-=64,bitsA+=8,bitsB+=8)
    if(load_64bit(bitsA,firstA)!=load_64bit(bitsB,firstB)) return 0;

  for(; size>32;size-=32,bitsA+=4,bitsB+=4)
    if(load_32bit(bitsA,firstA)!=load_32bit(bitsB,firstB)) return 0;

  for(; size>16;size-=16,bitsA+=2,bitsB+=2)
    if(load_16bit(bitsA,firstA)!=load_16bit(bitsB,firstB)) return 0;

  for(; size> 8;size-= 8,bitsA++, bitsB++ )
    if(load_8bit( bitsA,firstA)!=load_8bit( bitsB,firstB)) return 0;

  return !size || 
         last_bits(bitsA,firstA,size)==last_bits(bitsB,firstB,size);
}

我做了一个简单的测量工具,看看它有多快:

#include <unistd.h> 
#include <stdio.h> 
#include <signal.h> 

#define SIZE 1000000 
uint8_t bitsC[SIZE];

volatile int end_loop;
void sigalrm_hnd( int sig ){ (void)sig; end_loop=1; }

int main(){
  uint64_t loop_count; int cmp;
  signal(SIGALRM,sigalrm_hnd);
  loop_count=0; end_loop=0; alarm(10);
  while( !end_loop ){
    for( int i=1; i<7; i++ ){
      loop_count++;
      cmp = biteq( bitsC,i, bitsC,7-i,(SIZE-1)*8 );
      if( !cmp ){ printf( "cmp: %i (==0)\n", cmp ); return -1; }
    }
  }
  printf( "biteq: %.2f round/sec\n", loop_count/10.0 );
}

结果:

bitcmp:   8.40 round/sec
biteq:  363.60 round/sec

EDIT2:last_bits() 改变了。

于 2009-10-16T00:15:10.197 回答