8

假设类似:

void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int i;
  for(i=0; i<len; i++)
  {
     dest[i] = src[i] & mask[i];
  }
}

通过编写如下内容,我可以在非对齐访问机器(例如 x86)上更快地运行:

void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int i;
  unsigned int wordlen = len >> 2;
  for(i=0; i<wordlen; i++)
  {
    ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i]; // this raises SIGBUS on SPARC and other archs that require aligned access.
  }
  for(i=wordlen<<2; i<len; i++){
    dest[i] = src[i] & mask[i];
  }
}

但是它需要建立在几个架构上,所以我想做类似的事情:

void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int i;
  unsigned int wordlen = len >> 2;

#if defined(__ALIGNED2__) || defined(__ALIGNED4__) || defined(__ALIGNED8__)
  // go slow
  for(i=0; i<len; i++)
  {
     dest[i] = src[i] & mask[i];
  }
#else
  // go fast
  for(i=0; i<wordlen; i++)
  {
    // the following line will raise SIGBUS on SPARC and other archs that require aligned access.
    ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i]; 
  }
  for(i=wordlen<<2; i<len; i++){
    dest[i] = src[i] & mask[i];
  }
#endif
}

但是我找不到任何关于编译器定义的宏(如我__ALIGNED4__上面的假设)指定对齐或使用预处理器确定目标体系结构对齐的任何巧妙方法的任何好的信息。我可以测试defined (__SVR4) && defined (__sun),但我更喜欢 Just Work TM在其他需要对齐内存访问的架构上的东西。

4

3 回答 3

6

虽然 x86 默默地修复了未对齐的访问,但这对于性能来说并不是最优的。通常最好假设一定的对齐并自己执行修复:

unsigned int const alignment = 8;   /* or 16, or sizeof(long) */

void memcpy(char *dst, char const *src, unsigned int size) {
    if((((intptr_t)dst) % alignment) != (((intptr_t)src) % alignment)) {
        /* no common alignment, copy as bytes or shift around */
    } else {
        if(((intptr_t)dst) % alignment) {
            /* copy bytes at the beginning */
        }
        /* copy words in the middle */
        if(((intptr_t)dst + size) % alignment) {
            /* copy bytes at the end */
        }
    }
}

另外,请查看 SIMD 指令。

于 2011-12-07T16:03:07.563 回答
2

标准方法是使用一个configure脚本来运行程序来测试对齐问题。如果测试程序没有崩溃,则配置脚本会在生成的配置标头中定义一个宏,以加快实现速度。更安全的实现是默认设置。

void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int i;
  unsigned int wordlen = len >> 2;

#if defined(UNALIGNED)
  // go fast
  for(i=0; i<wordlen; i++)
  {
    // the following line will raise SIGBUS on SPARC and other archs that require aligned access.
    ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i]; 
  }
  for(i=wordlen<<2; i<len; i++){
    dest[i] = src[i] & mask[i];
  }
#else
  // go slow
  for(i=0; i<len; i++)
  {
     dest[i] = src[i] & mask[i];
  }
#endif
}
于 2011-12-07T16:04:35.357 回答
1

(我觉得很奇怪你有src这些mask通勤时间。我重命名mask_bytesmemand。但无论如何......)

另一种选择是使用利用 C 中类型的不同函数。例如:

void memand_bytes(char *dest, char *src1, char *src2, size_t len)
{
    unsigned int i;
    for (i = 0; i < len; i++)
        dest[i] = src1[i] & src2[i];
}

void memand_ints(int *dest, int *src1, int *src2, size_t len)
{
    unsigned int i;
    for (i = 0; i < len; i++)
        dest[i] = src1[i] & src2[i];
}

这样你就让程序员决定了。

于 2011-12-07T16:53:16.243 回答