c - 为什么在 Linux 中访问内存对齐缓冲区更昂贵？

Question

在下面的程序中，我有 2 个缓冲区，一个是 64 字节对齐的，另一个是 16 字节对齐的，在运行 2.6.x 内核的 64 位 Linux 主机上。

高速缓存行长 64 字节，所以在这个程序中，我每次只访问一个缓存行。我原本预期 posix_memalign 分配的缓冲区即使不比未按缓存行对齐的缓冲区快，至少也应该一样快。以下是一些测量结果：

./readMemory 10000000

time taken by posix_memaligned buffer: 293020299 
time taken by standard buffer: 119724294 

./readMemory 100000000

time taken by posix_memaligned buffer: 548849137 
time taken by standard buffer: 211197082

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/time.h>

/* Forward declaration: stores the current clock reading in *t (defined later in this file). */
void now(struct timespec * t);

/*
 * Benchmark entry point.
 *
 * Times `runs` repetitions of a 64-byte memcpy out of a heap buffer obtained
 * from posix_memalign (64-byte alignment), then the same loop out of a
 * 1024-byte buffer on the stack, and prints the elapsed nanoseconds for each.
 *
 * argv[1] - number of runs, a non-negative decimal integer.
 *
 * Returns 0 on success; exits with status 1 on wrong usage, an unparsable
 * run count, or an allocation failure.
 */
int main(int argc, char **argv)
{
    char *buf = NULL;
    struct timespec st_time, end_time;
    char *parse_end = NULL;
    long runs;
    long long elapsed_ns;

    if (argc != 2) {
        printf("Usage: ./readMemory <number of runs>\n");
        exit(1);
    }

    /* Reject empty input, trailing garbage, out-of-range and negative values. */
    errno = 0;
    runs = strtol(argv[1], &parse_end, 10);
    if (errno != 0 || parse_end == argv[1] || *parse_end != '\0' || runs < 0) {
        printf("Invalid number of runs: %s \n", argv[1]);
        exit(1);
    }

    int returnVal = posix_memalign((void **)&buf, 64, 1024);
    if (returnVal != 0) {
        /* buf is not usable after a failure, so stop instead of benchmarking it. */
        printf("error in posix_memalign\n");
        exit(1);
    }

    char tempBuf[64];
    char *temp = buf;
    size_t cpyBytes = 64;

    now(&st_time);
    for (long x = 0; x < runs; x++) {
        temp = buf;
        /*
         * NOTE(review): i advances by 64 while the bound is (1024/64)-1 == 15,
         * so this body executes exactly once per run (one 64-byte copy).
         * Left unchanged because the reported timings were taken with this loop.
         */
        for (int i = 0; i < ((1024/64) - 1); i += 64) {
            memcpy(tempBuf, temp, cpyBytes);
            temp += 64;
        }
    }
    now(&end_time);

    /* Combine seconds and nanoseconds; tv_nsec alone wraps every second. */
    elapsed_ns = (long long)(end_time.tv_sec - st_time.tv_sec) * 1000000000LL
               + (long long)(end_time.tv_nsec - st_time.tv_nsec);
    printf("time taken by posix_memaligned buffer: %lld \n", elapsed_ns);

    char buf1[1024];
    temp = buf1;
    now(&st_time);
    for (long x = 0; x < runs; x++) {
        temp = buf1;
        /* Same single-iteration loop shape as above (see NOTE there). */
        for (int i = 0; i < ((1024/64) - 1); i += 64) {
            memcpy(tempBuf, temp, cpyBytes);
            temp += 64;
        }
    }
    now(&end_time);

    elapsed_ns = (long long)(end_time.tv_sec - st_time.tv_sec) * 1000000000LL
               + (long long)(end_time.tv_nsec - st_time.tv_nsec);
    printf("time taken by standard buffer: %lld \n", elapsed_ns);

    free(buf);
    return 0;
}

/*
 * Store the current CLOCK_MONOTONIC_RAW reading in *tnow.
 * If clock_gettime reports failure, print a message and exit with status 1.
 */
void now(struct timespec *tnow)
{
    int failed = clock_gettime(CLOCK_MONOTONIC_RAW, tnow) < 0;

    if (!failed)
        return;

    printf("error getting time");
    exit(1);
}

第一个循环的反汇编是

    movq    -40(%rbp), %rdx        
    movq    -48(%rbp), %rcx        
    leaq    -176(%rbp), %rax
    movq    %rcx, %rsi
    movq    %rax, %rdi
    call    memcpy
    addq    $64, -48(%rbp)
    addl    $64, -20(%rbp)

第二个循环的反汇编是

    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rcx
    leaq    -176(%rbp), %rax
    movq    %rcx, %rsi
    movq    %rax, %rdi
    call    memcpy
    addq    $64, -48(%rbp)
    addl    $64, -4(%rbp)

score 1 · Accepted Answer

您的基准测试存在一些问题：

您的运行时间太短，因此您可能会看到很多噪音/抖动。
如果您启用了 CPU 频率缩放，则第一个循环可能在 CPU 切换到全频/涡轮频率之前执行。您需要先预热 CPU，或者最好在基准测试期间关闭频率缩放。
您可能正在观察调度，因为您没有以实时优先级运行。
每次运行您只得到一个样本，您至少需要运行 30 次才能做出任何科学判断（只有一个样本的科学研究通常称为轶事）。

score 1 · Accepted Answer

原因可能是缓冲区的相对对齐。

memcpy复制字对齐数据（32/64 位）时工作速度最快。
如果两个缓冲区对齐良好，则一切正常。
如果两个缓冲区以相同的方式未对齐，memcpy 会先逐字节复制一小段前缀，然后逐字复制剩余部分。

但是，如果一个缓冲区是字对齐的，而另一个不是，则无法让读取和写入同时都按字对齐。所以 memcpy 仍然逐字复制，但其中一半的内存访问是未对齐的。

如果您的两个栈上缓冲区都以相同的方式未对齐（例如，两个地址都是 8*x+2），而来自 posix_memalign 的缓冲区是对齐的，这就可以解释您所看到的现象。

score 0 · Accepted Answer

当我交换两个测量块的顺序时——也就是说，先测量标准缓冲区、后测量 posix_memalign 缓冲区——我得到了完全相反的结果。换句话说，在我的 CPU（英特尔酷睿 2）上，第一个复制循环几乎总是比第二个慢，无论缓冲区如何对齐。

我尝试用 malloc() 分配标准缓冲区，而不是把它放在栈上——这对速度几乎没有任何影响，第一个循环仍然总是较慢。

我还尝试用 posix_memalign() 分配您那个较小的 64 字节缓冲区——结果没有任何区别。

编辑：我已经编辑了您的代码以进行 3 次测量：posix 对齐、malloc'ed 和堆栈上的缓冲区（请参见下面的代码）。

事实证明，只有第一个循环很慢。任何后续循环都花费几乎完全相同的时间（有一些小噪音）。

我相信我们正在观察 Linux 调度程序在看到 100% CPU 负载后立即提高 CPU 时钟速度。

我的运行结果：

$ ./readmemory 2000000 5
time taken by posix aligned: 19599140
time taken by std malloc   : 14711350
time taken by std on stack : 14680668
time taken by posix aligned: 14729273
time taken by std malloc   : 14685338
time taken by std on stack : 14839183
time taken by posix aligned: 14709836
time taken by std malloc   : 15551900
time taken by std on stack : 14659350
time taken by posix aligned: 14721298
time taken by std malloc   : 14691732
time taken by std on stack : 14691246
time taken by posix aligned: 14722127
time taken by std malloc   : 15538286
time taken by std on stack : 14723657

更新代码：

// compile with: g++ readmemory.c -o readmemory -lrt

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Size in bytes of each source buffer under test (aligned, malloc'ed, stack).
#define BUF_SIZE 1024
// Bytes copied per memcpy call; also passed to posix_memalign as the alignment.
#define COPY_BYTES 64

// Read CLOCK_MONOTONIC into *tnow; on failure print a message and exit(1).
void now(struct timespec *tnow) {
    const int status = clock_gettime(CLOCK_MONOTONIC, tnow);

    if (status >= 0) {
        return;
    }

    printf("error getting time");
    exit(1);
}

// Time `runs` repetitions of the copy loop reading from buf and print the
// elapsed time in nanoseconds, prefixed with msg.
//
//   buf  - source buffer; must hold at least COPY_BYTES readable bytes
//   runs - number of repetitions of the copy loop
//   msg  - label printed in front of the measurement
void measure(char * buf, int runs, const char * msg) {
    char tempBuf[COPY_BYTES]; // destination scratch buffer, sized to match the copy
    struct timespec st_time, end_time;
    char * temp;
    long long elapsed_ns;

    now(&st_time);
    for (int x=0; x<runs; x++) {
        temp = buf;
        // NOTE(review): i advances by COPY_BYTES (64) while the bound is
        // BUF_SIZE/COPY_BYTES - 1 (15), so this body executes once per run.
        // Left unchanged because the posted results were taken with this loop.
        for (int i=0; i < ((BUF_SIZE/COPY_BYTES) - 1); i+=COPY_BYTES) {
            memcpy(tempBuf, temp, COPY_BYTES);
            temp += COPY_BYTES;
        }
    }
    now(&end_time);

    // Combine seconds and nanoseconds: tv_nsec alone wraps every second, which
    // gives a wrong (possibly negative) result for longer measurements.
    elapsed_ns = (long long)(end_time.tv_sec - st_time.tv_sec) * 1000000000LL
               + (long long)(end_time.tv_nsec - st_time.tv_nsec);
    printf("time taken by %s: %lld\n", msg, elapsed_ns);
}

// Parse text as a non-negative int (base auto-detected, as strtol base 0 does).
// Returns 0 and stores the value in *out on success; returns -1 on empty input,
// trailing characters, a negative value, or a value that does not fit in int.
static int parse_count(const char *text, int *out) {
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 0);
    if (errno != 0 || end == text || *end != '\0' || value < 0 || value > INT_MAX) {
        return -1;
    }
    *out = (int) value;
    return 0;
}

// Benchmark entry point.
//
//   argv[1] - runs:  repetitions of the copy loop inside each measurement
//   argv[2] - loops: how many times the three measurements are repeated
//
// Measures, `loops` times in a row, a posix_memalign'ed buffer, a malloc'ed
// buffer and a stack buffer. Returns 0 on success; exits with status 1 on
// wrong usage, an invalid argument, or an allocation failure.
int main(int argc, char **argv) {
    char * buf1 = NULL;  // posix_memalign'ed
    char * buf2 = NULL;  // malloc'ed
    char buf3[BUF_SIZE]; // alloc on stack
    int rc = -1;
    int runs;
    int loops;

    if (argc != 3) {
        printf("Usage: ./readMemory <runs> <loops>\n");
        exit(1);
    }
    if (parse_count(argv[1], &runs) != 0) {
        printf("Invalid number of runs: %s \n", argv[1]);
        exit(1);
    }
    if (parse_count(argv[2], &loops) != 0) {
        printf("Invalid number of loops: %s \n", argv[2]);
        exit(1);
    }

    rc = posix_memalign((void **)&buf1, COPY_BYTES, BUF_SIZE);
    if (rc != 0) {
        printf("error in posix_memalign\n");
        exit(1);
    }
    // The cast is kept because this file is built with g++ (see compile line).
    buf2 = (char *) malloc(BUF_SIZE);
    if (buf2 == NULL) {
        printf("error in malloc\n");
        free(buf1);
        exit(1);
    }

    for (int i=0; i<loops; i++) {
        measure(buf1, runs, "posix aligned");
        measure(buf2, runs, "std malloc   ");
        measure(buf3, runs, "std on stack ");
    }

    free(buf2);
    free(buf1);
    return 0;
}

我认为我们正在观察现代 CPU 实现缓存的相当复杂的方式。

c - 为什么在 Linux 中访问内存对齐缓冲区更昂贵？

3 回答 3

Related

Reference