1

我正在尝试编写一个程序来测量上下文切换。我已经阅读了英特尔关于 rdtsc + rdtscp 指令的手册。

现在,我想在上下文切换中使用这些时间戳指令。我的一般骨架如下:

// init two pipes P1, P2
fork();

set_affinity();              // same core

// parent's code:
    cpuid + rdtsc            // start timer
    write(timer to P1);

    read(timer from P2);     // blocks parent if timer value not written
    rdtscp + cpuid           // stop timer, get difference

// child's code:
    read(timer from P1);     // blocks child if timer value not written
    rdtscp + cpuid           // stop timer, get difference

    cpuid + rdtsc            // start timer
    write(timer to P2);

我在这段代码中看到了一些问题。假设定时器操作是正确的,

如果操作系统选择上下文切换到某个完全不同的进程(不是子进程或父进程),它将无法工作。

此代码还将包括 read() 和 write() 系统调用所花费的时间。

暂且忽略这些问题,这样使用 rdtsc + rdtscp 指令是否正确?

I know writing a kernel module and disabling preemption/interrupts is a better way
4

2 回答 2

2

我以前做过,这似乎是测量上下文切换时间的有效方法。每当对这种细粒度的事情进行计时,调度的不可预测性总是会发挥作用;通常,您通过测量数千次并寻找最小值、中位数或平均时间间隔等数字来处理这个问题。您可以通过以实时SCHED_FIFO优先级运行这两个进程来减少调度问题。如果您想知道实际的切换时间(在单个 cpu 核心上),您需要通过亲和性(affinity)设置将两个进程绑定到同一个 cpu。如果您只想知道一个进程能够响应另一个进程的输出的延迟,那么让它们在不同的 cpu 上运行就可以了。

要记住的另一个问题是自愿和非自愿上下文切换,以及从用户空间和内核空间开始的切换具有不同的成本。你的很可能是自愿的。测量非自愿更难,并且需要从繁忙的循环或类似的循环中探查共享内存。

于 2017-04-19T21:53:56.253 回答
1

I used similar timing code, except that I have the parent loop 1000000 times and time the whole loop in both the parent and the child. The code is attached. Then I modified it to time the individual context switches, as in your pseudo-code, summed the 1000000 individual times, and got good agreement with my original code. So either way seems to work, given the caveats already mentioned.

The thing I find interesting is that the context switching time is more than doubled when sched_setaffinity() is used to set the parent and child to run on separate cpus. Why does that affect the time in that way? Is the pipe faster between processes running on same cpu?

rdtscp.h:

/*
 * Take the "start" timestamp of a measured interval.
 *
 * Executes CPUID (a serializing instruction) immediately followed by RDTSC
 * and returns the 64-bit counter assembled from EDX:EAX.  Pair with
 * rdtscp_end() and subtract the two values to get elapsed TSC ticks.
 *
 * EAX and ECX are zeroed before CPUID so that the same leaf/sub-leaf is
 * queried on every call; otherwise CPUID would run with whatever happened to
 * be left in those registers.  The "memory" clobber keeps the compiler from
 * moving loads/stores of the measured code across the timestamp.
 *
 * Returns: the time-stamp counter value read by RDTSC.
 */
static inline unsigned long rdtscp_start(void) {
  unsigned int hi;
  unsigned int lo = 0;   /* in: CPUID leaf 0;     out: low half of the TSC */
  unsigned int ecx = 0;  /* in: CPUID sub-leaf 0; overwritten by CPUID     */

  __asm volatile ("cpuid\n\t"
          "rdtsc\n\t"
          : "+a" (lo), "=d" (hi), "+c" (ecx)
          :
          : "%rbx", "memory");

  return ((unsigned long)hi << 32) | lo;
}

/*
 * Take the "stop" timestamp of a measured interval.
 *
 * Executes RDTSCP, saves EDX:EAX into scratch registers, then executes CPUID
 * as a barrier so that later instructions cannot start before the counter
 * has been read.  Pair with rdtscp_start() and subtract.
 *
 * EAX and ECX are cleared before CPUID: RDTSCP leaves the low TSC bits in
 * EAX and IA32_TSC_AUX in ECX, which would otherwise select an arbitrary
 * CPUID leaf on every call.  The "memory" clobber keeps the compiler from
 * moving loads/stores of the measured code across the timestamp.
 *
 * Returns: the time-stamp counter value read by RDTSCP.
 */
static inline unsigned long rdtscp_end(void) {
  unsigned int hi, lo;

  __asm volatile ("rdtscp\n\t"
          "mov %%edx, %1\n\t"
          "mov %%eax, %0\n\t"
          "xor %%eax, %%eax\n\t"
          "xor %%ecx, %%ecx\n\t"
          "cpuid\n\t"
          : "=r" (lo), "=r" (hi)
          :
          : "%rax", "%rbx", "%rcx", "%rdx", "cc", "memory");

  return ((unsigned long)hi << 32) | lo;
}

/*see https://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
 */

cntxtSwtchr.c:

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "rdtscp.h"

/*
 * Pipe ping-pong benchmark.
 *
 * Creates two pipes and forks.  The parent writes an int to pipe1 and blocks
 * reading it back from pipe2; the child echoes every int it reads from pipe1
 * onto pipe2.  Both processes pin themselves to CPU 1 and report the elapsed
 * TSC ticks (rdtscp_start/rdtscp_end) and CPU time (clock) for their loop.
 *
 * Exits with status 1 if the pipes or the fork cannot be created, 0 otherwise.
 */
int main(void) {
  int pipe1[2], pipe2[2];
  if (pipe(pipe1) != 0 || pipe(pipe2) != 0) {
    perror("pipe");
    exit(1);
  }
  cpu_set_t set;
  CPU_ZERO(&set);

  clock_t tick, tock;

  int fork_rtn;
  if ((fork_rtn = fork()) < 0) {
    perror("fork");
    exit(1);
  }

  if (fork_rtn == 0) {  // Child
    close(pipe1[1]);
    close(pipe2[0]);

    CPU_SET(1, &set);
    // Best effort: warn and keep running unpinned if CPU 1 is unavailable.
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
      perror("sched_setaffinity");

    tick = clock();
    unsigned long tsc_start = rdtscp_start();
    int i;
    // Echo until EOF (parent closed its write end) or any I/O error.
    // Comparing against the full size also stops on read() == -1, which a
    // plain truthiness test would treat as "keep going".
    while (read(pipe1[0], &i, sizeof i) == (ssize_t)sizeof i) {
      if (write(pipe2[1], &i, sizeof i) != (ssize_t)sizeof i)
        break;
    }
    // Sample both clocks before printing so the output is not timed.
    unsigned long tsc_ticks = rdtscp_end() - tsc_start;
    tock = clock();
    printf("child tsc_ticks: %lu\n", tsc_ticks);
    clock_t ticks = tock - tick;
    double dt = (double)ticks / CLOCKS_PER_SEC;
    printf("Elapsed child cpu time: %gs.\n", dt); 

    close(pipe1[0]);
    close(pipe2[1]);
    exit(0);

  } else {              // Parent
    close(pipe1[0]);
    close(pipe2[1]);

    CPU_SET(1, &set);
    // Best effort: warn and keep running unpinned if CPU 1 is unavailable.
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
      perror("sched_setaffinity");

    int idx, lim = 1000000;
    int i_rtnd;
    tick = clock();
    unsigned long tsc_start = rdtscp_start();
    for (idx = 0; idx < lim; ++idx) {
      // Any short/failed transfer or mismatched echo aborts the run; idx then
      // stays below lim and the failure message is printed below.
      if (write(pipe1[1], &idx, sizeof idx) != (ssize_t)sizeof idx)
        break;
      if (read(pipe2[0], &i_rtnd, sizeof i_rtnd) != (ssize_t)sizeof i_rtnd)
        break;
      if (i_rtnd != idx)
        break;
    }
    // Sample both clocks before printing so the output is not timed.
    unsigned long tsc_ticks = rdtscp_end() - tsc_start;
    tock = clock();
    printf("parent tsc_ticks: %lu\n", tsc_ticks);
    clock_t ticks = tock - tick;
    double dt = (double)ticks / CLOCKS_PER_SEC;
    // NOTE(review): dt / lim is CPU time per loop iteration (one write plus
    // one blocking read), reported under the label "switch".
    printf("Elapsed parent cpu time: %gs, %gs/switch.\n", dt, dt / lim); 
    if (idx == lim)
      printf("Parent reached end of processing loop.\n");
    else
      printf("Parent failed to reach end of processing loop.\n");

    close(pipe1[1]);
    close(pipe2[0]);
    exit(0);
  }

}
于 2017-06-18T20:25:57.990 回答