I used a similar timing code, except I have the parent loop 1000000 times, and time the whole loop in both the parent and child. The code is attached. Then I modified it to time the individual context switches, as in you pseudo-code, summed the 1000000 individual times and got good agreement with my original code. So either way seems to work, given the caveats already mentioned.
The thing I find interesting is that the context switching time is more than doubled when sched_setaffinity()
is used to set the parent and child to run on separate cpus. Why does that affect the time in that way? Is the pipe faster between processes running on same cpu?
rdtscp.h:
static inline unsigned long rdtscp_start(void) {
unsigned long var;
unsigned int hi, lo;
__asm volatile ("cpuid\n\t"
"rdtsc\n\t" : "=a" (lo), "=d" (hi)
:: "%rbx", "%rcx");
var = ((unsigned long)hi << 32) | lo;
return (var);
}
static inline unsigned long rdtscp_end(void) {
unsigned long var;
unsigned int hi, lo;
__asm volatile ("rdtscp\n\t"
"mov %%edx, %1\n\t"
"mov %%eax, %0\n\t"
"cpuid\n\t" : "=r" (lo), "=r" (hi)
:: "%rax", "%rbx", "%rcx", "%rdx");
var = ((unsigned long)hi << 32) | lo;
return (var);
}
/*see https://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
*/
cntxtSwtchr.c:
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "rdtscp.h"
int main() {
int pipe1[2], pipe2[2];
pipe(pipe1) || pipe(pipe2);
cpu_set_t set;
CPU_ZERO(&set);
clock_t tick, tock;
int fork_rtn;
if ((fork_rtn = fork()) < 0)
exit(1);
if (fork_rtn == 0) { // Child
close(pipe1[1]);
close(pipe2[0]);
CPU_SET(1, &set);
sched_setaffinity(0, sizeof(set), &set);
tick = clock();
unsigned long tsc_start = rdtscp_start();
int i;
while (read(pipe1[0], &i, 4))
write(pipe2[1], &i, 4);
printf("child tsc_ticks: %lu\n", rdtscp_end() - tsc_start);
tock = clock();
clock_t ticks = tock - tick;
double dt = (double)ticks / CLOCKS_PER_SEC;
printf("Elapsed child cpu time: %gs.\n", dt);
close(pipe1[0]);
close(pipe2[1]);
exit(0);
} else { // Parent
close(pipe1[0]);
close(pipe2[1]);
CPU_SET(1, &set);
sched_setaffinity(0, sizeof(set), &set);
int idx, lim = 1000000;
int i_rtnd;
tick = clock();
unsigned long tsc_start = rdtscp_start();
for (idx = 0; idx < lim; ++idx) {
write(pipe1[1], &idx, 4);
read(pipe2[0], &i_rtnd, 4);
if (i_rtnd != idx)
break;
}
printf("parent tsc_ticks: %lu\n", rdtscp_end() - tsc_start);
tock = clock();
clock_t ticks = tock - tick;
double dt = (double)ticks / CLOCKS_PER_SEC;
printf("Elapsed parent cpu time: %gs, %gs/switch.\n", dt, dt / lim);
if (idx == lim)
printf("Parent reached end of processing loop.\n");
else
printf("Parent failed to reach end of processing loop.\n");
close(pipe1[1]);
close(pipe2[0]);
exit(0);
}
}