11

我正在一颗带有 constant_tsc 和 nonstop_tsc 标志的 CPU 上运行此测试:

$ grep -m 1 ^flags /proc/cpuinfo | sed 's/ /\n/g' | egrep "constant_tsc|nonstop_tsc"
constant_tsc
nonstop_tsc

第 1 步:计算 tsc 的滴答率:

我将 _ticks_per_ns 计算为多次观测值的中位数,并使用 rdtscp 来确保指令按顺序执行。

// Step 1: estimate the TSC rate (_ticks_per_ns) as the median of `trials`
// independent measurements, each taken over a busy-wait window of at least 10 ms.
static const int trials = 13;
std::array<double, trials> rates;

for (int i = 0; i < trials; ++i)
{
    timespec beg_ts, end_ts;
    uint64_t beg_tsc, end_tsc;

    // Read the clock and the TSC back to back so both start points refer to
    // (nearly) the same instant.
    clock_gettime(CLOCK_MONOTONIC, &beg_ts);
    beg_tsc = rdtscp();

    uint64_t elapsed_ns;
    do
    {
        // Same pairing for the end point: clock first, then TSC.
        clock_gettime(CLOCK_MONOTONIC, &end_ts);
        end_tsc = rdtscp();

        elapsed_ns = to_ns(end_ts - beg_ts); // calculates ns between two timespecs
    }
    while (elapsed_ns < 10 * 1e6); // busy spin for 10ms

    // Ratio of elapsed TSC ticks to elapsed CLOCK_MONOTONIC nanoseconds for this trial.
    rates[i] = (double)(end_tsc - beg_tsc) / (double)elapsed_ns;
}

// nth_element puts the median at index trials/2 without sorting the whole array.
std::nth_element(rates.begin(), rates.begin() + trials/2, rates.end());

_ticks_per_ns = rates[trials/2];

第 2 步:计算开始挂钟时间和 tsc

// Step 2: capture one anchor point that pairs a TSC value (_start_tsc) with a
// CLOCK_REALTIME reading (_start_clock_time).
uint64_t beg, end;
timespec ts;

// loop to ensure we aren't interrupted between the two tsc reads
while (1)
{
    // Bracket the clock call with two TSC reads so its cost can be measured.
    beg = rdtscp();
    clock_gettime(CLOCK_REALTIME, &ts);
    end = rdtscp();

    // Accept the sample only if the bracketed call took at most 2000 ticks;
    // otherwise retry.
    if ((end - beg) <= 2000) // max ticks per clock call
        break;
}

// The anchor uses the TSC read taken AFTER the clock call, not the midpoint.
_start_tsc        = end;
_start_clock_time = to_ns(ts); // converts timespec to ns since epoch

第 3 步:创建一个可以从 tsc 返回挂钟时间的函数

// Converts a raw TSC reading into wall-clock nanoseconds since the epoch by
// linear extrapolation from the anchor (_start_tsc, _start_clock_time) using
// the calibrated rate _ticks_per_ns.
//
// tsc: TSC value to convert; may be earlier or later than _start_tsc.
// Returns the extrapolated time in ns since the epoch.
uint64_t tsc_to_ns(uint64_t tsc)
{
    // Signed, so a reading taken before the anchor yields a negative offset.
    const int64_t diff = static_cast<int64_t>(tsc - _start_tsc);

    // Convert the quotient to an integer BEFORE adding the epoch offset.
    // Adding the double directly promotes the whole sum to double, and a double
    // near 1.5e18 (current ns since epoch) can only represent multiples of 256,
    // which silently quantizes the result to 256 ns steps.
    // NOTE(review): assumes _start_clock_time is an integer type -- confirm at its declaration.
    const int64_t elapsed_ns = static_cast<int64_t>(diff / _ticks_per_ns);

    return _start_clock_time + elapsed_ns;
}

第 4 步:循环运行,分别打印由 clock_gettime 和由 rdtscp 得到的挂钟时间

// lock the test to a single core
// Step 4: pin the process to one core, then every 10 s print the wall-clock
// time reported by clock_gettime() next to the time extrapolated from the TSC.
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(6, &mask);
sched_setaffinity(0, sizeof(cpu_set_t), &mask);

while (1)
{
    timespec utc_now;
    clock_gettime(CLOCK_REALTIME, &utc_now);
    uint64_t utc_ns = to_ns(utc_now);
    uint64_t tsc_ns = tsc_to_ns(rdtscp());

    // Signed difference: the extrapolated time can fall BEHIND the clock, and an
    // unsigned subtraction would then print a value near 1.8e19 instead of a
    // small negative number.
    int64_t ns_diff = static_cast<int64_t>(tsc_ns - utc_ns);

    std::cout << "clock_gettime " << ns_to_str(utc_ns) << '\n';
    std::cout << "tsc_time      " << ns_to_str(tsc_ns) << " diff=" << ns_diff << "ns\n";

    sleep(10);
}

输出:

clock_gettime 11:55:34.824419837
tsc_time      11:55:34.824419840 diff=3ns
clock_gettime 11:55:44.826260245
tsc_time      11:55:44.826260736 diff=491ns
clock_gettime 11:55:54.826516358
tsc_time      11:55:54.826517248 diff=890ns
clock_gettime 11:56:04.826683578
tsc_time      11:56:04.826684672 diff=1094ns
clock_gettime 11:56:14.826853056
tsc_time      11:56:14.826854656 diff=1600ns
clock_gettime 11:56:24.827013478
tsc_time      11:56:24.827015424 diff=1946ns

问题:

很明显,以这两种方式计算的时间很快就会分开。

我假设在具备 constant_tsc 和 nonstop_tsc 的情况下,tsc 的速率是恒定的。

  • 是板载时钟在漂移吗?它肯定不会以这样的速度漂移吧?

  • 这种漂移的原因是什么?

  • 我可以做些什么来使它们保持同步(除了非常频繁地重新计算第 2 步中的 _start_tsc 和 _start_clock_time 之外)?

4

3 回答 3

8

至少在我的机器上,OP 中所见漂移的原因是:每 ns 的 TSC 滴答数逐渐偏离了最初测得的 _ticks_per_ns 值。以下结果来自这台机器:

don@HAL:~/UNIX/OS/3EZPcs/Ch06$ uname -a
Linux HAL 4.4.0-81-generic #104-Ubuntu SMP Wed Jun 14 08:17:06 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
don@HAL:~/UNIX/OS/3EZPcs/Ch06$  cat /sys/devices/system/clocksource/clocksource0/current_clocksource
tsc

cat /proc/cpuinfo 显示有 constant_tsc 和 nonstop_tsc 标志。

(图)每个 clock_gettime() CLOCK_REALTIME 纳秒对应的 TSC 滴答数随时间变化的采样结果。

可以运行 viewRates.cc 来查看机器上每 ns 的当前 TSC Ticks:

rdtscp.h:

// Reads the time-stamp counter for the START of a measured region.
// Executes CPUID immediately followed by RDTSC and returns the 64-bit counter
// assembled from EDX (high half) and EAX (low half).
// CPUID is presumably used as a serializing barrier, following the Intel
// benchmarking white paper linked in this header -- confirm against that paper.
// NOTE(review): despite the name, this uses RDTSC, not RDTSCP.
static inline unsigned long rdtscp_start(void) {
  unsigned long var;
  unsigned int hi, lo;

  // Outputs: EAX -> lo ("=a"), EDX -> hi ("=d"). RBX and RCX are listed as
  // clobbers. No input operand is given, so EAX (the CPUID leaf) holds
  // whatever value it had on entry.
  __asm volatile ("cpuid\n\t"
          "rdtsc\n\t" : "=a" (lo), "=d" (hi)
          :: "%rbx", "%rcx");

  // Combine the two 32-bit halves into one 64-bit tick count.
  var = ((unsigned long)hi << 32) | lo;
  return (var);
}

// Reads the time-stamp counter for the END of a measured region.
// Executes RDTSCP, copies EDX/EAX into the output operands, then executes
// CPUID, and returns the 64-bit counter assembled from the saved halves.
// The trailing CPUID is presumably a serializing barrier, following the Intel
// benchmarking white paper linked in this header -- confirm against that paper.
static inline unsigned long rdtscp_end(void) {
  unsigned long var;
  unsigned int hi, lo;

  // The counter halves are moved out of EDX/EAX BEFORE the CPUID, so the
  // values returned are the ones RDTSCP produced. RAX, RBX, RCX and RDX are
  // all listed as clobbers, so the "=r" outputs are placed in other registers.
  __asm volatile ("rdtscp\n\t"
          "mov %%edx, %1\n\t"
          "mov %%eax, %0\n\t"
          "cpuid\n\t"  : "=r" (lo), "=r" (hi)
          :: "%rax", "%rbx", "%rcx", "%rdx");

  // Combine the two 32-bit halves into one 64-bit tick count.
  var = ((unsigned long)hi << 32) | lo;
  return (var);
  }

/*see https://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
 */

viewRates.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include "rdtscp.h"
using std::cout;  using std::cerr;  using std::endl;

#define CLOCK CLOCK_REALTIME

uint64_t to_ns(const timespec &ts);   // Converts a struct timespec to ns (since epoch).
void view_ticks_per_ns(int runs =10, int sleep =10);

// Entry point of viewRates.
// Usage: viewRates [ RUNS SLEEP ]
//   RUNS  - number of rate samples to print (default 10)
//   SLEEP - seconds over which each sample is measured (default 10)
// Exits with status 1 on a wrong argument count or a non-positive RUNS/SLEEP.
int main(int argc, char **argv) {
  int runs = 10, sleep_secs = 10;
  if (argc != 1 && argc != 3) {
    cerr << "Usage: " << argv[0] << " [ RUNS SLEEP ] \n";
    exit(1);
  } else if (argc == 3) {
    runs = std::atoi(argv[1]);
    sleep_secs = std::atoi(argv[2]);
    // atoi() returns 0 for non-numeric text, and a negative SLEEP would be
    // converted to a huge unsigned value by sleep(); reject both up front.
    if (runs <= 0 || sleep_secs <= 0) {
      cerr << "RUNS and SLEEP must be positive integers\n";
      cerr << "Usage: " << argv[0] << " [ RUNS SLEEP ] \n";
      exit(1);
    }
  }

  view_ticks_per_ns(runs, sleep_secs);
}

  void view_ticks_per_ns(int RUNS, int SLEEP) {
// Prints out stream of RUNS tsc ticks per ns, each calculated over a SLEEP secs interval.
  timespec clock_start, clock_end;
  unsigned long tsc1, tsc2, tsc_start, tsc_end;
  unsigned long elapsed_ns, elapsed_ticks;
  double rate; // ticks per ns from each run.

  clock_getres(CLOCK, &clock_start);
  cout <<  "Clock resolution: " << to_ns(clock_start) << "ns\n\n";

  cout << " tsc ticks      " << "ns      " << " tsc ticks per ns\n";
  for (int i = 0; i < RUNS; ++i) {
    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_start);
    tsc2 = rdtscp_end();                      
    tsc_start = (tsc1 + tsc2) / 2;

    sleep(SLEEP);

    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_end);
    tsc2 = rdtscp_end();                     
    tsc_end = (tsc1 + tsc2) / 2;

    elapsed_ticks = tsc_end - tsc_start;
    elapsed_ns = to_ns(clock_end) - to_ns(clock_start);
    rate = static_cast<double>(elapsed_ticks) / elapsed_ns;

    cout << elapsed_ticks << " " << elapsed_ns << " " << std::setprecision(12) << rate << endl;
  } 
}

可以运行 linearExtrapolator.cc 来重新创建 OP 的实验:

linearExtrapolator.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include "rdtscp.h"

using std::cout;  using std::endl;  using std::array;

#define CLOCK CLOCK_REALTIME

uint64_t to_ns(const timespec &ts);   // Converts a struct timespec to ns (since epoch).
void set_ticks_per_ns(bool set_rate); // Display or set tsc ticks per ns, _ticks_per_ns.
void get_start();             // Sets the 'start' time point: _start_tsc[in ticks] and _start_clock_time[in ns].
uint64_t tsc_to_ns(uint64_t tsc);     // Convert tsc ticks since _start_tsc to ns (since epoch) linearly using
                                      // _ticks_per_ns with origin(0) at the 'start' point set by get_start().

uint64_t _start_tsc, _start_clock_time; // The 'start' time point as both tsc tick number, start_tsc, and as
                                        // clock_gettime ns since epoch as _start_clock_time.
double _ticks_per_ns;                   // Calibrated in set_ticks_per_ns()

int main() {
  set_ticks_per_ns(true); // Set _ticks_per_ns as the initial TSC ticks per ns.

  uint64_t tsc1, tsc2, tsc_now, tsc_ns, utc_ns;
  int64_t ns_diff;
  bool first_pass{true};
  for (int i = 0; i < 10; ++i) {
    timespec utc_now;
    if (first_pass) {
      get_start(); //Get start time in both ns since epoch (_start_clock_time), and tsc tick number(_start_tsc)
      cout << "_start_clock_time: " <<  _start_clock_time << ", _start_tsc: " << _start_tsc << endl;
      utc_ns = _start_clock_time;
      tsc_ns = tsc_to_ns(_start_tsc);   // == _start_clock_time by definition.
      tsc_now = _start_tsc;
      first_pass = false;
    } else {
      tsc1 = rdtscp_start();
      clock_gettime(CLOCK, &utc_now);
      tsc2 = rdtscp_end();
      tsc_now = (tsc1 + tsc2) / 2;
      tsc_ns = tsc_to_ns(tsc_now);
      utc_ns = to_ns(utc_now);
    }

    ns_diff = tsc_ns - (int64_t)utc_ns;

    cout << "elapsed ns: " << utc_ns - _start_clock_time << ", elapsed ticks: " << tsc_now - _start_tsc 
     << ", ns_diff: " << ns_diff << '\n' << endl;

    set_ticks_per_ns(false);  // Display current TSC ticks per ns (does not alter original _ticks_per_ns).
  }
}

void set_ticks_per_ns(bool set_rate) {
  constexpr int RUNS {1}, SLEEP{10};
  timespec clock_start, clock_end;
  uint64_t tsc1, tsc2, tsc_start, tsc_end;
  uint64_t elapsed_ns[RUNS], elapsed_ticks[RUNS];
  array<double, RUNS> rates; // ticks per ns from each run.

  if (set_rate) {
    clock_getres(CLOCK, &clock_start);
    cout <<  "Clock resolution: " << to_ns(clock_start) << "ns\n";
  }

  for (int i = 0; i < RUNS; ++i) {
    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_start);
    tsc2 = rdtscp_end();                      
    tsc_start = (tsc1 + tsc2) / 2;

    sleep(SLEEP);

    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_end);
    tsc2 = rdtscp_end();                     
    tsc_end = (tsc1 + tsc2) / 2;

    elapsed_ticks[i] = tsc_end - tsc_start;
    elapsed_ns[i] = to_ns(clock_end) - to_ns(clock_start);
    rates[i] = static_cast<double>(elapsed_ticks[i]) / elapsed_ns[i];
  }

  cout << " tsc ticks      " << "ns     " << "tsc ticks per ns" << endl;
  for (int i = 0; i < RUNS; ++i)
    cout << elapsed_ticks[i] << " " << elapsed_ns[i] << " " << std::setprecision(12) << rates[i] << endl;

  if (set_rate)
    _ticks_per_ns = rates[RUNS-1];
}

// Number of nanoseconds in one second.
constexpr uint64_t BILLION = 1000000000ULL;

// Flattens a timespec into a single nanosecond count: tv_sec * 1e9 + tv_nsec.
uint64_t to_ns(const timespec &ts) {
  const uint64_t whole_seconds_ns = static_cast<uint64_t>(ts.tv_sec) * BILLION;
  return whole_seconds_ns + static_cast<uint64_t>(ts.tv_nsec);
}

// Records the 'start' time point in both units: _start_tsc (TSC ticks) and
// _start_clock_time (CLOCK ns since epoch).
void get_start() {
  timespec wall;
  uint64_t before, after;

  // Re-sample until the two TSC reads bracketing the clock call are at most
  // 2000 ticks apart, i.e. we were not interrupted between them.
  do {
    before = rdtscp_start();
    clock_gettime(CLOCK, &wall);
    after = rdtscp_end();
  } while ((after - before) > 2000); // max ticks per clock call

  // Use the midpoint of the bracketing reads as the TSC value of the clock sample.
  _start_tsc = (after + before) / 2;
  _start_clock_time = to_ns(wall); // converts timespec to ns since epoch
}

// Converts a TSC tick count into absolute ns since the epoch.
// Absolute ns is defined by linear extrapolation, at _ticks_per_ns ticks per
// ns, from the start point where _start_tsc [ticks] corresponds to
// _start_clock_time [ns].
//
// tsc: TSC value to convert; may be earlier or later than _start_tsc.
uint64_t tsc_to_ns(uint64_t tsc) {
  // Signed tick offset: with an unsigned difference, a tsc earlier than
  // _start_tsc would wrap to a huge positive value instead of a negative one.
  const int64_t diff = static_cast<int64_t>(tsc - _start_tsc);

  // Truncate the quotient to an integer before adding it to the epoch offset,
  // so the sum is done in 64-bit integer arithmetic rather than in double.
  const int64_t offset_ns = static_cast<int64_t>(diff / _ticks_per_ns);

  // Unsigned addition of a negative offset wraps modulo 2^64, which gives the
  // expected earlier timestamp.
  return _start_clock_time + static_cast<uint64_t>(offset_ns);
}

以下是先运行 viewRates、紧接着运行 linearExtrapolator 得到的输出:

don@HAL:~/UNIX/OS/3EZPcs/Ch06$ ./viewRates 
Clock resolution: 1ns

 tsc ticks      ns       tsc ticks per ns
28070466526 10000176697 2.8069970538
28070500272 10000194599 2.80699540335
28070489661 10000196097 2.80699392179
28070404159 10000170879 2.80699245029
28070464811 10000197285 2.80699110338
28070445753 10000195177 2.80698978932
28070430538 10000194298 2.80698851457
28070427907 10000197673 2.80698730414
28070409903 10000195492 2.80698611597
28070398177 10000195328 2.80698498942
don@HAL:~/UNIX/OS/3EZPcs/Ch06$ ./linearExtrapolator
Clock resolution: 1ns
 tsc ticks      ns     tsc ticks per ns
28070385587 10000197480 2.8069831264
_start_clock_time: 1497966724156422794, _start_tsc: 4758879747559
elapsed ns: 0, elapsed ticks: 0, ns_diff: 0

 tsc ticks      ns     tsc ticks per ns
28070364084 10000193633 2.80698205596
elapsed ns: 10000247486, elapsed ticks: 28070516229, ns_diff: -3465

 tsc ticks      ns     tsc ticks per ns
28070358445 10000195130 2.80698107188
elapsed ns: 20000496849, elapsed ticks: 56141027929, ns_diff: -10419

 tsc ticks      ns     tsc ticks per ns
28070350693 10000195646 2.80698015186
elapsed ns: 30000747550, elapsed ticks: 84211534141, ns_diff: -20667

 tsc ticks      ns     tsc ticks per ns
28070324772 10000189692 2.80697923105
elapsed ns: 40000982325, elapsed ticks: 112281986547, ns_diff: -34158

 tsc ticks      ns     tsc ticks per ns
28070340494 10000198352 2.80697837242
elapsed ns: 50001225563, elapsed ticks: 140352454025, ns_diff: -50742

 tsc ticks      ns     tsc ticks per ns
28070325598 10000196057 2.80697752704
elapsed ns: 60001465937, elapsed ticks: 168422905017, ns_diff: -70335

^C

viewRates 的输出显示,每 ns 的 TSC 滴答数随时间相当迅速地下降,对应于上图中的某一段急剧下降。与 OP 中一样,linearExtrapolator 的输出显示了两种经过时间之间的差异:一种是 clock_gettime() 报告的经过 ns 数,另一种是用启动时测得的 _ticks_per_ns == 2.8069831264 把经过的 TSC 滴答数换算得到的经过 ns 数。我没有在每次打印 elapsed ns、elapsed ticks、ns_diff 之间执行 sleep(10);,而是用一个 10 秒的窗口重新计算每 ns 的 TSC 滴答数,这会打印出当前的 tsc ticks per ns 比率。可以看出,在 viewRates 输出中观察到的每 ns TSC 滴答数下降的趋势,在 linearExtrapolator 的整个运行过程中一直持续。

用 elapsed ticks 除以 _ticks_per_ns,再减去相应的 elapsed ns,就得到 ns_diff,例如:(84211534141 / 2.8069831264) - 30000747550 = -20667。该值不为 0,主要是由于每 ns 的 TSC 滴答数发生了漂移。如果改用从最后一个 10 秒间隔得到的每 ns 2.80698015186 个滴答,结果将是:(84211534141 / 2.80698015186) - 30000747550 = 11125。在最后这个 10 秒间隔内累积的额外误差为 -20667 - (-10419) = -10248;当对该间隔使用正确的每 ns TSC 滴答值时,这一误差几乎消失:(84211534141 - 56141027929) / 2.80698015186 - (30000747550 - 20000496849) = 349。

如果 linearExtrapolator 是在每 ns 的 TSC 滴答数保持恒定的时段运行的,那么其准确性将取决于(恒定的)_ticks_per_ns 被测定得有多准,这时取例如若干次估计的中位数就是值得的。如果 _ticks_per_ns 存在固定的十亿分之 40 的偏差,则预计每 10 秒会产生约 400ns 的恒定漂移,因此 ns_diff 每 10 秒会增大或减小约 400。

genTimeSeriesofRates.cc 可用于为上面的图生成数据:genTimeSeriesofRates.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include "rdtscp.h"

using std::cout;  using std::cerr;  using std::endl;  using std::array;

double get_ticks_per_ns(long &ticks, long &ns); // Get median tsc ticks per ns, ticks and ns.
long ts_to_ns(const timespec &ts);

#define CLOCK CLOCK_REALTIME            // clock_gettime() clock to use.
#define TIMESTEP 10
#define NSTEPS  10000
#define RUNS 5            // Number of RUNS and SLEEP interval used for each sample in get_ticks_per_ns().
#define SLEEP 1

int main() {
  timespec ts;
  clock_getres(CLOCK, &ts);
  cerr << "CLOCK resolution: " << ts_to_ns(ts) << "ns\n";

  clock_gettime(CLOCK, &ts);
  int start_time = ts.tv_sec;

  double ticks_per_ns;
  int running_elapsed_time = 0; //approx secs since start_time to center of the sampling done by get_ticks_per_ns()
  long ticks, ns;
  for (int timestep = 0; timestep < NSTEPS; ++timestep) {
    clock_gettime(CLOCK, &ts);
    ticks_per_ns = get_ticks_per_ns(ticks, ns);
    running_elapsed_time = ts.tv_sec - start_time + RUNS * SLEEP / 2;

    cout << running_elapsed_time << ' ' << ticks << ' ' << ns << ' ' 
     << std::setprecision(12) << ticks_per_ns << endl;

    sleep(10);
  }
}

double get_ticks_per_ns(long &ticks, long &ns) {
  // get the median over RUNS runs of elapsed tsc ticks, CLOCK ns, and their ratio over a SLEEP secs time interval 
  timespec clock_start, clock_end;
  long tsc_start, tsc_end;
  array<long, RUNS> elapsed_ns, elapsed_ticks;
  array<double, RUNS> rates; // arrays from each run from which to get medians.

  for (int i = 0; i < RUNS; ++i) {
    clock_gettime(CLOCK, &clock_start);
    tsc_start = rdtscp_end(); // minimizes time between clock_start and tsc_start.
    sleep(SLEEP);
    clock_gettime(CLOCK, &clock_end);
    tsc_end = rdtscp_end();

    elapsed_ticks[i] = tsc_end - tsc_start;
    elapsed_ns[i] = ts_to_ns(clock_end) - ts_to_ns(clock_start);
    rates[i] = static_cast<double>(elapsed_ticks[i]) / elapsed_ns[i];
  }

  // get medians:
  std::nth_element(elapsed_ns.begin(), elapsed_ns.begin() + RUNS/2, elapsed_ns.end());
  std::nth_element(elapsed_ticks.begin(), elapsed_ticks.begin() + RUNS/2, elapsed_ticks.end());
  std::nth_element(rates.begin(), rates.begin() + RUNS/2, rates.end());
  ticks = elapsed_ticks[RUNS/2];
  ns = elapsed_ns[RUNS/2];

  return rates[RUNS/2];
}

// Number of nanoseconds in one second.
constexpr long BILLION = 1000000000L;

// Flattens a timespec into a single nanosecond count: tv_sec * 1e9 + tv_nsec.
long ts_to_ns(const timespec &ts) {
  const long whole_seconds_ns = ts.tv_sec * BILLION;
  return whole_seconds_ns + ts.tv_nsec;
}
于 2017-06-20T18:48:29.640 回答
4

TSC 与 CLOCK_MONOTONIC 之类的时钟之间的关系并不是一成不变的。即使您用 CLOCK_MONOTONIC 对 TSC 进行了“校准”,校准结果也几乎在完成的那一刻就已过时!

他们不会长期保持同步的原因:

  1. CLOCK_MONOTONIC 受 NTP 时钟频率调整的影响。NTP 会不断检查网络时间,并细微地减慢或加快系统时钟以与网络时间保持一致。这会使 CLOCK_MONOTONIC 的真实频率呈现某种振荡模式,因此您的校准总会略有偏差,尤其是在 NTP 下一次进行速率调整之后。您可以改为与 CLOCK_MONOTONIC_RAW 进行比较以消除这种影响。
  2. CLOCK_MONOTONIC 和 TSC 几乎可以肯定是基于完全不同的底层振荡器。人们常说现代操作系统使用 TSC 进行计时,但这只是在另一个运行较慢的底层时钟之上应用一个小的“本地”偏移量,以提供非常精确的时间(例如,“慢时间”可能在每个计时器滴答时更新,然后用 TSC 在计时器滴答之间进行插值)。决定 CLOCK_MONOTONIC 长期行为的,是那个底层的慢速时钟。然而,TSC 本身是一个独立的自由运行时钟,它的频率来自另一个振荡器,位于芯片组/主板上的不同位置,并且会有不同的自然波动(特别是对温度变化的响应不同)。

(2) 是上述两点中更根本的一点:这意味着即使没有任何类型的 NTP 调整(或者您使用不受其影响的时钟),只要底层时钟基于不同的物理振荡器,随着时间推移您仍然会看到漂移。

于 2018-08-19T15:25:35.310 回答
0

这是漂移的车载时钟吗?肯定不会以这种速度漂移吗?
不,他们不应该漂移

这种漂移的原因是什么?
运行您的操作系统的 NTP 服务或类似服务。它们影响 clock_gettime(CLOCK_REALTIME, ...);

我可以做些什么来使它们保持同步(除了在步骤 2 中非常频繁地重新计算 _start_tsc 和 _start_clock_time 之外)?是的,您可以缓解问题。

1 您可以尝试使用 CLOCK_MONOTONIC 代替 CLOCK_REALTIME。

2 您可以将差异计算为时间的线性函数,并将其应用于补偿漂移。但它不会很可靠,因为时间服务不会将时间调整为线性函数。但它会给你更多的准确性。您可以定期进行重新调整。


由于您计算 _ticks_per_ns 不准确,您可能会得到一些漂移。您可以通过多次运行程序来检查它。如果结果不可重现,则意味着您计算的 _ticks_per_ns 不正确。最好使用统计方法而不是平均值。


另请注意,您是使用与 TSC 相关的 CLOCK_MONOTONIC 来计算 _ticks_per_ns 的。

接下来,您将使用 CLOCK_REALTIME。它提供系统时间。如果您的系统有 NTP 或类似服务,时间会有所调整。

您的差异约为每分钟 2 微秒。它是每天 0.002 * 24*60 = 2.9 毫秒。CPU时钟的精度很高。每天 3 毫秒是一年 1 秒。

图片

于 2017-03-29T22:11:31.023 回答