(TL;DR) 在 NVME SSD(Intel p3600 和 Avant)上,如果我在磁盘的一小部分而不是整个磁盘上发出随机读取,我会看到 IOPS 下降。

在一遍又一遍地读取相同的偏移量时,对于 4k 块大小,IOPS 约为 36-40K。随着随机读取区域的扩大,IOPS 逐渐增加。该程序(见下文)在 Linux 上使用异步 IO 来提交读取请求。

Disk Range(in 4k blocks), IOPS 
0, 38833 
1, 68596 
10, 76100 
30, 80381 
40, 113647 
50, 148205 
100, 170374 
200, 239798 
400, 270197 
800, 334767

操作系统:Linux 4.2.0-35-generic

固态硬盘:英特尔 P3600 NVME 闪存



$ for i in 0 1 10 30 40 50 100 200 400 800; do
    <program_name> /dev/nvme0n1 10 $i
  done

然后确认您是否同样能看到上文所示的 IOPS 递增模式。

/*
 * $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
 * $ progname /dev/nvme0n1 10 100
 */
#include <random>
#include <libaio.h>
#include <stdlib.h>//malloc, exit
#include <future> //async
#include <unistd.h> //usleep
#include <iostream>
#include <sys/time.h> // gettimeofday
#include <vector>
#include <fcntl.h> // open
#include <errno.h>
#include <sys/types.h> // open
#include <sys/stat.h> // open
#include <cassert>
#include <semaphore.h>

// libaio context: initialised by main, used by main for io_submit and by
// DoGet for io_getevents.
io_context_t ioctx;
// Destination buffers for the reads; main uses buffers[i] for the i-th
// request of each round.
std::vector<char*> buffers;
// File descriptor the reads are issued against; assigned from open() in main.
int fd = -1;
// Counting semaphore initialised to QDEPTH in main. main waits on it once per
// prepared request and DoGet posts it once per completed request.
sem_t sem;

// Number of read requests passed to a single io_submit call.
constexpr int numPerRound = 20;
// Number of io_submit rounds performed by main.
constexpr int numRounds  = 100000;

// Maximum number of completion events requested per io_getevents call.
constexpr int MAXEVENT = 10;
// Size in bytes of every read; also the factor that turns a block number into
// a byte offset.
constexpr size_t BLKSIZE = 4096;
// Passed to io_queue_init and used as the initial value of `sem`.
constexpr int QDEPTH = 200;

// First block of the region that random offsets are drawn from; overwritten
// from argv[2] in main.
off_t startBlock = 0;
// Distance in blocks from startBlock to the last block of the region;
// overwritten from argv[3] in main.
off_t numBlocks = 100;

// Total number of reads issued over the whole run; DoGet returns once it has
// counted this many completions.
const int numSubmitted = numRounds * numPerRound;

void DoGet()
  io_event eventsArray[MAXEVENT];
  int numCompleted = 0;
  while (numCompleted != numSubmitted)
    bzero(eventsArray, MAXEVENT * sizeof(io_event));
    int numEvents;
    do {
      numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
    } while (numEvents == -EINTR);

    for (int i = 0; i < numEvents; i++)
      io_event* ev = &eventsArray[i];
      iocb* cb = (iocb*)(ev->data);
      assert(ev->res2 == 0);
      assert(ev->res == BLKSIZE);
      sem_post(&sem); // free ioctx
    numCompleted += numEvents;
  std::cout << "completed=" << numCompleted << std::endl;

int main(int argc, char* argv[])
  if (argc == 1) {
    std::cout << "usage <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;

  char* deviceName = argv[1];
  startBlock = atoll(argv[2]);
  numBlocks = atoll(argv[3]);

  int ret = 0;
  ret = io_queue_init(QDEPTH, &ioctx);
  assert(ret == 0);
  ret = sem_init(&sem, 0, QDEPTH);
 assert(ret == 0);

  auto DoGetFut = std::async(std::launch::async, DoGet);

  // preallocate buffers
  for (int i = 0; i < QDEPTH; i++)
    char* buf ;
    ret = posix_memalign((void**)&buf, 4096, BLKSIZE);
    assert(ret == 0);

  fd = open("/dev/nvme0n1", O_DIRECT | O_RDONLY);
  assert(fd >= 0);

  off_t offset = 0;

  struct timeval start;
  gettimeofday(&start, 0);

  std::mt19937 generator (getpid());
  // generate random offsets within [startBlock, startBlock + numBlocks]
  std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);

  for (int j = 0; j < numRounds; j++)
    iocb mycb[numPerRound];
    iocb* posted[numPerRound];

    bzero(mycb, sizeof(iocb) * numPerRound);

    for (int i = 0; i < numPerRound; i++)
      // same buffer may get used in 2 different async read
      // thats ok - not validating content in this program
      char* iobuf = buffers[i];
      iocb* cb = &mycb[i];

       offset = offsetgen(generator) * BLKSIZE;

      io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
      cb->data = iobuf;
      posted[i] = cb;
      sem_wait(&sem); // wait for ioctx to be free

    int ret = 0;
    do {
      ret = io_submit(ioctx, numPerRound, posted);
    } while (ret == -EINTR);

    assert(ret == numPerRound);


  struct timeval end;
  gettimeofday(&end, 0);

  uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);


    << "ops=" << numRounds * numPerRound
    << " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff
    << " region-size=" << (numBlocks * BLKSIZE)
    << std::endl;

