(TL;DR)在 NVMe SSD(Intel P3600 和 Avant)上,如果只对磁盘的一小部分区域而不是整个磁盘发出随机读取,IOPS 会下降。
反复读取同一个偏移量时,4k 块大小下的 IOPS 约为 36-40K。随着随机读取区域的扩大,IOPS 逐渐增加。下面的程序在 Linux 上使用异步 IO(libaio)提交读取请求。
Disk Range(in 4k blocks), IOPS
0, 38833
1, 68596
10, 76100
30, 80381
40, 113647
50, 148205
100, 170374
200, 239798
400, 270197
800, 334767
操作系统:Linux 4.2.0-35-generic
固态硬盘:英特尔 P3600 NVME 闪存
什么可能导致这个问题?
可以按如下方式运行该程序:
$ for i in 0 1 10 30 40 50 100 200 400 800
do
<program_name> /dev/nvme0n1 10 $i
done
然后验证您是否也能看到上面所示的、IOPS 随读取区域扩大而增加的模式。
/**
* $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
* $ progname /dev/nvme0n1 10 100
*/
#include <errno.h>
#include <fcntl.h> // open
#include <libaio.h>
#include <semaphore.h>
#include <stdlib.h>//malloc, exit
#include <sys/stat.h> // open
#include <sys/time.h> // gettimeofday
#include <sys/types.h> // open
#include <unistd.h> //usleep

#include <cassert>
#include <cstdint> // uint64_t
#include <cstring> // strerror
#include <future> //async
#include <iostream>
#include <random>
#include <vector>
// ---- Benchmark shape (compile-time) ----------------------------------------
constexpr size_t BLKSIZE{4096};      // bytes transferred by every read
constexpr int QDEPTH{200};           // AIO queue size; also the semaphore's initial count
constexpr int MAXEVENT{10};          // max completions fetched per io_getevents() call
constexpr int numPerRound{20};       // iocbs handed to a single io_submit() call
constexpr int numRounds{100000};     // number of io_submit() calls issued by main()
// Total reads issued; the completion thread stops after reaping this many.
const int numSubmitted = numRounds * numPerRound;

// ---- Shared runtime state ---------------------------------------------------
io_context_t ioctx;                  // AIO context, set up by io_queue_init() in main()
sem_t sem;                           // waited on before each submit, posted per completion
int fd{-1};                          // descriptor of the device being read
std::vector<char*> buffers;          // aligned read buffers allocated in main()

// Read region in units of BLKSIZE blocks; both are overwritten from argv in main().
off_t startBlock{0};
off_t numBlocks{100};
void DoGet()
{
io_event eventsArray[MAXEVENT];
int numCompleted = 0;
while (numCompleted != numSubmitted)
{
bzero(eventsArray, MAXEVENT * sizeof(io_event));
int numEvents;
do {
numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
} while (numEvents == -EINTR);
for (int i = 0; i < numEvents; i++)
{
io_event* ev = &eventsArray[i];
iocb* cb = (iocb*)(ev->data);
assert(ev->res2 == 0);
assert(ev->res == BLKSIZE);
sem_post(&sem); // free ioctx
}
numCompleted += numEvents;
}
std::cout << "completed=" << numCompleted << std::endl;
}
/**
 * Random-read IOPS benchmark driver.
 *
 * Usage: <program> <nvme_device_name> <start_4k_block> <num_4k_blocks>
 *
 * Opens the given device with O_DIRECT, then issues numRounds batches of
 * numPerRound asynchronous BLKSIZE reads at uniformly random block offsets in
 * the inclusive range [startBlock, startBlock + numBlocks]. Passing 0 for
 * num_4k_blocks therefore re-reads a single block. Completions are reaped by
 * DoGet() on a separate thread; `sem` limits the requests in flight to QDEPTH.
 * Prints the operation count, the measured IOPS and the region size.
 *
 * Returns 0 on success; exits with status 1 on a usage or setup error and
 * aborts if a submission fails while the reaper thread is running.
 */
int main(int argc, char* argv[])
{
    // All three positional arguments are required (argv[1]..argv[3] are read).
    if (argc < 4) {
        std::cout << "usage <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;
        exit(1);
    }
    const char* deviceName = argv[1];
    startBlock = atoll(argv[2]);
    numBlocks = atoll(argv[3]);
    // uniform_int_distribution requires a <= b, and offsets must not be negative.
    if (startBlock < 0 || numBlocks < 0) {
        std::cerr << "start_4k_block and num_4k_blocks must be non-negative" << std::endl;
        exit(1);
    }

    // Do everything that can fail *before* starting the reaper thread, so an
    // early exit never leaves a thread blocked in io_getevents().
    fd = open(deviceName, O_DIRECT | O_RDONLY);
    if (fd < 0) {
        std::cerr << "open(" << deviceName << ") failed: " << strerror(errno) << std::endl;
        exit(1);
    }

    int ret = io_queue_init(QDEPTH, &ioctx);
    if (ret != 0) {
        std::cerr << "io_queue_init failed: " << strerror(-ret) << std::endl;
        exit(1);
    }

    if (sem_init(&sem, 0, QDEPTH) != 0) {
        std::cerr << "sem_init failed: " << strerror(errno) << std::endl;
        exit(1);
    }

    // preallocate buffers (O_DIRECT needs aligned memory)
    buffers.reserve(QDEPTH);
    for (int i = 0; i < QDEPTH; i++)
    {
        void* buf = nullptr;
        ret = posix_memalign(&buf, BLKSIZE, BLKSIZE);
        if (ret != 0) {
            std::cerr << "posix_memalign failed: " << strerror(ret) << std::endl;
            exit(1);
        }
        buffers.push_back(static_cast<char*>(buf));
    }

    auto DoGetFut = std::async(std::launch::async, DoGet);

    struct timeval start;
    gettimeofday(&start, 0);

    std::mt19937 generator(getpid());
    // generate random block numbers within [startBlock, startBlock + numBlocks]
    std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);

    for (int j = 0; j < numRounds; j++)
    {
        iocb mycb[numPerRound] = {};
        iocb* posted[numPerRound];

        for (int i = 0; i < numPerRound; i++)
        {
            // same buffer may get used in 2 different async reads
            // that's ok - not validating content in this program
            char* iobuf = buffers[i];
            iocb* cb = &mycb[i];
            const off_t offset = offsetgen(generator) * static_cast<off_t>(BLKSIZE);
            io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
            cb->data = iobuf;
            posted[i] = cb;
            sem_wait(&sem); // wait for a free request slot
        }

        do {
            ret = io_submit(ioctx, numPerRound, posted);
        } while (ret == -EINTR);
        if (ret != numPerRound) {
            // The reaper thread is already running, so abort rather than
            // unwinding past a thread that would wait forever.
            if (ret < 0) {
                std::cerr << "io_submit failed: " << strerror(-ret) << std::endl;
            } else {
                std::cerr << "io_submit accepted only " << ret << " of "
                          << numPerRound << " requests" << std::endl;
            }
            abort();
        }
    }

    DoGetFut.wait();

    struct timeval end;
    gettimeofday(&end, 0);
    uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);
    if (diff == 0) {
        diff = 1; // guard the division below
    }

    // Release everything acquired above.
    io_queue_release(ioctx);
    close(fd);
    fd = -1;
    for (char* buf : buffers) {
        free(buf);
    }
    buffers.clear();
    sem_destroy(&sem);

    std::cout
        << "ops=" << numRounds * numPerRound
        << " iops=" << (numRounds * numPerRound * (uint64_t)1000000) / diff
        << " region-size=" << (numBlocks * BLKSIZE)
        << std::endl;
    return 0;
}